In [None]:
import numpy as np
import pandas as pd
import rasterio
import earthpy.plot as ep
from sklearn.preprocessing import StandardScaler
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import from_levels_and_colors

## Loading the data

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data paths used by
# the following cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Ask which scene to process. Both answers are interpolated into the input and
# output file names, so surrounding whitespace (e.g. from copy/paste or a
# trailing space) is stripped to avoid building a path that does not exist.
year = input("Year: ").strip()
area = input("Area: ").strip()

In [None]:
# Locations of the tabular samples (.csv) and the matching raster (.tif)
# for the selected area/year; both share the same "<area>_<year>" stem.
data_path, img_path = (
    f'/content/drive/MyDrive/DL - data/raw data/{area}_{year}.csv',
    f'/content/drive/MyDrive/DL - data/raw data/{area}_{year}.tif',
)

In [None]:
# Read the CSV for the selected area/year and preview its first rows.
df = pd.read_csv(filepath_or_buffer=data_path)
df.head(n=5)

In [None]:
# List the column names of the freshly loaded table.
df.columns

## Visualization

In [None]:
# Open the raster and keep its basic metadata around for later use.
image = rasterio.open(img_path)
print(image)
bandNum, height, width = image.count, image.height, image.width
crs, transform = image.crs, image.transform
shape = (height, width)

# Read bands 4, 3 and 2 (in that order) and stack them along a new leading
# axis for the RGB preview.
# NOTE(review): presumably these are the red/green/blue bands — confirm
# against the band order of the exported image.
image_vis = np.stack([image.read(band) for band in (4, 3, 2)])

plot_size = (8, 8)
ep.plot_rgb(image_vis, figsize=plot_size, stretch=True)

## Processing the data

### We will first drop the columns that are of no relevance for training a model

In [None]:
import copy

In [None]:
# Keep an independent deep copy of the loaded table so the original values
# stay available while `df` is reshaped in the cells below.
df_copy = df.copy(deep=True)

In [None]:
# Derive `df` from the preserved copy without the 'system:index' and '.geo' columns.
df = df_copy.drop(['system:index', '.geo'], axis='columns')

In [None]:
# Remove 'Class' together with 'system:index' and '.geo'.
# This statement used to restart from `df_copy` and drop only 'Class', which
# silently brought 'system:index' and '.geo' back into `df` (and from there
# into the train/test frames and the saved CSVs). Dropping all three from
# `df_copy` in one call gives the intended result and keeps the cell
# idempotent when it is re-run.
# NOTE(review): 'Class' looks like it could be the label column — confirm it
# is really meant to be excluded from the processed train/test files.
df = df_copy.drop(columns=['system:index', '.geo', 'Class'])

In [None]:
# Plot one histogram per numeric column of the raw table.
# Only numeric columns are plotted (string columns cannot be binned
# meaningfully), and the grid height follows the number of columns so the
# loop can never index past the available axes.
N_COLS = 4
hist_columns = df_copy.select_dtypes(include='number').columns
n_rows = max(1, -(-len(hist_columns) // N_COLS))  # ceiling division, at least one row

fig, axes = plt.subplots(n_rows, N_COLS, figsize=(20, 4 * n_rows), squeeze=False)
axes = axes.flatten()  # Flatten the 2D array of axes for easy indexing

# One histogram per column; missing values are skipped so the bin range
# stays finite.
for ax, column in zip(axes, hist_columns):
    ax.hist(df_copy[column].dropna(), bins=100, color='blue', alpha=0.7)
    ax.set_title(f"Band - {column}")
    ax.set_xlabel("Values")
    ax.set_ylabel("Frequency")

# Hide the axes left over when the columns do not fill the last row
for ax in axes[len(hist_columns):]:
    ax.axis('off')

# Adjust layout for better spacing, then save and show the figure
plt.tight_layout()
plt.savefig('/content/drive/MyDrive/Results/CNN/band_frequency_visualization.png', dpi=600)
plt.show()

### Splitting the dataset into train and test based on the sample column

In [None]:
# Per-column mean (3 decimals) of everything except the split marker.
# numeric_only=True keeps this working when `df` still carries string
# columns; without it recent pandas versions raise a TypeError.
print(df.drop(columns=["sample"]).mean(numeric_only=True).round(3))

In [None]:
# Per-column standard deviation (3 decimals) of everything except the split
# marker. numeric_only=True keeps this working when `df` still carries string
# columns; without it recent pandas versions raise a TypeError.
print(df.drop(columns=["sample"]).std(numeric_only=True).round(3))

In [None]:
# Partition the rows by the value of the 'sample' column.
is_train = df['sample'].eq('train')
is_test = df['sample'].eq('test')
train_df = df.loc[is_train]  # rows marked "train"
test_df = df.loc[is_test]    # rows marked "test"

In [None]:
# Preview the first rows of the training split.
train_df.head()

In [None]:
# Preview the first rows of the testing split.
test_df.head()

### Drop the sample column

In [None]:
# The split marker has done its job; remove it from both partitions.
train_df, test_df = (
    split.drop("sample", axis=1) for split in (train_df, test_df)
)

In [None]:
# Columns remaining in the training split after dropping 'sample'.
train_df.columns

In [None]:
# Columns remaining in the testing split after dropping 'sample'.
test_df.columns

### Standardization (zero mean, unit variance per feature)

In [None]:
# Feature columns that get scaled: 'DVI' followed by the four statistics
# (contrast, diss, mean, var) of each of the bands SR_B2 .. SR_B5.
columns_to_standardize = ['DVI'] + [
    f'SR_B{band}_{stat}'
    for band in (2, 3, 4, 5)
    for stat in ('contrast', 'diss', 'mean', 'var')
]

In [None]:
# Create the scaler with its default settings; it is fitted later on the
# training split only and then reused for the testing split.
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from the training features only, then apply
# the same fitted transformation to both splits so no test information
# leaks into the scaler.
train_features = train_df[columns_to_standardize]
scaler.fit(train_features)
train_df[columns_to_standardize] = scaler.transform(train_features)
test_df[columns_to_standardize] = scaler.transform(test_df[columns_to_standardize])

In [None]:
# Preview the training split after scaling.
train_df.head()

## Saving the training and testing data

In [None]:
# Write both processed splits to Drive, one CSV per split, named after the
# selected area and year.
train_csv_path = f'/content/drive/MyDrive/processed_data/training_processed_{area}_{year}.csv'
test_csv_path = f'/content/drive/MyDrive/processed_data/testing_processed_{area}_{year}.csv'

train_df.to_csv(train_csv_path)
test_df.to_csv(test_csv_path)