# IMPORT

In [None]:
#Data handling & basic libraries
import numpy as np
from numpy import ma
import json
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import joblib

#Train test split
from sklearn.model_selection import train_test_split

#Data Augmentation
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Evaluation
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

#Gridsearch
from sklearn.model_selection import StratifiedKFold

#Models
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
#Visualization
#import matplotlib.pyplot as plt

#Tensorflow
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.callbacks import ModelCheckpoint

import random

# Load data & subset by labels

Vi loader data, dataset subsettes ved længden af labels og vi arrangerer farvebånd til RGB

In [None]:
# Input arrays live at absolute paths on the author's machine. The layout used
# by a second contributor is kept here for reference:
# ccFieldsAll = np.load("C:/Users/mkoli/Syddansk Universitet/Morten Thyrring Stouenberg - Speciale2023/Data/AllNoAlt/ccLastImageNoCloudIndex0_6234.npy", allow_pickle=True)
# labels = np.load("C:/Users/mkoli/Syddansk Universitet/Morten Thyrring Stouenberg - Speciale2023/Data/AllNoAlt/reLabelsAll.npy", allow_pickle=True)
#
# NOTE: allow_pickle=True lets NumPy unpickle arbitrary objects, so only load
# files that come from a trusted source.

# Labels
labels = np.load("C:/Users/morte/OneDrive - Syddansk Universitet/Speciale2023/Data/AllNoAlt/reLabelsAll.npy", allow_pickle=True)

# Fields
ccFieldsAll = np.load("C:/Users/morte/OneDrive - Syddansk Universitet/Speciale2023/Data/AllNoAlt/ccLastImageNoCloudIndex0_6234.npy", allow_pickle=True)

# Only the first len(labels) fields have a label, so both derived arrays are
# cut to that length.
labelled = slice(None, len(labels))

# Cloud probability: channel 7 of every labelled field.
cloud_probability = ccFieldsAll[labelled, :, :, 7]

# Keep channels 2, 1, 0 (in that order) as the three colour bands.
ccFieldsAll = ccFieldsAll[labelled, :, :, [2, 1, 0]]

* Cloud masking

In [None]:
# Cloud tolerance on the 0-255 probability scale: pixels whose cloud
# probability is at or above this value are treated as cloud covered.
cloud_tolerance = 40

# One boolean mask for the whole data set, indexed (image, row, column).
# The upper bound of 256 is carried over unchanged from the original check.
is_cloud = np.logical_and(cloud_probability >= cloud_tolerance,
                          cloud_probability <= 256)

# Indexing the first three axes with the mask selects every colour band of the
# flagged pixels, so cloudy pixels are blacked out in place; images without
# clouds are left untouched.
ccFieldsAll[is_cloud] = 0

* Delete empty images

In [None]:
# Drop the images that are completely black (every value in every band is 0),
# together with their labels.
# NOTE(review): the resize and train/test-split cells further down read
# `ccFieldsAll` / `labels`, not the filtered arrays built here - confirm that
# this is intended.

# True for each image that still has at least one non-zero value.
non_image_axes = tuple(range(1, ccFieldsAll.ndim))
has_signal = np.any(ccFieldsAll != 0, axis=non_image_axes)

# Boolean indexing keeps the surviving images and labels in their original
# order. The labels are first cut to the number of images so both stay aligned.
ccFieldsNoCloud = ccFieldsAll[has_signal]
LabelsNoCloud = np.asarray(labels)[:len(ccFieldsAll)][has_signal]

# Print the shapes to check the output and see how many fields are left.
print(ccFieldsNoCloud.shape)
print(LabelsNoCloud.shape)

# Resize images

In [None]:
# 64x64 is the working resolution: per the authors it gave the best accuracy
# for Random Forest, SVC and XGBoost alike.
# NOTE(review): this resizes `ccFieldsAll`, i.e. the all-black images are still
# included - confirm whether `ccFieldsNoCloud` was meant here.
size = 64

# Resize every field with TensorFlow and stack the results into one array.
ccFieldsResized = np.stack(
    [tf.image.resize(field, [size, size]) for field in ccFieldsAll]
)

In [None]:
# Preview the first 50 resized fields in a 10 x 5 grid.
n_rows, n_cols = 10, 5

plt.figure(figsize=(20, 30))
for i in range(n_rows * n_cols):
    preview = ccFieldsResized[i]
    plt.subplot(n_rows, n_cols, i + 1)
    # Same brightness scaling as before: multiply by 3.5, divide by 10000.
    plt.imshow(3.5 * preview / 10000)
    plt.title(f'\n idx: {i}, label: {labels[i]}')
plt.show()

# Flatten data into 2d

In [None]:
# Flatten each image into one feature vector so the classical models get a
# 2-D (observations x features) matrix.
num_samples, height, width, channels = ccFieldsResized.shape

# Number of features per observation: every pixel of every channel.
image_size = height * width * channels

ccFieldsResized_2d = ccFieldsResized.reshape(num_samples, image_size)

ccFieldsResized_2d.shape

# Train Test Split

In [None]:
# Seed shared by the split and the models below.
randomState = 42

# 80/20 hold-out split. No separate validation split is made because the grid
# searches cross-validate on the training part (an earlier version split off
# 25% of the training data, i.e. 0.25 x 0.8 = 0.2 of the total, with
# random_state=1).
X_train, X_test, y_train, y_test = train_test_split(
    ccFieldsResized_2d,
    labels,
    test_size=0.2,
    random_state=randomState,
)

# Report feature/label shapes for the training part, then the test part.
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, targets.shape)

# Labels distribution

In [None]:
# Class balance of the labels left after dropping the all-black images.
total = len(LabelsNoCloud)
print(f'Number of observations : {total}')

# One line per class: absolute count and share of the total in percent.
for cls in (0, 1, 2):
    count = np.count_nonzero(LabelsNoCloud == cls)
    print(f'appearances of {cls} in labels : {count}({round(100*count/total,2)}% of total)')

# Random Forest

* **Base model**

In [None]:
# Baseline Random Forest with default hyper-parameters; n_jobs=-1 uses all
# available cores and the shared seed keeps runs repeatable.
baseRF = RandomForestClassifier(n_jobs=-1, random_state=randomState)

In [None]:
# Fit the baseline on the training data and predict the held-out test set
# (fit returns the fitted estimator, so the calls can be chained).
y_pred = baseRF.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 for the baseline predictions.
print(classification_report(y_test, y_pred))

* **Gridsearch & optimal model**

In [None]:
# Untuned Random Forest that serves as the estimator for the grid search.
clfRF = RandomForestClassifier(n_jobs=-1, random_state=randomState)

In [None]:
# Parameter ranges for the Random Forest grid search; scikit-learn's default
# for each parameter is given in parentheses.
#
# An earlier, wider grid is kept here as a comment only (it used to be assigned
# and then immediately overwritten by the values below):
#   n_estimators      = [100, 200, 300, 500, 800, 1000, 1500]
#   max_features      = ['log2', 'sqrt', 0.1, None]
#   max_depth         = [5, 30, 50, 70, None]
#   min_samples_split = [2, 5, 10, 15]
#   min_samples_leaf  = [1, 2, 5, 10]
n_estimators = [200, 300, 400, 800]   # number of trees in the random forest (100)
max_features = ['sqrt', 0.1]          # number of features in consideration at every split (sqrt)
max_depth = [30, 50, 70, None]        # maximum number of levels allowed in each decision tree (None)
min_samples_split = [2]               # minimum sample number to split a node (2)
min_samples_leaf = [2, 5, 10]         # minimum sample number that can be stored in a leaf node (1)

param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

In [None]:
# Exhaustive search over `param_grid` with `clfRF` as the estimator.
# refit=True retrains the best combination on the full training data;
# verbose=3 logs every fit.
gridSearchRF = GridSearchCV(
    estimator=clfRF,
    param_grid=param_grid,
    verbose=3,
    refit=True,
)

In [None]:
# Run the search: every hyper-parameter combination in the grid is fitted and
# cross-validated on the training data.
gridSearchRF.fit(X=X_train, y=y_train)

In [None]:
# Summarise the Random Forest grid search.

# Number of best-ranked parameter combinations shown in the table below.
top_n = 30

# Estimator as configured after hyper-parameter tuning.
print('\n Best estimator:')
print(gridSearchRF.best_estimator_)

# Mean cross-validated score of the best model.
print('\n Best score:')
print(gridSearchRF.best_score_)

# Best parameter combination.
print('\n Best parameters:')
print(gridSearchRF.best_params_)

# Best-ranked results of the grid search (rank 1 first).
print(f'\n Top {top_n} results:')
resultsRF = pd.DataFrame(gridSearchRF.cv_results_)
resultsRF = resultsRF.sort_values(by='rank_test_score').head(top_n)
display(resultsRF)


In [None]:
# Save the fitted grid search (including the refitted best estimator) to disk.
fPath = 'C:/Users/morte/OneDrive - Syddansk Universitet/Speciale2023/Models/RF/gridSearchRF_Run2.sav'

# The context manager closes the file handle even if serialisation fails.
with open(fPath, 'wb') as model_file:
    joblib.dump(gridSearchRF, model_file)

In [None]:
# Load the saved Random Forest grid search.
# NOTE: joblib files are pickles - only load files you created or trust.
fPath = 'C:/Users/morte/OneDrive - Syddansk Universitet/Speciale2023/Models/RF/gridSearchRF_Run2.sav'

# The context manager closes the file handle once the object is read.
with open(fPath, 'rb') as model_file:
    clfRFLoaded = joblib.load(model_file)

In [None]:
# Summarise the loaded Random Forest grid search and export all results.

# Number of best-ranked parameter combinations shown in the table below.
top_n = 30

# Estimator as configured after hyper-parameter tuning.
print('\n Best estimator:')
print(clfRFLoaded.best_estimator_)

# Mean cross-validated score of the best model.
print('\n Best score:')
print(clfRFLoaded.best_score_)

# Best parameter combination.
print('\n Best parameters:')
print(clfRFLoaded.best_params_)

# Best-ranked results of the grid search (rank 1 first). The CSV gets the
# complete sorted table; only the first `top_n` rows are displayed.
print(f'\n Top {top_n} results:')
resultsRF = pd.DataFrame(clfRFLoaded.cv_results_).sort_values(by='rank_test_score')
resultsRF.to_csv('C:/Users/morte/OneDrive - Syddansk Universitet/Speciale2023/Models/RF/gridSearchRF_Run2.csv')  
resultsRF = resultsRF.head(top_n)
display(resultsRF)

In [None]:
# Evaluate the loaded Random Forest grid search on the held-out test set.
y_pred = clfRFLoaded.predict(X_test)
print(classification_report(y_test, y_pred))

# Rows of the confusion matrix are true labels, columns are predictions.
cm = confusion_matrix(y_test, y_pred)

# Display names for the three classes and their tick positions.
class_names = ['Class 0', 'Class 1', 'Class 2']
n_classes = len(class_names)
tick_positions = np.arange(n_classes)

# Draw the matrix as a heat map.
plt.figure(figsize=(8, 6))
plt.imshow(cm, cmap=plt.cm.Reds)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.xticks(tick_positions, class_names, rotation=45)
plt.yticks(tick_positions, class_names)
plt.colorbar()

# Write each count into its cell (x is the column, y is the row).
for row, col in np.ndindex(n_classes, n_classes):
    plt.text(col, row, cm[row, col], ha='center', va='center', color='black')

plt.show()

# XGBOOST

* **Base model**

In [None]:
# Baseline XGBoost classifier for the three classes, with otherwise default
# hyper-parameters; n_jobs=-1 uses all available cores.
baseXGB = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    n_jobs=-1,
)

In [None]:
# Fit the baseline on the training data and predict the held-out test set
# (fit returns the fitted estimator, so the calls can be chained).
y_pred = baseXGB.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 for the baseline predictions.
print(classification_report(y_test, y_pred))

* **Gridsearch & optimal model**

In [None]:
# Parameter ranges for the XGBoost grid search; XGBoost's default for each
# parameter is given in parentheses.
#
# An earlier grid is kept here as a comment only (it used to be assigned and
# then immediately overwritten by the values below):
#   learning_rate    = [0.0001, 0.001, 0.01, 0.1]
#   n_estimators     = [100, 300, 500, 800, 1000, 1500]
#   min_child_weight = [1, 5, 10]
#   subsample        = [0.6, 0.8, 1.0]
#   colsample_bytree = [0.6, 0.8, 1.0]
#   max_depth        = [3, 4, 5]

# Step-size shrinkage applied after each boosting step; smaller values make
# boosting more conservative and help against overfitting (0.1).
learning_rate = [0.001, 0.01]

# Number of boosted trees (100).
n_estimators = [800, 1000]

# Minimum sum of instance weight (hessian) needed in a child; partitioning
# stops below it, so larger values are more conservative (1).
min_child_weight = [1, 5]

# Minimum loss reduction required to split a leaf further. `gamma` was
# referenced by the grid but never defined, which raised a NameError; it is
# pinned to XGBoost's default so the size of the search does not change.
# NOTE(review): replace with the intended candidate values if gamma was meant
# to be tuned.
gamma = [0]

# Share of training instances sampled once per boosting iteration (1).
subsample = [0.6, 0.8, 1.0]

# Share of columns sampled once for every tree constructed.
colsample_bytree = [0.01, 0.1]

# Maximum tree depth; deeper trees are more complex, more likely to overfit and
# use more memory (6).
max_depth = [6, 10]

params_grid = {
    'learning_rate': learning_rate,
    'n_estimators': n_estimators,
    'min_child_weight': min_child_weight,
    'gamma': gamma,
    'subsample': subsample,
    'colsample_bytree': colsample_bytree,
    'max_depth': max_depth,
}

In [None]:
# Untuned three-class XGBoost classifier that serves as the estimator for the
# grid search; n_jobs=-1 uses all available cores.
clfXGB = XGBClassifier(objective='multi:softmax', num_class=3, n_jobs=-1)

In [None]:
# Exhaustive search over `params_grid` with `clfXGB` as the estimator.
# refit=True retrains the best combination on the full training data;
# verbose=3 logs every fit.
gridSearchXGB = GridSearchCV(estimator=clfXGB,
                             param_grid=params_grid,
                             verbose=3,
                             refit=True)

# Run the search on the training data.
gridSearchXGB.fit(X=X_train, y=y_train)

In [None]:
# Summarise the XGBoost grid search.

# Number of best-ranked parameter combinations shown in the table below.
top_n = 30

# Estimator as configured after hyper-parameter tuning.
print('\n Best estimator:')
print(gridSearchXGB.best_estimator_)

# Mean cross-validated score of the best model.
print('\n Best score:')
print(gridSearchXGB.best_score_)

# Best parameter combination.
print('\n Best parameters:')
print(gridSearchXGB.best_params_)

# Best-ranked results of the grid search (rank 1 first).
print(f'\n Top {top_n} results:')
resultsXGB = pd.DataFrame(gridSearchXGB.cv_results_)
resultsXGB = resultsXGB.sort_values(by='rank_test_score').head(top_n)
display(resultsXGB)

In [None]:
# Save the fitted grid search (including the refitted best estimator) to disk.
fPath = 'C:/Users/morte/OneDrive - Syddansk Universitet/Speciale2023/Models/XGB/gridSearchXGB_Run3.sav'

# The context manager closes the file handle even if serialisation fails.
with open(fPath, 'wb') as model_file:
    joblib.dump(gridSearchXGB, model_file)

In [None]:
# Load the saved XGBoost grid search.
# NOTE: joblib files are pickles - only load files you created or trust.
fPath = 'C:/Users/morte/OneDrive - Syddansk Universitet/Speciale2023/Models/XGB/gridSearchXGB_Run3.sav'

# The context manager closes the file handle once the object is read.
with open(fPath, 'rb') as model_file:
    clfXGBLoaded = joblib.load(model_file)

In [None]:
# Summarise the loaded XGBoost grid search and export all results.

# Number of best-ranked parameter combinations shown in the table below.
top_n = 30

# Estimator as configured after hyper-parameter tuning.
print('\n Best estimator:')
print(clfXGBLoaded.best_estimator_)

# Mean cross-validated score of the best model.
print('\n Best score:')
print(clfXGBLoaded.best_score_)

# Best parameter combination.
print('\n Best parameters:')
print(clfXGBLoaded.best_params_)

# Best-ranked results of the grid search (rank 1 first). The CSV gets the
# complete sorted table; only the first `top_n` rows are displayed.
print(f'\n Top {top_n} results:')
resultsXGB = pd.DataFrame(clfXGBLoaded.cv_results_).sort_values(by='rank_test_score')
resultsXGB.to_csv('C:/Users/morte/OneDrive - Syddansk Universitet/Speciale2023/Models/XGB/gridSearchXGB_Run3.csv')  
resultsXGB = resultsXGB.head(top_n)
display(resultsXGB)

In [None]:
# Evaluate the loaded XGBoost grid search on the held-out test set.
y_pred = clfXGBLoaded.predict(X_test)
print(classification_report(y_test, y_pred))

# Rows of the confusion matrix are true labels, columns are predictions.
cm = confusion_matrix(y_test, y_pred)

# Display names for the three classes and their tick positions.
class_names = ['Class 0', 'Class 1', 'Class 2']
n_classes = len(class_names)
tick_positions = np.arange(n_classes)

# Draw the matrix as a heat map.
plt.figure(figsize=(8, 6))
plt.imshow(cm, cmap=plt.cm.Reds)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.xticks(tick_positions, class_names, rotation=45)
plt.yticks(tick_positions, class_names)
plt.colorbar()

# Write each count into its cell (x is the column, y is the row).
for row, col in np.ndindex(n_classes, n_classes):
    plt.text(col, row, cm[row, col], ha='center', va='center', color='black')

plt.show()

# Determine resolution size

To determine the image resolution, we run each candidate image size through out-of-the-box (default hyper-parameter) Random Forest and XGBoost classifiers and compare the results.

The steps utilized are the same as described in the models above.

In [None]:
# Compare default Random Forest and XGBoost models across image resolutions.
# WARNING: this cell overwrites the notebook-level `size`, `ccFieldsResized`,
# `ccFieldsResized_2d`, `X_train`, `X_test`, `y_train`, `y_test` and `y_pred`;
# after it finishes they hold the values of the last resolution in the list.
resolutionList = [8, 16, 32, 64, 128, 256]

# Default-parameter models, refitted from scratch for every resolution.
clfRF = RandomForestClassifier(n_jobs=-1, random_state=randomState)
clfXGB = XGBClassifier(objective='multi:softmax', num_class=3, n_jobs=-1)

for item in resolutionList:
    size = item

    # Resize every field to size x size and stack into one array.
    ccFieldsResized = np.stack(
        [tf.image.resize(field, [size, size]) for field in ccFieldsAll]
    )

    # Flatten each image into one feature vector.
    num_samples, height, width, channels = ccFieldsResized.shape
    image_size = height * width * channels
    ccFieldsResized_2d = ccFieldsResized.reshape(num_samples, image_size)

    # Same 80/20 split and seed for every resolution.
    X_train, X_test, y_train, y_test = train_test_split(
        ccFieldsResized_2d,
        labels,
        test_size=0.2,
        random_state=randomState,
    )

    print(X_train.shape, y_train.shape) 
    print(X_test.shape, y_test.shape)

    # Random Forest at this resolution.
    y_pred = clfRF.fit(X_train, y_train).predict(X_test)
    print(f'Classification report Random Forest for resolution: {item}')
    print(classification_report(y_test, y_pred))

    # XGBoost at this resolution.
    y_pred = clfXGB.fit(X_train, y_train).predict(X_test)
    print(f'Classification report XGBoost for resolution: {item}')
    print(classification_report(y_test, y_pred))

# Data augmentation

This section should be run before the classifier sections to train the models on augmented data.

In [None]:
# Augmentation settings, collected in one place and handed to Keras below.
# NOTE(review): Keras applies brightness shifts through an image round-trip
# that may rescale pixel values to 0-255, while the preview cell divides the
# fields by 10000 - confirm augmented and original images share a value range.
augmentation_settings = dict(
    rotation_range=360,             # rotate the image by a random angle
    zoom_range=0.2,                 # zoom in or out by a random amount
    brightness_range=(0.8, 1.2),    # random brightness factor between 0.8 and 1.2; 1 is the original brightness
    horizontal_flip=True,
    vertical_flip=False,
    fill_mode='nearest',
)

datagen = ImageDataGenerator(**augmentation_settings)



In [None]:
# Pick the training samples that will be augmented: every observation whose
# label is 0 or 1 (label 2 is left as it is).
# NOTE(review): presumably 0 and 1 are the under-represented classes - confirm
# against the label distribution.
classes_to_augment = (0, 1)

# Boolean mask over the training set; indexing with it keeps the selected
# samples in their original order and keeps images and labels aligned.
selected = np.isin(np.asarray(y_train), classes_to_augment)

data_for_augmentation = np.asarray(X_train)[selected]
labels_for_augmentation = np.asarray(y_train)[selected]

# Check array shapes
print(f'data for augmentation : {data_for_augmentation.shape} labels for augmentation : {labels_for_augmentation.shape}')

In [None]:
# Number of augmented copies generated for every selected image.
num_samples_per_image = 7
# num_samples_per_image = 9
# num_samples_per_image = 11

# The generator needs image-shaped input. Fail early with a clear message if
# the training data was already flattened, instead of an opaque IndexError.
aug_dataset = np.asarray(data_for_augmentation)
if aug_dataset.ndim != 4 or aug_dataset.shape[-1] != 3:
    raise ValueError(
        'data_for_augmentation must have shape (n, height, width, 3), '
        f'got {aug_dataset.shape}; run the augmentation on un-flattened images.'
    )

# Collect num_samples_per_image random variants of every image, image by image.
augmented_images = []
for image in aug_dataset:
    # flow() expects a batch, so wrap the single image in a batch of one.
    batch = np.expand_dims(image, axis=0)
    for _ in range(num_samples_per_image):
        # The built-in next() works on both old and new Keras iterators; the
        # `.next()` method is not available on newer releases.
        augmented_batch = next(datagen.flow(batch, batch_size=1, shuffle=False))
        augmented_images.append(augmented_batch.squeeze(axis=0))

# Each label is repeated once per generated copy, in the same order as the
# images above.
labels_aug = np.repeat(np.asarray(labels_for_augmentation), num_samples_per_image)
augmented_images = np.asarray(augmented_images)

In [None]:
# Append the augmented images and their labels to the training set (along the
# first axis, NumPy's default for concatenate).
# NOTE: re-running this cell appends the augmented data a second time.
X_train = np.concatenate([X_train, augmented_images])
y_train = np.concatenate([y_train, labels_aug])

In [None]:
from sklearn.utils import shuffle

# Shuffle images and labels together so the augmented samples appended at the
# end are mixed into the training set. The notebook-wide seed `randomState`
# (42) replaces the hard-coded 42 so every seeded step is controlled from one
# place.
X_train, y_train = shuffle(X_train, y_train, random_state=randomState)

In [None]:
# Class balance of the training labels after augmentation.
total = len(y_train)
print(f'Number of observations : {total}')

# One line per class: absolute count and share of the total in percent.
for cls in (0, 1, 2):
    count = np.count_nonzero(y_train == cls)
    print(f'appearances of {cls} in labels : {count}({round(100*count/total,2)}% of total)')

### Flatten data into 2d

In [None]:
# Flatten the image-shaped training and test sets into 2-D
# (observations x features) matrices for the classical models.
# NOTE: this expects 4-D input, so the cell cannot be run twice in a row.

# Training set
num_samples, height, width, channels = X_train.shape
image_size = height * width * channels
X_train = X_train.reshape(num_samples, image_size)

print(X_train.shape)

# Test set
num_samples, height, width, channels = X_test.shape
image_size = height * width * channels
X_test = X_test.reshape(num_samples, image_size)

print(X_test.shape)