In [None]:
import rasterio
import numpy as np
import pandas as pd

# Read the driving factors rasters:


In [None]:
# Terrain layers; their rasters are read as a single band.
DEM_factors = ['aspect', 'slope', 'elevation']

# Climate layers. 'soil_type' comes first on purpose: the dataset-building
# cell reads it once (year index 0) together with the DEM layers.
climate_factors = [
    'soil_type',
    '10m_u_component_of_wind',
    '10m_v_component_of_wind',
    '2m_temperature',
    'evaporation_from_vegetation_transpiration',
    'runoff',
    'soil_temperature_level_1',
    'total_precipitation',
    'volumetric_soil_water_layer_1',
    'total_evaporation',
]

population_factors = ['population_density_GPWv4']

# Total number of layers stacked along axis 0 of `factors`.
nb_factors = sum(len(group) for group in (DEM_factors, climate_factors, population_factors))

# Study period length and raster grid size.
nb_years = 21
nb_pixels_height = 126
nb_pixels_width = 229
no_data = 255  # NOTE(review): presumably the rasters' no-data marker; not used in the visible cells
years = np.arange(2002, 2002 + nb_years)

# Uninitialised (factor, year, row, col) cube; every slot is expected to be
# overwritten by the raster-loading loops.
factors = np.empty((nb_factors, nb_years, nb_pixels_height, nb_pixels_width))


# Running offset of the current factor group along axis 0 of `factors`.
c_f = 0

# DEM layers: one band is read and repeated for every year.
for i, factor in enumerate(DEM_factors):
    # The context manager closes the dataset as soon as the band is in memory
    # (the handles were previously left open).
    with rasterio.open(f'./rasters/driving_factors/raster-DEM-{factor}_5km.tif') as src:
        arr = src.read(1)
    # Broadcast the 2-D band over the whole year axis.
    factors[c_f + i, :] = arr

c_f += len(DEM_factors)

# Climate layers: all bands are read at once.
# NOTE(review): assumes each file holds one band per year (nb_years bands) - confirm.
for i, factor in enumerate(climate_factors):
    with rasterio.open(f'./rasters/driving_factors/raster-{factor}_5km.tif') as src:
        arr = src.read()
    factors[c_f + i] = arr

c_f += len(climate_factors)

# Population layers: each year is mapped to its 5-year bucket
# (2002-2004 -> band 0, 2005-2009 -> band 1, ..., 2020-2022 -> band 4).
# NOTE(review): presumably band 0 is the year-2000 release - confirm against the raster.
first_bucket = 2000 // 5
for i, factor in enumerate(population_factors):
    with rasterio.open(f'./rasters/driving_factors/raster-{factor}_5km.tif') as src:
        arr = src.read()
    for j, year in enumerate(years):
        factors[c_f + i][j] = arr[year // 5 - first_bucket]
c_f += len(population_factors)
    

# Read the Regionalization Rasters

In [None]:
# One regionalization map per region count (5 to 8), stacked along axis 0.
regions = np.empty((4, nb_pixels_height, nb_pixels_width))
for i in range(5, 9):
    # Close each raster once its band has been copied (handles were previously leaked).
    with rasterio.open(f"./rasters/driving_factors/Skater_results/SkReg{i}.tif") as src:
        regions[i - 5] = src.read(1)


## Create a dataset file using the driving factors
### For the climate factors, compute the average value over each 5-year period (the last period, 2017–2022, spans 6 years)

In [None]:
# Provisional column list; it is replaced by the per-period column names
# before the DataFrame is created.
columns = climate_factors + DEM_factors + population_factors + ['regions']

# Pixels that carry a value in the first regionalization map (value > 0).
coordinates = np.argwhere(regions[0] > 0)
rows, cols = coordinates[:, 0], coordinates[:, 1]

# Year-index boundaries of the averaging periods:
# [2002-2006], [2007-2011], [2012-2016], [2017-2022].
# The last bound is nb_years; it used to be 22, which only worked because
# slicing silently clamps to the 21 available years.
frontiers = [0, 5, 10, 15, nb_years]

nb_periods = len(frontiers) - 1            # averaged periods per climate factor
nb_static = len(DEM_factors) + 1           # DEM layers + soil_type (read once)
nb_dynamic = len(climate_factors) - 1      # climate factors averaged per period
nb_population_samples = 5                  # population sampled at year indices 0, 5, ..., 20
nb_regionalizations = regions.shape[0]

# One row per valid pixel; columns are laid out as
# [static | climate x period | population samples | region labels].
dataset = np.empty((
    len(coordinates),
    nb_static
    + nb_periods * nb_dynamic
    + len(population_factors) * nb_population_samples
    + nb_regionalizations,
))

# DEM and soil type: value at year index 0.
dataset[:, :nb_static] = factors[:nb_static, 0][:, rows, cols].T

# Climate factors: per-period mean of each pixel's yearly series.
idx = nb_static
pixel_series = factors[idx:idx + nb_dynamic][:, :, rows, cols]  # (factor, year, pixel)
for k in range(nb_periods):
    period_mean = pixel_series[:, frontiers[k]:frontiers[k + 1]].mean(axis=1)
    # Factor j, period k lives in column idx + j*nb_periods + k.
    dataset[:, idx + k:idx + nb_dynamic * nb_periods:nb_periods] = period_mean.T

# Population: one sample every 5 year-indices.
idx += nb_dynamic * nb_periods
population = factors[len(climate_factors) + len(DEM_factors)]
sample_years = [5 * j for j in range(nb_population_samples)]
dataset[:, idx:idx + nb_population_samples] = population[sample_years][:, rows, cols].T

# Targets: the region label of each pixel in every regionalization.
idx += nb_population_samples
dataset[:, idx:idx + nb_regionalizations] = regions[:, rows, cols].T
        
    

        
        

In [None]:
## Create the columns list 

In [None]:
# soil_type stays a single column; every other climate factor gets one column
# per averaging period, suffixed with the period's first year (2002, 2007, ...).
climate_factors_ext = ['soil_type'] + [
    f"{climate_factors[factor_idx]}_{2002 + period * 5}"
    for factor_idx in range(1, len(climate_factors))
    for period in range(4)
]

# Population density: one column per 5-year step starting at 2000.
population_factors_ext = [
    f"{'population_density_GPWv4'}_{2000 + step * 5}" for step in range(5)
]

# One target column per regionalization (5 to 8 regions).
regions_ext = [f"region{count}" for count in range(5, 9)]

# Final column order: terrain, climate, population, then the targets.
columns = DEM_factors + climate_factors_ext + population_factors_ext + regions_ext

## Create a pandas dataframe

In [None]:
# Label the per-pixel matrix with the column names built above.
df = pd.DataFrame(data=dataset, columns=columns)
# Bare last expression: rendered as the cell's rich output.
df

## Compute statistical data on the Dataframe for all the columns

In [None]:

# Lift pandas' column-display cap so every column of the summary is shown.
pd.options.display.max_columns = None

# Summary statistics for all columns.
df.describe(include='all')


## Have a list of all the features

In [None]:
# Predictor columns: every dataset column except the region targets.
features = [*DEM_factors, *climate_factors_ext, *population_factors_ext]

# Train a Random Forest Classifier for each regionalization and display:
1. The accuracy of the training and the test
1. The accuracy, the precision, the recall, and the F1-score of the prediction
1. The confusion Matrix
1. The most important features

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from IPython.display import display, Markdown

import operator
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go


# Scores collected per regionalization, in the order region5 ... region8.
accuracies_train = []
accuracies_test = []
precisions_test = []
recalls_test = []
f1s_test = []
misclassified_rows = []  # not filled by this cell; kept because the name was already defined here

# Fixed seed so the split and the forest are reproducible across runs.
RANDOM_STATE = 42

for i in range(5, 9):

    # Predictors are all non-target columns; the target is this regionalization's label.
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(regions_ext, axis=1),
        df[f"region{i}"],
        test_size=0.2,
        random_state=RANDOM_STATE,
    )

    clf = RandomForestClassifier(random_state=RANDOM_STATE)
    clf.fit(X_train, y_train)

    y_train_pred = clf.predict(X_train)
    accuracies_train.append(accuracy_score(y_train, y_train_pred))

    display(Markdown("---"))
    display(Markdown(f"# Results for {i} Regions"))

    # Single prediction pass on the held-out set (a duplicate, discarded call was removed).
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    display(Markdown("### Accuracy (Test and Training) comparison"))
    # Values are listed in the same order as the labels; they used to be
    # swapped, showing the test accuracy under 'Training' and vice versa.
    accuracy_df = pd.DataFrame({
        'Accuracy': ['Training', 'Test'],
        'Value': [accuracies_train[-1], accuracy]
    })
    display(accuracy_df)

    display(Markdown("### Evaluation Metrics for the Prediction"))
    metrics_df = pd.DataFrame({
        'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
        'Value': [accuracy, precision, recall, f1]
    })
    display(metrics_df)

    accuracies_test.append(accuracy)
    precisions_test.append(precision)
    recalls_test.append(recall)
    f1s_test.append(f1)

    # Confusion matrix of the held-out predictions.
    cm = confusion_matrix(y_test, y_pred)
    display(Markdown("## Confusion Matrix"))

    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

    display(Markdown("## Features importance"))
    feature_importances = clf.feature_importances_
    # Names come from the fitted columns so they always line up with the importances.
    feature_names = np.asarray(X_train.columns)

    # Indices of the features, most important first.
    sorted_idx = np.argsort(feature_importances)[::-1]

    # Show at most 10 features; clamped so a narrower dataset does not break the plot.
    top_n = min(10, len(feature_importances))
    plt.figure(figsize=(10, 6))
    plt.bar(range(top_n), feature_importances[sorted_idx][:top_n], align="center")
    plt.xticks(range(top_n), feature_names[sorted_idx][:top_n], rotation=90)
    plt.xlabel("Feature")
    plt.ylabel("Importance")
    plt.title(f"Top {top_n} Most Important Features - Random Forest Classifier")
    plt.show()
    


# Plot the Training and the Test accuracy according to the number of regions

In [None]:
import matplotlib.pyplot as plt

# Region counts used on the x-axis, in the order the scores were collected.
regions_c = list(range(5, 9))

# Draw on the current axes (created on demand, as pyplot does).
ax = plt.gca()

# Training and test accuracy curves.
ax.plot(regions_c, accuracies_train, label='Training Accuracy', marker='o')
ax.plot(regions_c, accuracies_test, label='Test Accuracy', marker='o')

# Axis labels, title and legend.
ax.set_xlabel('Number of Regions')
ax.set_ylabel('Accuracy')
ax.set_title('Training and Test Accuracy Over Region Numbers')
ax.legend()

# Write the figure to disk, then display it.
ax.figure.savefig('accuracy_plot.png')
plt.show()

# Plot the Test Precision according to the number of regions

In [None]:
import matplotlib.pyplot as plt

# Region counts used on the x-axis, in the order the scores were collected.
regions_c = list(range(5, 9))

# Draw on the current axes (created on demand, as pyplot does).
ax = plt.gca()

# Test precision curve.
ax.plot(regions_c, precisions_test, label='Test Precision', marker='o')

# Axis labels, title and legend.
ax.set_xlabel('Number of Regions')
ax.set_ylabel('Precision')
ax.set_title('Test Precision Over Region Numbers')
ax.legend()

# Write the figure to disk, then display it.
ax.figure.savefig('precision_plot.png')
plt.show()

# Plot the Test Recall according to the number of regions

In [None]:
import matplotlib.pyplot as plt

# Region counts used on the x-axis, in the order the scores were collected.
regions_c = list(range(5, 9))

# Draw on the current axes (created on demand, as pyplot does).
ax = plt.gca()

# Test recall curve.
ax.plot(regions_c, recalls_test, label='Test Recall', marker='o')

# Axis labels, title and legend.
ax.set_xlabel('Number of Regions')
ax.set_ylabel('Recall')
ax.set_title('Test Recall Over Region Numbers')
ax.legend()

# Write the figure to disk, then display it.
ax.figure.savefig('recall_plot.png')
plt.show()

# Plot the Test F1-Score according to the number of regions

In [None]:
import matplotlib.pyplot as plt

# Region counts used on the x-axis, in the order the scores were collected.
regions_c = list(range(5, 9))

# Draw on the current axes (created on demand, as pyplot does).
ax = plt.gca()

# Test F1-score curve.
ax.plot(regions_c, f1s_test, label='Test F1 Score', marker='o')

# Axis labels, title and legend.
ax.set_xlabel('Number of Regions')
ax.set_ylabel('F1 Score')
ax.set_title('Test F1 Score Over Region Numbers')
ax.legend()

# Write the figure to disk, then display it.
ax.figure.savefig('f1_plot.png')
plt.show()