In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import jaccard_score, classification_report, confusion_matrix
import pandas as pd
import geopandas as gpd
import numpy as np
import os
import pickle

from main import CRLoader as Loader, FExtractorB as FExtractor

# Columns retained from every cleaned mesh shapefile. 'id' is the key that the
# rest of the notebook uses to match mesh rows with patch ids.
to_keep = ['p_vegeta', 'len_roads', 'profile_co', 'nb_connect', 'min_connec', 'slope', 'id']

# Read the four mesh layers (1_clean.shp .. 4_clean.shp) and strip every column
# that is not listed in to_keep.
new_features = []
for i in range(1, 5):
    mesh = gpd.read_file(f'/home/stagiaire/D/D/mesh/{i}_clean.shp')
    unwanted_columns = set(mesh.columns) - set(to_keep)
    new_features.append(mesh.drop(columns=unwanted_columns))

# Stack the four trimmed layers into one table with a fresh 0..n-1 index.
all_new_features = pd.concat(new_features, ignore_index=True)

# Settings forwarded to Loader.load_data for both the test and the training sets.
sample_size = 120
batch_size = 16

# Test set: features are extracted once from patch folders "3" / "3R".
# NOTE(review): the meaning of opt_root_dir / sar_root_dir / use_8_bit is defined
# in main.py and is not visible from this notebook.
processor = Loader(opt_root_dir=f"/home/stagiaire/D/D/patchs/{3}", sar_root_dir=f"/home/stagiaire/D/D/patchs/{3}R", num_folds=1)
processor.load_data(sample_size=sample_size, batch_size=batch_size)
loader = processor.loaders[0]
feature_extractor = FExtractor(dataloader=loader, use_8_bit=True)
test_featuresC, test_labelsC, test_ids = feature_extractor.extract_features()

# test_dict maps id -> [extracted features (as a list), label].
test_dict = {}
for i in range(len(test_ids)):
    test_dict[test_ids[i]] = [test_featuresC[i].tolist(), test_labelsC[i]]

# Duplicate ids would collapse in the dict, leaving test_featuresA / test_ids
# shorter than test_featuresC and misaligning every per-sample array built later.
if len(test_dict) != len(test_ids):
    raise ValueError(
        f"test_ids contains duplicates: {len(test_ids)} samples but only "
        f"{len(test_dict)} distinct ids"
    )

# Attach the mesh attributes: entry[0] becomes (extracted features, mesh values).
# If an id occurs on several mesh rows the tuple is re-wrapped each time, so the
# last matching row is the one found at entry[0][1].
new_te_features = all_new_features[all_new_features['id'].isin(test_ids)]
for index, row in new_te_features.iterrows():
    row_values = row[new_te_features.columns != 'id'].tolist()
    test_dict[row['id']][0] = (test_dict[row['id']][0], row_values)

# An id without any mesh row still holds the bare feature list, and indexing it
# with [1] below would silently pick the second extracted feature instead of the
# mesh values. Fail loudly instead.
missing_ids = [key for key, value in test_dict.items() if not isinstance(value[0], tuple)]
if missing_ids:
    raise ValueError(
        f"No mesh features found for {len(missing_ids)} test id(s), "
        f"e.g. {missing_ids[:10]}"
    )

# Flatten the dict back into parallel lists (dict order == first-seen id order).
test_featuresA = []
test_labelsA = []
test_ids = []
for key, value in test_dict.items():
    test_ids.append(key)
    test_featuresA.append(value[0][1])
    test_labelsA.append(value[1])

# Number of train/evaluate repetitions run by the loop below.
it = 10

# One [confusion matrix, classification report, IoU] entry per repetition.
metricsC, metricsA = [], []

# Per-sample accumulators: they start with one (empty) row per test sample and
# gain one column per repetition through np.column_stack.
n_test = len(test_ids)

predictionsC = np.array([[]] * n_test)
probsC = np.array([[]] * n_test)

predictionsA = np.array([[]] * n_test)
probsA = np.array([[]] * n_test)

# Per-feature accumulators, one column per repetition (np.hstack in the loop).
# The row counts are hard-coded and must equal the length of each model's
# feature_importances_: 768 for model C and 6 for model A.
# NOTE(review): 6 matches the number of non-'id' entries in to_keep — confirm
# that every mesh file really provides all of them.
feature_importanceC = np.array([[]] * 768)
feature_importanceA = np.array([[]] * 6)

def _fit_and_score(train_X, train_y, test_X, test_y):
    """Train one random forest, evaluate it on the test set and print the scores.

    Labels and predictions are flipped (0 -> 1, anything else -> 0) before the
    metrics are computed, exactly as the two copy-pasted sections did before.

    Parameters:
        train_X, train_y: training features and labels (converted with np.array).
        test_X, test_y: test features and labels.

    Returns:
        (model, predictions_inv, probs, metrics) where
        - model is the fitted RandomForestClassifier,
        - predictions_inv is the list of flipped predictions,
        - probs is column 0 of predict_proba (probability of model.classes_[0]),
        - metrics is [confusion matrix, classification report, IoU].
    """
    # No random_state is passed, so every call grows a different forest.
    model = RandomForestClassifier(n_estimators=400)
    model.fit(np.array(train_X), np.array(train_y))

    test_matrix = np.array(test_X)
    predictions = model.predict(test_matrix)
    predictions_inv = [1 if p == 0 else 0 for p in predictions]
    test_labels_inv = [1 if l == 0 else 0 for l in test_y]

    probs = np.take(model.predict_proba(test_matrix), 0, axis=1)

    IoU = jaccard_score(test_labels_inv, predictions_inv)
    # labels=[0, 1] keeps the matrix 2x2 even when one class never occurs;
    # otherwise the fixed row/column names below would not fit and raise.
    cm = confusion_matrix(test_labels_inv, predictions_inv, labels=[0, 1])
    df_cm = pd.DataFrame(cm, index=['Actual Class 0', 'Actual Class 1'], columns=['Predicted Class 0', 'Predicted Class 1'])
    cr = classification_report(test_labels_inv, predictions_inv)

    print(f"\nJaccard index: {IoU*100: 0.1f}%\n")
    print(f"\n{df_cm}\n")
    print(f"\n{cr}\n")
    return model, predictions_inv, probs, [cm, cr, IoU]


# Repeat `it` times: reload a training sample, train model C (extracted features)
# and model A (mesh attributes), and append their test-set results as new columns.
for iteration in range(it):

    # A fresh Loader is built on every repetition (patch folders "412" / "412R").
    processor = Loader(opt_root_dir=f"/home/stagiaire/D/D/patchs/{412}", sar_root_dir=f"/home/stagiaire/D/D/patchs/{412}R", num_folds=1)
    processor.load_data(sample_size=sample_size, batch_size=batch_size)
    loader = processor.loaders[0]
    feature_extractor = FExtractor(dataloader=loader, use_8_bit=True)
    train_featuresC, train_labelsC, train_ids = feature_extractor.extract_features()

    # train_dict maps id -> [extracted features (as a list), label]. The index
    # name differs from the outer loop variable so that it is no longer shadowed.
    train_dict = {}
    for k in range(len(train_ids)):
        train_dict[train_ids[k]] = [train_featuresC[k].tolist(), train_labelsC[k]]

    # Attach the mesh attributes: entry[0] becomes (extracted features, mesh values).
    new_tr_feat = all_new_features[all_new_features['id'].isin(train_ids)]
    for index, row in new_tr_feat.iterrows():
        row_values = row[new_tr_feat.columns != 'id'].tolist()
        train_dict[row['id']][0] = (train_dict[row['id']][0], row_values)

    # An id without a mesh row still holds the bare feature list; indexing it
    # with [1] would silently use the second extracted feature as "mesh values".
    missing_ids = [key for key, value in train_dict.items() if not isinstance(value[0], tuple)]
    if missing_ids:
        raise ValueError(
            f"No mesh features found for {len(missing_ids)} training id(s), "
            f"e.g. {missing_ids[:10]}"
        )

    train_featuresA = []
    train_labelsA = []
    train_ids = []
    for key, value in train_dict.items():
        train_ids.append(key)
        train_featuresA.append(value[0][1])
        train_labelsA.append(value[1])

    # Model C: trained on the extracted feature vectors.
    rf_modelC, predictions_inv, probs, metrics = _fit_and_score(
        train_featuresC, train_labelsC, test_featuresC, test_labelsC
    )
    predictionsC = np.column_stack((predictionsC, predictions_inv))
    probsC = np.column_stack((probsC, probs))
    feature_importanceC = np.hstack((feature_importanceC, np.array(rf_modelC.feature_importances_).reshape(-1, 1)))
    metricsC.append(metrics)

    # Model A: trained on the mesh attributes only.
    rf_modelA, predictions_inv, probs, metrics = _fit_and_score(
        train_featuresA, train_labelsA, test_featuresA, test_labelsA
    )
    predictionsA = np.column_stack((predictionsA, predictions_inv))
    probsA = np.column_stack((probsA, probs))
    feature_importanceA = np.hstack((feature_importanceA, np.array(rf_modelA.feature_importances_).reshape(-1, 1)))
    metricsA.append(metrics)

In [None]:
# Collect one key per file found under patchs/1 .. patchs/4 (two directory
# levels deep). The key is the file name up to its first dot; a name that occurs
# in several folders is stored once.
data = {}
for j in range(1, 5):
    root_dir = f"/home/stagiaire/D/D/patchs/{j}"
    for folder in os.listdir(root_dir):
        root_folder = os.path.join(root_dir, folder)
        for file_name in os.listdir(root_folder):
            n = file_name.split('.')[0]
            data[n] = []

# One row per patch id; every cell starts as its own empty list.
global_list = pd.DataFrame(list(data.items()), columns=['Id', 'PredictC']).set_index('Id')
for column in ('PredictA', 'ProbaC', 'ProbaA'):
    global_list[column] = [[] for _ in range(len(global_list))]

# Fill in the rows of the test samples with their per-repetition predictions and
# probabilities; all other rows keep their empty lists. The loop variable is not
# called `id`, so the builtin id() stays usable in the rest of the session.
# NOTE(review): a test id that is absent from the patch folders is not handled
# explicitly here — confirm how DataFrame.at behaves for it.
for row_number, sample_id in enumerate(test_ids):
    key = str(sample_id)
    global_list.at[key, 'PredictC'] = predictionsC[row_number].tolist()
    global_list.at[key, 'PredictA'] = predictionsA[row_number].tolist()
    global_list.at[key, 'ProbaC'] = probsC[row_number].tolist()
    global_list.at[key, 'ProbaA'] = probsA[row_number].tolist()

# Persist the table; the file name embeds the number of repetitions `it`.
with open(f"/home/stagiaire/D/R/metrics/MA{it}3.pkl", 'wb') as f:
    pickle.dump(global_list, f)

In [None]:
from R import RGenerator

# Build one report per model. Each result gets its own name so that the report
# for model C is not lost when the report for model A is produced.
reportC = RGenerator(metricsC).report()
reportA = RGenerator(metricsA).report()
# Backward compatibility: `report` used to end up holding the model A report.
report = reportA

In [None]:
# Collapse the per-repetition importance columns of model C into a single total
# per feature (the matrix is replaced by this 1-D vector of sums).
feature_importanceC = np.array(list(map(np.sum, feature_importanceC)))

# Rank the feature positions from most to least important and keep the best ones.
n_top_features = 10
descending_order = np.argsort(feature_importanceC)[::-1]
sorted_indices = descending_order[:n_top_features]
top_feature_importanceC = feature_importanceC[sorted_indices]
# Features are identified by their position only, so the "names" are the indices.
top_feature_names = sorted_indices

print(sorted_indices)

In [None]:
# Total importance of each model A feature, summed over all repetitions
# (the matrix is replaced by this 1-D vector of sums).
feature_importanceA = np.array([np.sum(per_run) for per_run in feature_importanceA])

# Order the feature positions by decreasing total importance.
n_top_features = 6
ranking = np.flip(np.argsort(feature_importanceA))
sorted_indices = ranking[:n_top_features]
top_feature_importanceA = feature_importanceA[sorted_indices]
# Features are identified by their position only, so the "names" are the indices.
top_feature_names = sorted_indices

print(sorted_indices)