In [29]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from utils import *

In [32]:
X = pd.read_csv("TrainningInput_Dataset_myseg.csv",index_col = 0)
print(X.shape)

y = pd.read_csv("TrainningOutput_Dataset_myseg.csv",index_col=0)
print(y.shape)

X_test = pd.read_csv("TestingInput_Dataset_myseg.csv",index_col=0)

# Shuffle the data
X = X.sample(n=X.shape[0])
y = y.loc[X.index]

# Here create other feature from the original ones. Maybe compute body surface, first i did imc but what I really wanted was like the average organ size of a patient so I took instead a measure of body area.. Then remove the feature height and weight.
add_body_surface_area_feature(X)
X.drop(columns=["Height", "Weight"],axis=1,inplace=True)
add_ratio_features(X)

add_body_surface_area_feature(X_test)
X_test.drop(columns=["Height","Weight"],axis =1,inplace = True)
add_ratio_features(X_test)

X,y = augment_data(X,y)
X = X.map(lambda x: max(0, x))# because we min max after

(100, 8)
(100, 1)
body surface are feature added modified
body surface are feature added modified


In [33]:

print(X.columns)
print(X_test.columns)


Index(['ED_RV_volume', 'ED_LV_volume', 'ED_MY_volume', 'ES_RV_volume',
       'ES_LV_volume', 'ES_MY_volume', 'body_surface',
       'ED_RV_volume_div_ED_LV_volume', 'ED_RV_volume_div_ED_MY_volume',
       'ED_RV_volume_div_ES_RV_volume', 'ED_RV_volume_div_ES_LV_volume',
       'ED_RV_volume_div_ES_MY_volume', 'ED_RV_volume_div_body_surface',
       'ED_LV_volume_div_ED_RV_volume', 'ED_LV_volume_div_ED_MY_volume',
       'ED_LV_volume_div_ES_RV_volume', 'ED_LV_volume_div_ES_LV_volume',
       'ED_LV_volume_div_ES_MY_volume', 'ED_LV_volume_div_body_surface',
       'ED_MY_volume_div_ED_RV_volume', 'ED_MY_volume_div_ED_LV_volume',
       'ED_MY_volume_div_ES_RV_volume', 'ED_MY_volume_div_ES_LV_volume',
       'ED_MY_volume_div_ES_MY_volume', 'ED_MY_volume_div_body_surface',
       'ES_RV_volume_div_ED_RV_volume', 'ES_RV_volume_div_ED_LV_volume',
       'ES_RV_volume_div_ED_MY_volume', 'ES_RV_volume_div_ES_LV_volume',
       'ES_RV_volume_div_ES_MY_volume', 'ES_RV_volume_div_body_surface'

In [34]:
pipeline = Pipeline([
    ("normaliser" , MinMaxScaler()),
    ("classifier", RandomForestClassifier()),
])
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'transform_input', 'verbose', 'normaliser', 'classifier', 'normaliser__clip', 'normaliser__copy', 'normaliser__feature_range', 'classifier__bootstrap', 'classifier__ccp_alpha', 'classifier__class_weight', 'classifier__criterion', 'classifier__max_depth', 'classifier__max_features', 'classifier__max_leaf_nodes', 'classifier__max_samples', 'classifier__min_impurity_decrease', 'classifier__min_samples_leaf', 'classifier__min_samples_split', 'classifier__min_weight_fraction_leaf', 'classifier__monotonic_cst', 'classifier__n_estimators', 'classifier__n_jobs', 'classifier__oob_score', 'classifier__random_state', 'classifier__verbose', 'classifier__warm_start'])

In [50]:
param_grid = {
    'classifier__n_estimators': [300,500],
    'classifier__max_features': ['sqrt'],
    'classifier__max_depth': [5],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [4],
}

In [51]:
grid_search = GridSearchCV(pipeline,param_grid=param_grid,cv=5,verbose=3,return_train_score=True)
grid_search.fit(X,y["Category"])

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END classifier__max_depth=5, classifier__max_features=sqrt, classifier__min_samples_leaf=4, classifier__min_samples_split=2, classifier__n_estimators=300;, score=(train=0.988, test=0.950) total time=   1.5s
[CV 2/5] END classifier__max_depth=5, classifier__max_features=sqrt, classifier__min_samples_leaf=4, classifier__min_samples_split=2, classifier__n_estimators=300;, score=(train=0.988, test=1.000) total time=   1.7s
[CV 3/5] END classifier__max_depth=5, classifier__max_features=sqrt, classifier__min_samples_leaf=4, classifier__min_samples_split=2, classifier__n_estimators=300;, score=(train=0.975, test=0.950) total time=   1.4s
[CV 4/5] END classifier__max_depth=5, classifier__max_features=sqrt, classifier__min_samples_leaf=4, classifier__min_samples_split=2, classifier__n_estimators=300;, score=(train=0.988, test=0.950) total time=   0.9s
[CV 5/5] END classifier__max_depth=5, classifier__max_features=sqrt, classif

The model is fitted and now we want to properly evaluate the results. 
We select only the result from the best_params founded by the search. (Reminder that the best params are the one that provided the best mean validation score)

In [52]:
best_params = grid_search.best_params_
print(best_params)
results = grid_search.cv_results_

# Only keep the cv result for the best paramaters.
best_idx = results['params'].index(best_params)

mean_train_score = results['mean_train_score'][best_idx]
mean_valid_score = results['mean_test_score'][best_idx]
std_train_score = results['std_train_score'][best_idx]
std_valid_score = results['std_test_score'][best_idx]
print(mean_valid_score,std_valid_score)
print(mean_train_score,std_train_score)

{'classifier__max_depth': 5, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 500}
0.975 0.031622776601683784
0.9862499999999998 0.0025000000000000356


In [53]:
from sklearn.metrics import confusion_matrix,accuracy_score
y_pred = grid_search.predict(X)
acc = accuracy_score(y,y_pred)
cm = confusion_matrix(y,y_pred)
print(acc)
print(cm)

0.99
[[40  0  0  0  0]
 [ 0 38  2  0  0]
 [ 0  0 40  0  0]
 [ 0  0  0 40  0]
 [ 0  0  0  0 40]]


In [54]:
submission_name = "submission_2.csv"
submission_dataframe = pd.DataFrame(columns=["Id","Category"])
submission_dataframe["Id"] = X_test.index + 101
y_test_pred = grid_search.predict(X_test)
submission_dataframe["Category"] = y_test_pred
submission_dataframe.to_csv(os.path.join(os.getcwd(),submission_name),index=False)
print("file saved")

file saved


In [27]:
print(results)

{'mean_fit_time': array([1.66860676, 1.76617975, 5.23960056, 5.26004319, 0.82938786,
       2.50543985, 3.70865674, 5.045818  , 1.15579391, 2.10179529,
       3.34860697, 4.31891189, 0.8006321 , 1.28879881, 3.99978971,
       4.21185417, 0.79775162, 1.33413959, 2.68374391, 4.00977054,
       0.78856173, 1.60755758, 3.8933794 , 4.35188203, 0.94931269,
       1.3464457 , 2.61525879, 4.01058249, 1.0896111 , 1.61770678,
       2.80235019, 4.13106999, 0.83130341, 1.55077243, 3.23999305,
       4.38319221]), 'std_fit_time': array([0.12122048, 0.18540965, 1.15698222, 2.20758309, 0.05692167,
       0.97601986, 0.67213855, 0.75341022, 0.29449955, 0.35342636,
       0.32192284, 0.76364526, 0.0383269 , 0.00774888, 0.75945167,
       0.52689734, 0.00988459, 0.01728037, 0.10445745, 0.25124698,
       0.0031903 , 0.55067989, 0.60730857, 0.19120853, 0.06259768,
       0.0129992 , 0.00775803, 0.13571682, 0.35194696, 0.37195948,
       0.08996565, 0.09506407, 0.0056158 , 0.42412159, 0.54404629,
       

In [28]:
f_importance = grid_search.best_estimator_.named_steps["classifier"].feature_importances_
f_name = grid_search.best_estimator_.named_steps["normaliser"].get_feature_names_out()
feature_importance = pd.DataFrame({
    "feature": f_name,
    "importance": f_importance
})
feature_importance.sort_values("importance", ascending=False, inplace=True)
print(feature_importance)

                          feature  importance
32  ES_LV_volume_div_ED_LV_volume    0.054883
16  ED_LV_volume_div_ES_LV_volume    0.052234
10  ED_RV_volume_div_ES_LV_volume    0.047591
21  ED_MY_volume_div_ES_RV_volume    0.047373
31  ES_LV_volume_div_ED_RV_volume    0.046707
26  ES_RV_volume_div_ED_LV_volume    0.044277
27  ES_RV_volume_div_ED_MY_volume    0.040544
40  ES_MY_volume_div_ES_RV_volume    0.039135
29  ES_RV_volume_div_ES_MY_volume    0.037876
33  ES_LV_volume_div_ED_MY_volume    0.033385
34  ES_LV_volume_div_ES_RV_volume    0.033124
35  ES_LV_volume_div_ES_MY_volume    0.031474
41  ES_MY_volume_div_ES_LV_volume    0.031373
36  ES_LV_volume_div_body_surface    0.031370
15  ED_LV_volume_div_ES_RV_volume    0.029350
4                    ES_LV_volume    0.028310
28  ES_RV_volume_div_ES_LV_volume    0.027394
47  body_surface_div_ES_LV_volume    0.025812
7   ED_RV_volume_div_ED_LV_volume    0.024197
22  ED_MY_volume_div_ES_LV_volume    0.023812
13  ED_LV_volume_div_ED_RV_volume 

Below two cells to save the results : the model and the description of the method used.

In [18]:
description = "Data augmentation + MinmaxScaler + Randomforest with Gridsearch." 
other_params = "The features are just the volume of each segmentation + body surface + all the possible ratios."
name_folder = "RF_data_aug_fixed_max_depth_high_nbtrees_"
feature_used = f_name
informationDict = {
    "description": description,
    "model parameters" : best_params,
    "features used" : feature_used,
    "mean test accuracy with best params" : mean_valid_score ,
    "std  test with best params" : std_valid_score,
    "mean train accuracy with best params" : mean_train_score,
    "std train best params" : std_train_score,
    "other parms" : other_params,
    
}

In [19]:
import joblib
from datetime import datetime
import os

# Create a timestamp
currentDateTime = datetime.now()

# Get the base directory (current directory)
BASE_DIR = os.getcwd()
print(f"BASE_DIR: {BASE_DIR}")
RESULT_DIR = os.path.join(BASE_DIR,"output")
if not os.path.exists(RESULT_DIR):
    os.mkdir(RESULT_DIR)
    print(f"Directory created: {RESULT_DIR}")

# Create a folder named 'pipeline_<timestamp>' in the current directory
dir_name = name_folder
dir_path = os.path.join(RESULT_DIR, dir_name)

# If the directory doesn't exist, create it
if not os.path.exists(dir_path):
    os.mkdir(dir_path)
    print(f"Directory created: {dir_path}")

# Save the model inside this new folder
model_filename = 'pipeline_' + currentDateTime.strftime("%H-%M-%S") + '.pkl'
model_path = os.path.join(dir_path, model_filename)

# This is where you'd have your model defined
joblib.dump(grid_search, model_path)
print(f"Model saved to: {model_path}")


# Saving feature importance : 
feature_importance_filename ='pipeline_' + currentDateTime.strftime("%H-%M-%S") + '.csv' 
feature_importance_dir = os.path.join(dir_path,feature_importance_filename)
feature_importance.to_csv(feature_importance_dir)


# SAVING Description 
dict_filename = 'params.txt'
dict_path = os.path.join(dir_path, dict_filename)

with open(dict_path, 'w') as f:
    for key, val in informationDict.items():
        f.write(f"{key} : {val}\n")
print(f"Information about the model saved to: {dict_path}")


BASE_DIR: /Users/rplanchon/Documents/telecom/IMA/S2/IMA205/Challenge/CardiacPathoPrediction
Directory created: /Users/rplanchon/Documents/telecom/IMA/S2/IMA205/Challenge/CardiacPathoPrediction/output/RF_data_aug_fixed_max_depth_high_nbtrees_
Model saved to: /Users/rplanchon/Documents/telecom/IMA/S2/IMA205/Challenge/CardiacPathoPrediction/output/RF_data_aug_fixed_max_depth_high_nbtrees_/pipeline_23-57-01.pkl
Information about the model saved to: /Users/rplanchon/Documents/telecom/IMA/S2/IMA205/Challenge/CardiacPathoPrediction/output/RF_data_aug_fixed_max_depth_high_nbtrees_/params.txt
