In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from utils import *

In [5]:
X = pd.read_csv("TrainningInput_Dataset_myseg.csv",index_col = 0)
print(X.shape)

y = pd.read_csv("TrainningOutput_Dataset_myseg.csv",index_col=0)
print(y.shape)


# Shuffle the data
X = X.sample(n=X.shape[0])
y = y.loc[X.index]

# Here create other feature from the original ones. Maybe compute imc of each person and divide each feature by that. Then remove the feature height and weight.
add_body_surface_area_feature(X)
X.drop(columns=["Height", "Weight"],axis=1,inplace=True)
add_ratio_features(X)

(100, 8)
(100, 1)
body surface are feature added modified


In [7]:

print(X.columns)


Index(['ED_RV_volume', 'ED_LV_volume', 'ED_MY_volume', 'ES_RV_volume',
       'ES_LV_volume', 'ES_MY_volume', 'body_surface',
       'ED_LV_volume_div_ED_MY_volume', 'ED_LV_volume_div_ES_RV_volume',
       'ED_LV_volume_div_ES_LV_volume', 'ED_LV_volume_div_ES_MY_volume',
       'ED_LV_volume_div_body_surface', 'ED_MY_volume_div_ED_LV_volume',
       'ED_MY_volume_div_ES_RV_volume', 'ED_MY_volume_div_ES_LV_volume',
       'ED_MY_volume_div_ES_MY_volume', 'ED_MY_volume_div_body_surface',
       'ES_RV_volume_div_ED_LV_volume', 'ES_RV_volume_div_ED_MY_volume',
       'ES_RV_volume_div_ES_LV_volume', 'ES_RV_volume_div_ES_MY_volume',
       'ES_RV_volume_div_body_surface', 'ES_LV_volume_div_ED_LV_volume',
       'ES_LV_volume_div_ED_MY_volume', 'ES_LV_volume_div_ES_RV_volume',
       'ES_LV_volume_div_ES_MY_volume', 'ES_LV_volume_div_body_surface',
       'ES_MY_volume_div_ED_LV_volume', 'ES_MY_volume_div_ED_MY_volume',
       'ES_MY_volume_div_ES_RV_volume', 'ES_MY_volume_div_ES_LV_volume'

In [7]:
pipeline = Pipeline([
    ("normaliser" , MinMaxScaler()),
    ("classifier", RandomForestClassifier()),
])
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'transform_input', 'verbose', 'normaliser', 'classifier', 'normaliser__clip', 'normaliser__copy', 'normaliser__feature_range', 'classifier__bootstrap', 'classifier__ccp_alpha', 'classifier__class_weight', 'classifier__criterion', 'classifier__max_depth', 'classifier__max_features', 'classifier__max_leaf_nodes', 'classifier__max_samples', 'classifier__min_impurity_decrease', 'classifier__min_samples_leaf', 'classifier__min_samples_split', 'classifier__min_weight_fraction_leaf', 'classifier__monotonic_cst', 'classifier__n_estimators', 'classifier__n_jobs', 'classifier__oob_score', 'classifier__random_state', 'classifier__verbose', 'classifier__warm_start'])

In [8]:
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_features': ['sqrt'],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1,2, 4],
}

In [9]:
grid_search = GridSearchCV(pipeline,param_grid=param_grid,cv=5,verbose=3,return_train_score=True)
grid_search.fit(X.drop(columns = ["Id"]),y["Category"])

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV 1/5] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100;, score=(train=1.000, test=0.750) total time=   0.4s
[CV 2/5] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100;, score=(train=1.000, test=0.800) total time=   0.6s
[CV 3/5] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100;, score=(train=1.000, test=0.900) total time=   0.5s
[CV 4/5] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100;, score=(train=1.000, test=0.900) total time=   0.5s
[CV 5/5] END classifier__max_depth=None, classifier__max_feat

The model is fitted and now we want to properly evaluate the results. 
We select only the result from the best_params founded by the search. (Reminder that the best params are the one that provided the best mean validation score)

In [12]:
best_params = grid_search.best_params_
print(best_params)
results = grid_search.cv_results_

# Only keep the cv result for the best paramaters.
best_idx = results['params'].index(best_params)

mean_train_score = results['mean_train_score'][best_idx]
mean_valid_score = results['mean_test_score'][best_idx]
std_train_score = results['std_train_score'][best_idx]
std_valid_score = results['std_test_score'][best_idx]
print(mean_valid_score,std_valid_score)
print(mean_train_score,std_train_score)

{'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 300}
0.85 0.05477225575051661
0.9824999999999999 0.0061237243569579785


In [13]:
print(results)

{'mean_fit_time': array([0.47662115, 1.1269908 , 0.87832651, 0.31405125, 0.52149487,
       0.84738569, 0.32910724, 0.50746398, 0.6892427 , 0.32753186,
       0.69953051, 0.72795067, 0.34104586, 0.58115959, 0.66923881,
       0.34671941, 0.78313217, 0.73352871, 0.21528335, 0.42625856,
       0.6317204 , 0.21487761, 0.42760653, 0.7763423 , 0.22087836,
       0.4582612 , 0.77836118, 0.21579638, 0.42755342, 0.83503957,
       0.26661911, 0.43335404, 0.63221374, 0.24786425, 0.43417869,
       0.63395948, 0.2186482 , 0.43138795, 0.62685876, 0.21031637,
       0.41947904, 1.17579145, 0.3699388 , 0.58502836, 0.72207031,
       0.23245578, 0.63178234, 0.67372737, 0.26909018, 0.60071173,
       0.80327501, 0.27944055, 0.61017842, 0.79840326, 0.41377201,
       0.63079314, 1.03429155, 0.36805968, 0.54647899, 0.80406957,
       0.23425341, 0.50684938, 0.86724558, 0.46434641, 0.48644009,
       1.16436853, 0.35203543, 0.59093876, 1.11448407, 0.48423905,
       0.57599921, 1.02558885, 0.58840952, 0

In [14]:
f_importance = grid_search.best_estimator_.named_steps["classifier"].feature_importances_
f_name = grid_search.best_estimator_.named_steps["normaliser"].get_feature_names_out()
feature_importance = pd.DataFrame({
    "feature": f_name,
    "importance": f_importance
})
feature_importance.sort_values("importance", ascending=False, inplace=True)
print(feature_importance)

         feature  importance
4   ES_LV_volume    0.247816
3   ES_RV_volume    0.146357
1   ED_LV_volume    0.109677
0   ED_RV_volume    0.107192
5   ES_MY_volume    0.093071
2   ED_MY_volume    0.081032
7        LV_DIFF    0.065101
6        RV_DIFF    0.048252
8        MY_DIFF    0.030285
9         Height    0.027166
10        Weight    0.022417
11  body_surface    0.021634


Below two cells to save the results : the model and the description of the method used.

In [11]:
description = "MinmaxScaler + Randomforest with Gridsearch" 
other_params = "The features are just the volume of each segmentation, also the difference between the 2 timesteps"
feature_used = f_name
informationDict = {
    "description": description,
    "model parameters" : best_params,
    "features used" : feature_used,
    "other parms" : other_params,
    
}

In [100]:
import joblib
from datetime import datetime
import os

# Create a timestamp
currentDateTime = datetime.now()

# Get the base directory (current directory)
BASE_DIR = os.getcwd()
print(f"BASE_DIR: {BASE_DIR}")
RESULT_DIR = os.path.join(BASE_DIR,"output")
if not os.path.exists(RESULT_DIR):
    os.mkdir(RESULT_DIR)
    print(f"Directory created: {RESULT_DIR}")

# Create a folder named 'pipeline_<timestamp>' in the current directory
dir_name = 'pipeline_' + currentDateTime.strftime("%H-%M-%S")
dir_path = os.path.join(RESULT_DIR, dir_name)

# If the directory doesn't exist, create it
if not os.path.exists(dir_path):
    os.mkdir(dir_path)
    print(f"Directory created: {dir_path}")

# Save the model inside this new folder
model_filename = 'pipeline_' + currentDateTime.strftime("%H-%M-%S") + '.pkl'
model_path = os.path.join(dir_path, model_filename)

# This is where you'd have your model defined
joblib.dump(grid_search, model_path)
print(f"Model saved to: {model_path}")


# Saving feature importance : 
feature_importance_filename ='pipeline_' + currentDateTime.strftime("%H-%M-%S") + '.csv' 
feature_importance_dir = os.path.join(dir_path,feature_importance_filename)
feature_importance.to_csv(feature_importance_dir)


# SAVING Description 
dict_filename = 'params.txt'
dict_path = os.path.join(dir_path, dict_filename)

with open(dict_path, 'w') as f:
    for key, val in informationDict.items():
        f.write(f"{key} : {val}\n")
print(f"Information about the model saved to: {dict_path}")


BASE_DIR: /Users/rplanchon/Documents/telecom/IMA/S2/IMA205/Challenge/CardiacPathoPrediction
Directory created: /Users/rplanchon/Documents/telecom/IMA/S2/IMA205/Challenge/CardiacPathoPrediction/output/pipeline_18-19-26
Model saved to: /Users/rplanchon/Documents/telecom/IMA/S2/IMA205/Challenge/CardiacPathoPrediction/output/pipeline_18-19-26/pipeline_18-19-26.pkl
Information about the model saved to: /Users/rplanchon/Documents/telecom/IMA/S2/IMA205/Challenge/CardiacPathoPrediction/output/pipeline_18-19-26/params.txt
