In [49]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from utils import *
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.pipeline import Pipeline as ImbPipeline

In [50]:
X = pd.read_csv("TrainningInput_Dataset_myseg.csv",index_col = 0)
print(X.shape)

y = pd.read_csv("TrainningOutput_Dataset_myseg.csv",index_col=0)
print(y.shape)

X_test = pd.read_csv("TestingInput_Dataset_myseg.csv",index_col=0)

# Shuffle the data
X = X.sample(n=X.shape[0])
y = y.loc[X.index]

# Here create other feature from the original ones. Maybe compute body surface, first i did imc but what I really wanted was like the average organ size of a patient so I took instead a measure of body area.. Then remove the feature height and weight.
add_body_surface_area_feature(X)
X.drop(columns=["Height", "Weight"],axis=1,inplace=True)
add_ratio_features(X)

add_body_surface_area_feature(X_test)
X_test.drop(columns=["Height","Weight"],axis =1,inplace = True)
add_ratio_features(X_test)


(100, 8)
(100, 1)
body surface are feature added modified
body surface are feature added modified


In [51]:
pipeline = ImbPipeline([
    ("dataAugment",GaussianNoiseInjector()),
    ("normaliser" , MinMaxScaler()),
    ("classifier", RandomForestClassifier()),
])
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'transform_input', 'verbose', 'dataAugment', 'normaliser', 'classifier', 'dataAugment__noise_factor', 'dataAugment__random_state', 'normaliser__clip', 'normaliser__copy', 'normaliser__feature_range', 'classifier__bootstrap', 'classifier__ccp_alpha', 'classifier__class_weight', 'classifier__criterion', 'classifier__max_depth', 'classifier__max_features', 'classifier__max_leaf_nodes', 'classifier__max_samples', 'classifier__min_impurity_decrease', 'classifier__min_samples_leaf', 'classifier__min_samples_split', 'classifier__min_weight_fraction_leaf', 'classifier__monotonic_cst', 'classifier__n_estimators', 'classifier__n_jobs', 'classifier__oob_score', 'classifier__random_state', 'classifier__verbose', 'classifier__warm_start'])

In [52]:
param_grid = {
    'dataAugment__noise_factor' : [0.05,0.0],
    'classifier__n_estimators': [100,300,500,1000],
    'classifier__max_features': ['sqrt',0.1,0.3],
    'classifier__max_depth': [5,15],
    'classifier__min_samples_split': [2],
    'classifier__min_samples_leaf': [2],
}

In [None]:
grid_search = GridSearchCV(pipeline,param_grid=param_grid,cv=5,verbose=3,return_train_score=True)
grid_search.fit(X,y["Category"])

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5] END classifier__max_depth=5, classifier__max_features=sqrt, classifier__min_samples_leaf=2, classifier__min_samples_split=2, classifier__n_estimators=100, dataAugment__noise_factor=0.05;, score=(train=1.000, test=0.850) total time=   0.4s
[CV 2/5] END classifier__max_depth=5, classifier__max_features=sqrt, classifier__min_samples_leaf=2, classifier__min_samples_split=2, classifier__n_estimators=100, dataAugment__noise_factor=0.05;, score=(train=0.988, test=0.950) total time=   0.6s
[CV 3/5] END classifier__max_depth=5, classifier__max_features=sqrt, classifier__min_samples_leaf=2, classifier__min_samples_split=2, classifier__n_estimators=100, dataAugment__noise_factor=0.05;, score=(train=0.988, test=1.000) total time=   0.5s
[CV 4/5] END classifier__max_depth=5, classifier__max_features=sqrt, classifier__min_samples_leaf=2, classifier__min_samples_split=2, classifier__n_estimators=100, dataAugment__noise_factor=0.05

The model is fitted and now we want to properly evaluate the results. 
We select only the result from the best_params founded by the search. (Reminder that the best params are the one that provided the best mean validation score)

In [None]:
# set of parameters that gave the best cv result
best_params = grid_search.best_params_
print(f"best parameters : {best_params} ")

# Detailed result of the cross validation for each set of parameters
results = grid_search.cv_results_

# CV result for the best paramaters.

# GOAL : 
# The closest to 1 the mean score is on the val set the better
# The smallest the std on the val set the better.
best_idx = results['params'].index(best_params)
mean_train_score = results['mean_train_score'][best_idx]
mean_valid_score = results['mean_test_score'][best_idx]
std_train_score = results['std_train_score'][best_idx]
std_valid_score = results['std_test_score'][best_idx]
print(f"score validation set : {mean_valid_score,std_valid_score}")
print(f"score trainning set : {mean_train_score,std_train_score}")

{'classifier__max_depth': 5, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100, 'dataAugment__noise_factor': 0.0}
score validation set : (np.float64(0.8099999999999999), np.float64(0.07348469228349536))
score trainning set : (np.float64(1.0), np.float64(0.0))


In [21]:
# CONSTRUCTION DE LA PIPELINE D"INFERENCE

best_pipeline = grid_search.best_estimator_
inference_steps = [
    (name, step)
    for name, step in best_pipeline.steps
    if name != "dataAugment" # on ne bruite plus les données 
]
inf_pipeline = Pipeline(inference_steps)
print(inf_pipeline.get_params())

{'memory': None, 'steps': [('normaliser', StandardScaler()), ('classifier', RandomForestClassifier(max_depth=15, min_samples_leaf=2))], 'transform_input': None, 'verbose': False, 'normaliser': StandardScaler(), 'classifier': RandomForestClassifier(max_depth=15, min_samples_leaf=2), 'normaliser__copy': True, 'normaliser__with_mean': True, 'normaliser__with_std': True, 'classifier__bootstrap': True, 'classifier__ccp_alpha': 0.0, 'classifier__class_weight': None, 'classifier__criterion': 'gini', 'classifier__max_depth': 15, 'classifier__max_features': 'sqrt', 'classifier__max_leaf_nodes': None, 'classifier__max_samples': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__monotonic_cst': None, 'classifier__n_estimators': 100, 'classifier__n_jobs': None, 'classifier__oob_score': False, 'classifier__random_state': None, 'classifier__verbose': 0, 'classifier__warm_star

In [None]:
f_importance = grid_search.best_estimator_.named_steps["classifier"].feature_importances_
f_name = grid_search.best_estimator_.named_steps["normaliser"].get_feature_names_out()
feature_importance = pd.DataFrame({
    "feature": f_name,
    "importance": f_importance
})
feature_importance.sort_values("importance", ascending=False, inplace=True)
print(feature_importance)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score

# Get global trainning score on the whole trainning data 
y_pred = inf_pipeline.predict(X)
cm = confusion_matrix(y,y_pred)
acc = accuracy_score(y,y_pred)
print(acc)
print(cm)

0.99
[[20  0  0  0  0]
 [ 0 19  1  0  0]
 [ 0  0 20  0  0]
 [ 0  0  0 20  0]
 [ 0  0  0  0 20]]


In [24]:
# Get the probabilities to get better insight on the model performance
proba = inf_pipeline.predict_proba(X)


# Below 
treshold = 0.4
L = [0,0,0,0,0]
for x in proba : 
    a= 0
    index = []
    for ind,i in enumerate(x) :
        if i > treshold : 
            a+=1
            index.append(ind)
    if a >=2 : 
        for j in index : 
            L[j] +=1  
print(proba)  
print(L)

[[0.0325     0.0075     0.015      0.         0.945     ]
 [0.29983333 0.09083333 0.04583333 0.0425     0.521     ]
 [0.01       0.07866667 0.039      0.         0.87233333]
 [0.         0.         0.         1.         0.        ]
 [0.11416667 0.01       0.         0.8675     0.00833333]
 [0.03333333 0.         0.         0.96166667 0.005     ]
 [0.74       0.005      0.         0.21833333 0.03666667]
 [0.99666667 0.         0.         0.         0.00333333]
 [0.00333333 0.18       0.78666667 0.005      0.025     ]
 [0.00333333 0.002      0.         0.016      0.97866667]
 [0.         0.02833333 0.01666667 0.955      0.        ]
 [0.18583333 0.005      0.         0.78166667 0.0275    ]
 [0.         0.14833333 0.83416667 0.0075     0.01      ]
 [0.98       0.         0.         0.         0.02      ]
 [0.965      0.         0.         0.00333333 0.03166667]
 [0.         0.         0.005      0.         0.995     ]
 [0.         0.17583333 0.82416667 0.         0.        ]
 [0.         0

In [None]:
# Predict on the test set

submission_name = "submission_10.csv"
submission_dataframe = pd.DataFrame(columns=["Id","Category"])
submission_dataframe["Id"] = X_test.index + 101

y_test_pred = inf_pipeline.predict(X_test)
submission_dataframe["Category"] = y_test_pred
submission_dataframe.to_csv(os.path.join(os.getcwd(),submission_name),index=False)

print("File saved")

file saved


Below two cells to save the results : the model and the description of the method used.

In [16]:
description = "Data augmentation cleaned + MinmaxScaler + Randomforest with Gridsearch." 
other_params = "The features are just the volume of each segmentation + body surface + all the possible ratios."
name_folder = "RF_data_aug_noise_pipeline"
feature_used = f_name
informationDict = {
    "description": description,
    "model parameters" : best_params,
    "features used" : feature_used,
    "mean test accuracy with best params" : mean_valid_score ,
    "std  test with best params" : std_valid_score,
    "mean train accuracy with best params" : mean_train_score,
    "std train best params" : std_train_score,
    "other parms" : other_params,
    
}

In [17]:
import joblib
from datetime import datetime
import os

# Create a timestamp
currentDateTime = datetime.now()

# Get the base directory (current directory)
BASE_DIR = os.getcwd()
print(f"BASE_DIR: {BASE_DIR}")
RESULT_DIR = os.path.join(BASE_DIR,"output")
if not os.path.exists(RESULT_DIR):
    os.mkdir(RESULT_DIR)
    print(f"Directory created: {RESULT_DIR}")

# Create a folder named 'pipeline_<timestamp>' in the current directory
dir_name = name_folder
dir_path = os.path.join(RESULT_DIR, dir_name)

# If the directory doesn't exist, create it
if not os.path.exists(dir_path):
    os.mkdir(dir_path)
    print(f"Directory created: {dir_path}")

# Save the model inside this new folder
model_filename = 'pipeline_' + currentDateTime.strftime("%H-%M-%S") + '.pkl'
model_path = os.path.join(dir_path, model_filename)

# This is where you'd have your model defined
joblib.dump(grid_search, model_path)
print(f"Model saved to: {model_path}")


# Saving feature importance : 
feature_importance_filename ='pipeline_' + currentDateTime.strftime("%H-%M-%S") + '.csv' 
feature_importance_dir = os.path.join(dir_path,feature_importance_filename)
feature_importance.to_csv(feature_importance_dir)


# SAVING Description 
dict_filename = 'params.txt'
dict_path = os.path.join(dir_path, dict_filename)

with open(dict_path, 'w') as f:
    for key, val in informationDict.items():
        f.write(f"{key} : {val}\n")
print(f"Information about the model saved to: {dict_path}")


BASE_DIR: /Users/rplanchon/Documents/telecom/IMA/S2/IMA205/Challenge/CardiacPathoPrediction


Model saved to: /Users/rplanchon/Documents/telecom/IMA/S2/IMA205/Challenge/CardiacPathoPrediction/output/RF_data_aug_noise_pipeline/pipeline_01-08-15.pkl
Information about the model saved to: /Users/rplanchon/Documents/telecom/IMA/S2/IMA205/Challenge/CardiacPathoPrediction/output/RF_data_aug_noise_pipeline/params.txt
