In [11]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [4]:
X = pd.read_csv("TrainningInput_Dataset.csv")
X.drop(columns=X.columns[0],axis=1,inplace=True) # just bc in my dataset i have a unused columns at the beginning
print(X.shape)

y = pd.read_csv("TrainningOutput_Dataset.csv")
y.drop(columns=y.columns[0],axis=1,inplace=True)
print(y.shape)

(100, 12)
(100, 2)


In [None]:
# Shuffle the data
X = X.sample(n=X.shape[0])
y = y.loc[X.index]
print(y.head())
print(X.head())


    Id  Category
94  95         4
21  22         3
61  62         0
19  20         2
6    7         2
    Id  ED_RV_volume  ED_LV_volume  ED_MY_volume  ES_RV_volume  ES_LV_volume  \
94  95         17350         10124          5047         11102          4519   
21  22          3028          3628          3563          1691          1272   
61  62          9704          8634          7093          5200          3730   
19  20          5468          7241          3895          5253          5989   
6    7          8371          8810          5225          6087          7887   

    ES_MY_volume  RV_DIFF  LV_DIFF  MY_DIFF  Height  Weight  
94          6578     6248     5605     1531   165.0    76.0  
21          4096     1337     2356      533   165.0    42.0  
61          8340     4504     4904     1247   172.0    74.0  
19          4275      215     1252      380   182.0   106.0  
6           5233     2284      923        8   173.0   107.0  


In [17]:
pipeline = Pipeline([
    ("normaliser" , MinMaxScaler()),
    ("classifier", RandomForestClassifier()),
])
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'transform_input', 'verbose', 'normaliser', 'classifier', 'normaliser__clip', 'normaliser__copy', 'normaliser__feature_range', 'classifier__bootstrap', 'classifier__ccp_alpha', 'classifier__class_weight', 'classifier__criterion', 'classifier__max_depth', 'classifier__max_features', 'classifier__max_leaf_nodes', 'classifier__max_samples', 'classifier__min_impurity_decrease', 'classifier__min_samples_leaf', 'classifier__min_samples_split', 'classifier__min_weight_fraction_leaf', 'classifier__monotonic_cst', 'classifier__n_estimators', 'classifier__n_jobs', 'classifier__oob_score', 'classifier__random_state', 'classifier__verbose', 'classifier__warm_start'])

In [18]:
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}

In [19]:
grid_search = GridSearchCV(pipeline,param_grid=param_grid,cv=3,verbose=3)
grid_search.fit(X.drop(columns = ["Id"]),y["Category"])

Fitting 3 folds for each of 216 candidates, totalling 648 fits
[CV 1/3] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100;, score=0.853 total time=   0.4s
[CV 2/3] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100;, score=0.727 total time=   0.5s
[CV 3/3] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100;, score=0.879 total time=   0.4s
[CV 1/3] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=200;, score=0.853 total time=   0.5s
[CV 2/3] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, clas

In [22]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_

In [23]:
description = "MinmaxScaler + Randomforest with gridsearch" 
other_params = ""

informationDict = {
    "description": description,
    "model parameters" : best_params,
    "other parms" : other_params,
    
}

In [24]:
import joblib
from datetime import datetime
import os

# Create a timestamp
currentDateTime = datetime.now()

# Get the base directory (current directory)
BASE_DIR = os.getcwd()
print(f"BASE_DIR: {BASE_DIR}")
RESULT_DIR = os.path.join(BASE_DIR,"output")
if not os.path.exists(RESULT_DIR):
    os.mkdir(RESULT_DIR)
    print(f"Directory created: {RESULT_DIR}")

# Create a folder named 'pipeline_<timestamp>' in the current directory
dir_name = 'pipeline_' + currentDateTime.strftime("%H-%M-%S")
dir_path = os.path.join(RESULT_DIR, dir_name)

# If the directory doesn't exist, create it
if not os.path.exists(dir_path):
    os.mkdir(dir_path)
    print(f"Directory created: {dir_path}")

# Save the model inside this new folder
model_filename = 'pipeline_' + currentDateTime.strftime("%H-%M-%S") + '.pkl'
model_path = os.path.join(dir_path, model_filename)

# This is where you'd have your model defined
joblib.dump(grid_search, model_path)
print(f"Model saved to: {model_path}")

# SAVING Description 
dict_filename = 'params.txt'
dict_path = os.path.join(dir_path, dict_filename)

with open(dict_path, 'w') as f:
    for key, val in informationDict.items():
        f.write(f"{key} : {val}\n")
print(f"Information about the model saved to: {dict_path}")


BASE_DIR: /Users/rplanchon/Documents/telecom/IMA/S2/IMA205/Challenge/CardiacPathoPrediction
Directory created: /Users/rplanchon/Documents/telecom/IMA/S2/IMA205/Challenge/CardiacPathoPrediction/pipeline_15-41-02
Model saved to: /Users/rplanchon/Documents/telecom/IMA/S2/IMA205/Challenge/CardiacPathoPrediction/pipeline_15-41-02/pipeline_15-41-02.pkl
Information about the model saved to: /Users/rplanchon/Documents/telecom/IMA/S2/IMA205/Challenge/CardiacPathoPrediction/pipeline_15-41-02/params.txt
