In [1]:
import os
os.chdir('../')

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('data/pre-model/data.csv')

In [3]:
X, y  = df.drop('Estado del egreso', axis = 1), df['Estado del egreso']

In [4]:
rf_params = {
    'n_estimators': [200, 300, 350],
    'max_depth': [6, 10, 12],
    'min_samples_leaf': [2, 4, 6],
    'min_samples_split': [2, 4, 10],
    'criterion': ['gini', 'entropy']
}

In [6]:
rf = RandomForestClassifier(random_state = 1)

In [7]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
grid = GridSearchCV(rf, rf_params,scoring= 'f1', cv=skf)
grid.fit(X,y)
print(grid.best_params_,grid.best_score_)
best= grid.best_estimator_

{'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 6, 'min_samples_split': 2, 'n_estimators': 200} 0.8933333333333333


In [11]:
from sklearn.feature_selection import SelectFromModel
import copy

selector = SelectFromModel(best,threshold='mean').fit(X,y)
X_select = selector.transform(X)

feature_names = X.columns[selector.get_support()]
X_select_df = pd.DataFrame(X_select, columns=feature_names)

rf_ = copy.deepcopy(rf)
tuned_rf = rf.fit(X_select_df, y)

feature_names

Index(['Ventilacion', 'Shock septico', 'Trastorno metabòlico',
       'Acidosis metabòlica'],
      dtype='object')

In [13]:
f1 = cross_val_score(tuned_rf, X, y, cv=skf, scoring='f1').mean()
f1

0.831048951048951

In [14]:
import pickle as pkl
with open("models/rf.pkl",'wb') as f:
    pkl.dump(tuned_rf,f)
