In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [20]:
data = pd.read_csv('../../CBlog2/predicting_heart_failure/heart_failure_clinical_records_dataset.csv')

X = data.iloc[:,:-1]
y = data.iloc[:,-1]

#Loop to seperate X data into categorical and numerical groups
#The categorical features are booleans, thus the script classifies any column\
#with equa or less than 2 unique values as a categorical
categoricals_columns = []
numericals_columns = []
for i in list(X.columns):
    if data[i].nunique() > 2:
        numericals_columns.append(i)
    else:
        categoricals_columns.append(i)

categoricals = X[categoricals_columns]
numericals = X[numericals_columns]

ss = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ('numericals',
        ss,
        numericals_columns
        )
    ]
)
numericals_scaled = pd.DataFrame(preprocessor.fit_transform(X), columns=numericals_columns)
X_processed = pd.concat([numericals_scaled,categoricals], axis=1)


X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2)

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [28]:
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()

rf_param_grid = {
    'max_depth':range(2,7),
    'n_estimators':range(10,30)
}

gb_param_grid = {
    'loss':['deviance','exponential'],
    'max_depth':range(2,7),
    'max_features':['sqrt','log2']
}

def model_fitter(model):
    model.fit(X_train, y_train)
    print('Training Score: {}'.format(model.score(X_train, y_train)))
    y_pred = model.predict(X_test)
    print('Test Score {}:'.format(accuracy_score(y_test, y_pred)))
    
def grid_search(model,param_grid):
    grid_search= GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        n_jobs=-1,
        cv=5
    )
    grid_search_rf.fit(
        X_train,
        y_train
    )
    print('Best Parameters were: {}'.format(grid_search.best_params_))
    print('Best CrossVal Score was: {}'.format(grid_search.best_score_))

In [29]:
grid_search(rf, rf_param_grid)

Best Parameters were: {'max_depth': 6, 'n_estimators': 16}
Best CrossVal Score was: 0.8703014184397162


In [30]:
grid_search(gb, gb_param_grid)

Best Parameters were: {'loss': 'exponential', 'max_depth': 2, 'max_features': 'log2'}
Best CrossVal Score was: 0.8494680851063829


In [31]:
updated_rf = RandomForestClassifier(max_depth=6, n_estimators=16)
model_fitter(updated_rf)

Training Score: 0.9790794979079498
Test Score 0.8666666666666667:
