In [1]:
# On this notebook I'll build the SVC model 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Read the data
cardio_data=pd.read_csv("in/cardio_train.csv")
cardio_data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [3]:
cardio_data.groupby("cardio").size()
# The classes are already balanced

cardio
0    35021
1    34979
dtype: int64

In [4]:
# Split data into features and labels
X=cardio_data.iloc[:,1:12].values
y=cardio_data["cardio"].values

In [5]:
# Standarization and spliting process
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
# Also I'll split the dataset in val test, to have 3 sets from the initial one.
# X train = 56000 / X test = 14000 / X val = 14000
# With partial train I'll train the model, with test I'll improve the model and with test I'll evaluate the model
X_val = X_train[:14000]
partial_X_train = X_train[14000:]
X_test = X_test

y_val = y_train[:14000]
partial_y_train = y_train[14000:]
y_test = y_test

In [7]:
# Standarization of variables
from sklearn.preprocessing import StandardScaler
std=StandardScaler()
X_train_std=std.fit_transform(partial_X_train)

In [8]:
# Remeber differences between fit and transform
X_test_std = std.transform(X_test)
X_val_std = std.transform(X_val)

In [9]:
# Applying the SVC
from sklearn.ensemble  import RandomForestClassifier
classifier_RFC = RandomForestClassifier().fit(X_train_std, partial_y_train)

In [10]:
# Predicting to evaluate
y_test_predicted = classifier_RFC.predict(X_test_std)
y_training_predicted = classifier_RFC.predict(X_train_std)

In [11]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
print("Training Accuracy:",metrics.accuracy_score(partial_y_train, y_training_predicted))
print("Testing Accuracy:",metrics.accuracy_score(y_test, y_test_predicted))

Training Accuracy: 0.9998333333333334
Testing Accuracy: 0.7198571428571429


In [12]:
# Confunsion matrix
cnf_matrix_training = metrics.confusion_matrix(partial_y_train, y_training_predicted)
cnf_matrix_testing = metrics.confusion_matrix(y_test,y_test_predicted)
print("The confunsion matrix results for the training part are:")
print(cnf_matrix_training)
print("The confunsion matrix results for the testing part are:")
print(cnf_matrix_testing)

The confunsion matrix results for the training part are:
[[20950     3]
 [    4 21043]]
The confunsion matrix results for the testing part are:
[[5192 1934]
 [1988 4886]]


In [13]:
# cross validation to analize the accuraccy
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold_val = KFold(5)
cross_results = cross_val_score(classifier_RFC, X_train_std, partial_y_train, cv=kfold_val)

print(cross_results)
print(cross_results.mean())

[0.70988095 0.72309524 0.71190476 0.72107143 0.70238095]
0.7136666666666667


In [14]:
# In this case I found the first difference between SVC and RFC. It is RFC has a better accuracy in the training data
# But, in comparison with the test data is not the best. So here, we have overfitting. But, at the same time. 
# We have a better performance.

In [19]:
# Import gridsearchcv
from sklearn.model_selection import GridSearchCV

# Parameters grid
parameters={'n_estimators': [75, 100, 125, 150], 
            'max_features': ['sqrt', "log2"], 
            'max_depth': [5, 10, 15, 20, 25],
            'min_samples_split': [2, 4, 6, 8],
            'min_samples_leaf': [1, 2, 4, 6],
            'criterion': ['gini', 'entropy']}

# Building the hyperparametrizator
gridsearch=GridSearchCV(estimator = classifier_RFC,
                        param_grid = parameters,
                        scoring='accuracy',
                        cv=4,
                        n_jobs=-1)

In [20]:
# Fit the hyperpatametrization
gridsearch_RFC=gridsearch.fit(X_train_std, partial_y_train)

In [21]:
# Getting our accuracy score for the model
gridsearch_RFC.best_score_

0.7346666666666667

In [22]:
# Seeing our best parameters
gridsearch_RFC.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 75}

In [23]:
# Saving the best classifier
best_RFC = gridsearch_RFC.best_estimator_

In [24]:
# Evaluate the model with test data (The same I used to evaluate the model before cross validation)
final_test_precit = best_RFC.predict(X_test_std)

# Evaluate the model with validate data (This one the model has never seen so far)
final_val_precit = best_RFC.predict(X_val_std)

In [25]:
# The final metrics
print("Final test Accuracy:",metrics.accuracy_score(y_test, final_test_precit))
print("Final validate Accuracy:",metrics.accuracy_score(y_val, final_val_precit))

Final test Accuracy: 0.7430714285714286
Final validate Accuracy: 0.7332142857142857


In [None]:
# Conclusion
# The final result is better than SVC model, but The random forest is still not enough to get a good accuracy
# Probabily the problem is due to the low correlation among the features, so is hard for the model learn the real patterns.

In [26]:
# Save the model
import joblib
joblib.dump(best_RFC, './out/best_RFC_model.pkl')

['./out/best_RFC_model.pkl']