In [1]:
# On this notebook I'll build the SVC model 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Read the data
cardio_data=pd.read_csv("in/cardio_train.csv")
cardio_data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [3]:
cardio_data.groupby("cardio").size()
# The classes are already balanced

cardio
0    35021
1    34979
dtype: int64

In [4]:
# Split data into features and labels
X=cardio_data.iloc[:,1:12].values
y=cardio_data["cardio"].values

In [5]:
# Standarization and spliting process
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
len(X_test)


14000

In [7]:
# Also I'll split the dataset in val test, to have 3 sets from the initial one.
# X train = 56000 / X test = 14000 / X val = 14000
# With partial train I'll train the model, with test I'll improve the model and with test I'll evaluate the model
X_val = X_train[:14000]
partial_X_train = X_train[14000:]
X_test = X_test

y_val = y_train[:14000]
partial_y_train = y_train[14000:]
y_test = y_test

In [8]:
# Standarization of variables
from sklearn.preprocessing import StandardScaler
std=StandardScaler()
X_train_std=std.fit_transform(partial_X_train)

In [9]:
# Remeber differences between fit and transform
X_test_std = std.transform(X_test)
X_val_std = std.transform(X_val)

In [10]:
len(partial_X_train)

42000

In [11]:
# Applying the SVC
from sklearn.svm import SVC
classifier_SVC = SVC().fit(X_train_std, partial_y_train)

In [12]:
# Predicting to evaluate
y_test_predicted = classifier_SVC.predict(X_test_std)
y_training_predicted = classifier_SVC.predict(X_train_std)

In [13]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
print("Training Accuracy:",metrics.accuracy_score(partial_y_train, y_training_predicted))
print("Testing Accuracy:",metrics.accuracy_score(y_test, y_test_predicted))

Training Accuracy: 0.7318095238095238
Testing Accuracy: 0.7252857142857143


In [14]:
# Confunsion matrix
cnf_matrix_training = metrics.confusion_matrix(partial_y_train, y_training_predicted)
cnf_matrix_testing = metrics.confusion_matrix(y_test,y_test_predicted)
print("The confunsion matrix results for the training part are:")
print(cnf_matrix_training)
print("The confunsion matrix results for the testing part are:")
print(cnf_matrix_testing)

The confunsion matrix results for the training part are:
[[16216  4874]
 [ 6390 14520]]
The confunsion matrix results for the testing part are:
[[5298 1658]
 [2188 4856]]


In [15]:
# cross validation to analize the accuraccy
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold_val = KFold(4)
cross_results = cross_val_score(classifier_SVC, X_train_std, partial_y_train, cv=kfold_val)

print(cross_results)
print(cross_results.mean())

[0.72095238 0.72542857 0.7232381  0.72152381]
0.7227857142857143


In [16]:
# The model has a similar accuracy for the 10 cross validation sets. 
# This could be for the strong resistant that SVC model has to overfitting.
# Next, It's time to implement hyperparameterization process to get the best version of the model. 
# And later, compare it with our RFC.

In [17]:
# Import gridsearchcv
from sklearn.model_selection import GridSearchCV

In [18]:
# Parameters 
parameters={'C': [0.01, 0.1, 1], 
            'gamma': [0.01, 0.1, 1], 
            'kernel': ['rbf',"linear"]}

In [19]:
# Building the hyperparametrization
gridsearch=GridSearchCV(estimator = classifier_SVC,
                        param_grid = parameters,
                        scoring='accuracy',
                        cv=4,
                        n_jobs=-1)

In [20]:
# Fit the hyperpatametrization
gridsearch_SVC=gridsearch.fit(X_train_std, partial_y_train)

In [21]:
# Getting our accuracy score for the model
accuracy=gridsearch_SVC.best_score_
print(accuracy)

0.7229761904761904


In [22]:
# Seeing our best parameters
gridsearch_SVC.best_params_

{'C': 1, 'gamma': 0.01, 'kernel': 'linear'}

In [23]:
# Saving the best classifier
best_SVC = gridsearch_SVC.best_estimator_

In [25]:
# The final one
classifier_SVC_F = SVC(C = 1, gamma=0.01, kernel="linear").fit(X_train_std, partial_y_train)

In [31]:
# Evaluate the model with test data (The same I used to evaluate the model before cross validation)
final_test_precit = best_SVC.predict(X_test_std)

In [32]:
# Evaluate the model with validate data (This one the model has never seen so far)
final_val_precit = best_SVC.predict(X_val_std)

In [33]:
# The final metrics
print("Final test Accuracy:",metrics.accuracy_score(y_test, final_test_precit))
print("Final validate Accuracy:",metrics.accuracy_score(y_val, final_val_precit))

Training Accuracy: 0.7159285714285715
Testing Accuracy: 0.7147857142857142


In [34]:
# Conclusion
# Since the cross validation part I could see as the model had an accuracy around 72%. 
# It shows that the model is really strong to avoid overfitting for itself.
# The idea to create a gridsearchcv was to undertand deeply the way of it works.
# Next I'll build and train a RFC in the RFclassifier.ipynb and compare which one is the best one.

In [35]:
# Save the model
import joblib
joblib.dump(best_SVC, './out/best_SVC_model.pkl')

['./out/best_SVC_model.pkl']