<a href="https://colab.research.google.com/github/Ismail-Amodu/Cross-Validation-In-Machine-Learning/blob/main/Cross_Validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**COMPARING DIFFERENT MODELS**

In [1]:
# Importing the dependencies
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [2]:
# Importing models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

**DATA COLLECTION AND PROCESSING**

In [3]:
# Loading the CSV dataset into a pandas DataFrame
# NOTE(review): '/content/...' is an absolute Colab path — presumably the file must be uploaded to the runtime first; confirm
heart_data = pd.read_csv('/content/heart_disease.csv')

In [4]:
# Printing the first five rows of the dataframe
heart_data.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,3.0,30.0,0,0,8,0,1,1,4,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,0,13,0,0,1,4,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,1,10,0,1,1,2,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,0,12,0,0,0,3,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,0,5,0,0,1,4,8.0,0,0,0


In [5]:
# Printing the last five rows of the dataframe
heart_data.tail()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
319790,1,27.41,1,0,0,7.0,0.0,1,1,9,1,1,0,2,6.0,1,0,0
319791,0,29.84,1,0,0,0.0,0.0,0,1,4,1,0,1,4,5.0,1,0,0
319792,0,24.24,0,0,0,0.0,0.0,0,0,6,1,0,1,3,6.0,0,0,0
319793,0,32.81,0,0,0,0.0,0.0,0,0,2,1,0,0,3,12.0,0,0,0
319794,0,46.56,0,0,0,0.0,0.0,0,0,13,1,0,1,3,8.0,0,0,0


In [6]:
# Getting the number of rows and columns in the dataframe
heart_data.shape

(319795, 18)

In [7]:
# Getting the column names, non-null counts and dtypes of the dataframe
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  int64  
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  int64  
 3   AlcoholDrinking   319795 non-null  int64  
 4   Stroke            319795 non-null  int64  
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  int64  
 8   Sex               319795 non-null  int64  
 9   AgeCategory       319795 non-null  int64  
 10  Race              319795 non-null  int64  
 11  Diabetic          319795 non-null  int64  
 12  PhysicalActivity  319795 non-null  int64  
 13  GenHealth         319795 non-null  int64  
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  int64  
 16  KidneyDisease     31

In [8]:
# Counting the missing values in each column
heart_data.isnull().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [9]:
# Obtaining summary statistics (count, mean, std, min, quartiles, max) for each column
heart_data.describe()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
count,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0
mean,0.085595,28.325399,0.412477,0.068097,0.03774,3.37171,3.898366,0.13887,0.475273,7.514536,0.514048,0.194002,0.775362,3.595028,7.097075,0.134061,0.036833,0.093244
std,0.279766,6.3561,0.492281,0.251912,0.190567,7.95085,7.955235,0.345812,0.499389,3.564759,1.107419,0.496776,0.417344,1.042918,1.436007,0.340718,0.188352,0.290775
min,0.0,12.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,0.0,24.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,1.0,3.0,6.0,0.0,0.0,0.0
50%,0.0,27.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,1.0,4.0,7.0,0.0,0.0,0.0
75%,0.0,31.42,1.0,0.0,0.0,2.0,3.0,0.0,1.0,10.0,0.0,0.0,1.0,4.0,8.0,0.0,0.0,0.0
max,1.0,94.85,1.0,1.0,1.0,30.0,30.0,1.0,1.0,13.0,5.0,3.0,1.0,5.0,24.0,1.0,1.0,1.0


In [10]:
# Checking the distribution of the target variable to see whether the two classes are roughly balanced
heart_data['HeartDisease'].value_counts()

HeartDisease
0    292422
1     27373
Name: count, dtype: int64

Note: 1 denotes a person with heart disease and 0 denotes a person without heart disease.

In [11]:
# Separate the label column (Y) from the predictor columns (X), then show both
Y = heart_data['HeartDisease']
X = heart_data.drop(columns=['HeartDisease'])
print(X, Y, sep='\n')

          BMI  Smoking  AlcoholDrinking  Stroke  PhysicalHealth  MentalHealth  \
0       16.60        1                0       0             3.0          30.0   
1       20.34        0                0       1             0.0           0.0   
2       26.58        1                0       0            20.0          30.0   
3       24.21        0                0       0             0.0           0.0   
4       23.71        0                0       0            28.0           0.0   
...       ...      ...              ...     ...             ...           ...   
319790  27.41        1                0       0             7.0           0.0   
319791  29.84        1                0       0             0.0           0.0   
319792  24.24        0                0       0             0.0           0.0   
319793  32.81        0                0       0             0.0           0.0   
319794  46.56        0                0       0             0.0           0.0   

        DiffWalking  Sex  A

In [12]:
# Hold out 20% of the rows as test data; stratifying on Y keeps the class ratio the same in both parts
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=3,
)
print(X.shape, X_train.shape, X_test.shape)

(319795, 17) (255836, 17) (63959, 17)


**MODEL TRAINING: Comparing the models' performance**

In [15]:
# List of models to compare.
# RandomForestClassifier is seeded so that repeated runs produce the same score.
models = [
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=3),
]
def compare_models_train_test():
    """Fit every model in the notebook-level ``models`` list and print its test accuracy.

    Reads the notebook-level ``models``, ``X_train``, ``Y_train``, ``X_test``
    and ``Y_test``. Each model object is fitted in place; nothing is returned.
    """
    for estimator in models:
        # Train on the training split
        estimator.fit(X_train, Y_train)

        # Predict the held-out split and report the share of correct labels
        predicted_labels = estimator.predict(X_test)
        print('Accuracy score of the ', estimator, '=', accuracy_score(Y_test, predicted_labels))

In [None]:
# Running the train/test comparison for every model in the list
# NOTE(review): the saved output shows only the LogisticRegression line — presumably the run stopped before the other models finished; confirm
compare_models_train_test()

Accuracy score of the  LogisticRegression(max_iter=1000) = 0.9158210728748104


**CROSS VALIDATION**

A function that compares the models using cross-validation

In [None]:
# List of models to compare.
# RandomForestClassifier is seeded so that repeated runs produce the same score.
models = [
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=3),
]

In [None]:
def compare_models_cross_validation(cv=5):
    """Print the per-fold and mean cross-validation accuracy of every model.

    Reads the notebook-level ``models``, ``X`` and ``Y``.

    Parameters
    ----------
    cv : int or cross-validation splitter, default=5
        Passed unchanged to ``cross_val_score``. The default keeps the
        original 5-fold behaviour.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    for model in models:
        cv_score = cross_val_score(model, X, Y, cv=cv)
        # Mean of the fold scores, expressed as a percentage with two decimals
        mean_accuracy = round(cv_score.mean() * 100, 2)
        print('Cross Validation Accuracies for', model, '=', cv_score)
        print('Accuracy % of the', model, mean_accuracy)
        print('-------------------------------------------------------------')