In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('Dataset.csv')
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,61250,21139,2,165,90.0,150,100,3,2,0,0,1,1
1,8444,15946,2,173,70.0,120,80,1,1,0,0,1,1
2,47236,21255,1,148,67.0,130,80,1,1,0,0,1,0
3,49559,22739,1,155,52.0,120,80,1,1,0,0,1,1
4,84574,16761,1,164,68.0,110,70,1,1,0,0,1,1


In [4]:
X = df.drop('cardio',axis=1)  
y = df['cardio'] 

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)

In [7]:
y_pred = rf_classifier.predict(X_test)

In [8]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.726031746031746


In [9]:
df.shape

(63000, 13)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63000 entries, 0 to 62999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           63000 non-null  int64  
 1   age          63000 non-null  int64  
 2   gender       63000 non-null  int64  
 3   height       63000 non-null  int64  
 4   weight       63000 non-null  float64
 5   ap_hi        63000 non-null  int64  
 6   ap_lo        63000 non-null  int64  
 7   cholesterol  63000 non-null  int64  
 8   gluc         63000 non-null  int64  
 9   smoke        63000 non-null  int64  
 10  alco         63000 non-null  int64  
 11  active       63000 non-null  int64  
 12  cardio       63000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.2 MB


In [11]:
df.isnull()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62995,False,False,False,False,False,False,False,False,False,False,False,False,False
62996,False,False,False,False,False,False,False,False,False,False,False,False,False
62997,False,False,False,False,False,False,False,False,False,False,False,False,False
62998,False,False,False,False,False,False,False,False,False,False,False,False,False


In [12]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [13]:
print('Train Accuracy: ', clf.score(X_train,y_train))
print('Test Accuracy: ', clf.score(X_test,y_test))

Train Accuracy:  1.0
Test Accuracy:  0.7257936507936508


In [14]:
from sklearn.model_selection import StratifiedKFold

X = df.drop('cardio', axis=1).values
y = df['cardio'].values

kfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

for train_index, test_index in kfold.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    
clf.fit(X_train,y_train)

print('Train Accuracy: ', clf.score(X_train,y_train))
print('Test Accuracy: ', clf.score(X_test,y_test))

Train Accuracy:  1.0
Test Accuracy:  0.7183333333333334


In [38]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [100, 200, 300], 'max_depth': [5, 10, 20]}

rand_forest = RandomForestClassifier()

grid_search = GridSearchCV(clf, param_grid, cv=5)

grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print('Best Parameters: ', best_params)

Best Parameters:  {'max_depth': 10, 'n_estimators': 200}


In [39]:
best_rand_forest = RandomForestClassifier(**best_params)

best_rand_forest.fit(X_train, y_train)

print('Train Accuracy: ', best_rand_forest.score(X_train,y_train))
print('Test Accuracy: ', best_rand_forest.score(X_test,y_test))

Train Accuracy:  0.7574206349206349
Test Accuracy:  0.7296825396825397
