# Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# EDA

In [2]:
df = pd.read_csv('kaggle2.csv',index_col='id')
df.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0
1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0
2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown,0
3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0
4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0


### Checking for Null values

In [3]:
df.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

### Dataset info

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15304 entries, 0 to 15303
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             15304 non-null  object 
 1   age                15304 non-null  float64
 2   hypertension       15304 non-null  int64  
 3   heart_disease      15304 non-null  int64  
 4   ever_married       15304 non-null  object 
 5   work_type          15304 non-null  object 
 6   Residence_type     15304 non-null  object 
 7   avg_glucose_level  15304 non-null  float64
 8   bmi                15304 non-null  float64
 9   smoking_status     15304 non-null  object 
 10  stroke             15304 non-null  int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 1.4+ MB


### Label Encoding Categorical columns

In [5]:
from sklearn.preprocessing import LabelEncoder
cols =['gender','ever_married','work_type','Residence_type','smoking_status']
le = LabelEncoder()
for col in cols:
    df[col] = le.fit_transform(df[col])

In [6]:
df.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1,28.0,0,0,1,2,1,79.53,31.1,2,0
1,1,33.0,0,0,1,2,0,78.44,23.9,1,0
2,0,42.0,0,0,1,2,0,103.0,40.3,0,0
3,1,56.0,0,0,1,2,1,64.87,28.8,2,0
4,0,24.0,0,0,0,2,0,73.36,28.8,2,0


In [7]:
df.shape

(15304, 11)

### Splitting the dataset into train and test

In [8]:
from sklearn.model_selection import train_test_split
X = df.drop('stroke',axis=1)
y = df['stroke']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((12243, 10), (3061, 10), (12243,), (3061,))

# Model Training

### Creating a baseline

In [9]:
from sklearn.dummy import DummyClassifier
dc = DummyClassifier(strategy='stratified',random_state=1)
dc.fit(X_train,y_train)
dc.score(X_test,y_test)

0.9238810846128717

### Testing other models to beat the baseline model

In [12]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=300 ,random_state=1)
lr.fit(X_train,y_train)
lr.score(X_test,y_test)

0.9588369813786344

In [11]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train,y_train)
rf.score(X_test,y_test)

0.9578569095066971

In [15]:
from sklearn.ensemble import AdaBoostClassifier
ab = AdaBoostClassifier(random_state=1)
ab.fit(X_train,y_train)
ab.score(X_test,y_test)

0.9594903626265926

In [16]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(random_state=1)
gbc.fit(X_train,y_train)
gbc.score(X_test,y_test)

0.9585102907546553

### As AdaBoostClassifier has the heighest accuracy among these models and beats the baseline model we will finetune AdaBoost

#### We will use RandomizedSearchCV

In [17]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}

# Define the AdaBoostClassifier model
model = AdaBoostClassifier(random_state=1)

# Perform hyperparameter search using RandomizedSearchCV
random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=10, cv=5, scoring='accuracy')
random_search.fit(X, y)

best_params = random_search.best_params_
best_score = random_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best Score (Accuracy):", best_score)

Best Hyperparameters: {'n_estimators': 200, 'learning_rate': 0.1, 'algorithm': 'SAMME.R'}
Best Score (Accuracy): 0.958834376394574


# Hyperparameter tuned model

In [23]:
from sklearn.ensemble import AdaBoostClassifier
ab_tuned = AdaBoostClassifier(n_estimators=200,learning_rate=1,algorithm='SAMME.R',random_state=1)
ab_tuned.fit(X_train, y_train)
ab_tuned.score(X_test, y_test)

0.9601437438745508

# Evaluation metrics

In [29]:
from sklearn.metrics import classification_report
classification_report(y_test,ab_tuned.predict(X_test))

'              precision    recall  f1-score   support\n\n           0       0.96      1.00      0.98      2936\n           1       0.59      0.08      0.14       125\n\n    accuracy                           0.96      3061\n   macro avg       0.78      0.54      0.56      3061\nweighted avg       0.95      0.96      0.95      3061\n'