In [1]:
#!pip install catboost

In [2]:
#!pip install scikit-plot
#!pip install seaborn
#!pip install sklearn
#!pip install scikit-learn

In [3]:
#!pip install imblearn

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import multiprocessing
sns.set()
sns.set_context('notebook', font_scale= 1.2)

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier



from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, classification_report
import scikitplot as skplt

from imblearn.over_sampling import RandomOverSampler

In [5]:
data = pd.read_csv('D:\Sem 7\healthcare-dataset-stroke-data.csv')
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


First thing I notice is the id column and NaN value in the BMI column. I don't think ID will be useful in our analysis, so I will just drop it. 

In [6]:
data.drop('id', axis= 1, inplace= True)

In [7]:
data.shape

(5110, 11)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


We can see that Hypertension, heart disease and stroke have an integer data type but we know that they are categorical variables. So I will convert them to object datatype.

In [9]:
data[['hypertension', 'heart_disease', 'stroke']] = data[['hypertension', 'heart_disease', 'stroke']].astype(str)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   object 
 3   heart_disease      5110 non-null   object 
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   object 
dtypes: float64(3), object(8)
memory usage: 439.3+ KB


In [11]:
data.describe()

Unnamed: 0,age,avg_glucose_level,bmi
count,5110.0,5110.0,4909.0
mean,43.226614,106.147677,28.893237
std,22.612647,45.28356,7.854067
min,0.08,55.12,10.3
25%,25.0,77.245,23.5
50%,45.0,91.885,28.1
75%,61.0,114.09,33.1
max,82.0,271.74,97.6


It seems like we have some outliers in avg_glucose_level and bmi column. So lets construct a boxplot of our numerical variable to check for outliers

In [12]:
for col in ['avg_glucose_level', 'bmi']:
    data[col] = np.log(data[col])

In [13]:
data.describe()

Unnamed: 0,age,avg_glucose_level,bmi
count,5110.0,5110.0,4909.0
mean,43.226614,4.592465,3.328423
std,22.612647,0.361985,0.265064
min,0.08,4.009513,2.332144
25%,25.0,4.346982,3.157
50%,45.0,4.520538,3.33577
75%,61.0,4.736988,3.499533
max,82.0,5.604846,4.580877


Now lets try to visualise the unique values we have in our categorical features.

The number of people who actually had a stroke are very less in our dataset. We will have to keep that in mind when we split our dataset

One thing I did notice is we have a **Other** category in Gender column. I am just going to drop it for simplicity sake.

In [14]:
data['gender'].value_counts()

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

In [15]:
data.drop(data[data['gender'] == 'Other'].index, inplace= True)

Now let us try and visualise our categorical features based on our target variable

One thing I do notice is urban people have more strokes as compared to people living in rural areas

## **Filling Missing Values**

In [16]:
data.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

We only have missing values in the BMI column. I will do knn imputation to fill those missing values

In [17]:
def knn_impute(df, na_target):
    df = df.copy()
    
    numeric_df = df.select_dtypes(np.number)
    non_na_columns = numeric_df.loc[: ,numeric_df.isna().sum() == 0].columns
    
    y_train = numeric_df.loc[numeric_df[na_target].isna() == False, na_target]
    X_train = numeric_df.loc[numeric_df[na_target].isna() == False, non_na_columns]
    X_test = numeric_df.loc[numeric_df[na_target].isna() == True, non_na_columns]
    
    knn = KNeighborsRegressor()
    knn.fit(X_train, y_train)
    
    y_pred = knn.predict(X_test)
    
    df.loc[df[na_target].isna() == True, na_target] = y_pred
    
    return df

In [18]:
data1 = knn_impute(data, 'bmi')

In [19]:
data1.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

## **Encoding**

In [20]:
data2 = pd.get_dummies(data1, drop_first= True)

In [21]:
data2.head()

Unnamed: 0,age,avg_glucose_level,bmi,gender_Male,hypertension_1,heart_disease_1,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,stroke_1
0,67.0,5.432367,3.600048,1,0,1,1,0,1,0,0,1,1,0,0,1
1,61.0,5.309307,3.535493,0,0,0,1,0,0,1,0,0,0,1,0,1
2,80.0,4.662684,3.48124,1,0,1,1,0,1,0,0,0,0,1,0,1
3,49.0,5.143008,3.538057,0,0,0,1,0,1,0,0,1,0,0,1,1
4,79.0,5.159745,3.178054,0,1,0,1,0,0,1,0,0,0,1,0,1


## **Scaling**

In [22]:
s = StandardScaler()
data2[['bmi', 'avg_glucose_level', 'age']] = s.fit_transform(data2[['bmi', 'avg_glucose_level', 'age']])

In [23]:
data3 = data2.copy()
data3.head()

Unnamed: 0,age,avg_glucose_level,bmi,gender_Male,hypertension_1,heart_disease_1,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,stroke_1
0,1.051242,2.320709,1.027679,1,0,1,1,0,1,0,0,1,1,0,0,1
1,0.785889,1.980714,0.781547,0,0,0,1,0,0,1,0,0,0,1,0,1
2,1.626174,0.194204,0.574693,1,0,1,1,0,1,0,0,0,0,1,0,1
3,0.255182,1.521257,0.79132,0,0,0,1,0,1,0,0,1,0,0,1,1
4,1.581949,1.567499,-0.581283,0,1,0,1,0,0,1,0,0,0,1,0,1


It seems like our data is ready for modelling. Lets split our dataset into train and test set

## **Data Splitting**

In [24]:
oversample = RandomOverSampler(sampling_strategy='not majority')
X=data3.drop(['stroke_1'],axis=1)
y=data3['stroke_1']
X_over, y_over = oversample.fit_resample(X, y)

This oversampling step was inspired from Harshit Gupta's notebook. 
[Click Here](https://www.kaggle.com/code/casper6290/strokeprediction-99-acc) to check out the notebook

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size= 0.2, random_state= 42)

In [26]:
print(y_train.value_counts())
print(y_test.value_counts())

1    3892
0    3884
Name: stroke_1, dtype: int64
0    976
1    968
Name: stroke_1, dtype: int64


In [27]:
from sklearn.metrics import precision_score,recall_score
from sklearn.metrics import f1_score

In [28]:
#X_test.head()

Unnamed: 0,age,avg_glucose_level,bmi,gender_Male,hypertension_1,heart_disease_1,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
251,-1.558066,0.320893,-1.763829,0,0,0,0,0,1,0,0,1,0,0,0
4961,0.166731,0.214216,0.419048,1,0,0,1,0,1,0,0,1,0,0,0
3465,0.255182,-0.267789,-0.034615,0,0,0,1,0,0,0,0,0,0,1,0
2755,-1.779193,-0.304412,-1.742227,1,0,0,0,0,0,0,1,1,0,0,0
6963,0.785889,0.354325,1.110121,1,1,1,1,0,1,0,0,1,0,0,1


In [29]:
rf = RandomForestClassifier(n_estimators= 100)
rf.fit(X_train,y_train)
y_pred_rf = rf.predict(X_test)
y_pred_prob_rf = rf.predict_proba(X_test)[:, 1]


print('Accuracy:', accuracy_score(y_test, y_pred_rf))
print(f'Precission_score: {precision_score(y_test,y_pred_rf)}')
print(f'Recall_score: {recall_score(y_test,y_pred_rf)}')
print(f'F1-score: {f1_score(y_test,y_pred_rf)}')


Accuracy: 0.992798353909465
Precission_score: 0.9857433808553971
Recall_score: 1.0
F1-score: 0.9928205128205128


In [30]:
#X_test.head()

Unnamed: 0,age,avg_glucose_level,bmi,gender_Male,hypertension_1,heart_disease_1,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
251,-1.558066,0.320893,-1.763829,0,0,0,0,0,1,0,0,1,0,0,0
4961,0.166731,0.214216,0.419048,1,0,0,1,0,1,0,0,1,0,0,0
3465,0.255182,-0.267789,-0.034615,0,0,0,1,0,0,0,0,0,0,1,0
2755,-1.779193,-0.304412,-1.742227,1,0,0,0,0,0,0,1,1,0,0,0
6963,0.785889,0.354325,1.110121,1,1,1,1,0,1,0,0,1,0,0,1


In [32]:
#import pickle
#import joblib
#filename = 'brain_stroke2.pkl'
#joblib.dump(rf, filename)

In [69]:
#||||||||||||||||||||||||||||||||IMPORTANT|||||||||||||||||||||||||||||
#||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
# dict = {
#     'age' : 79,
#     'avg_glucose_level' : 174.23,
#     'bmi' : 24,
#     'gender_Male' : 0,
#     'hypertension_1' : 1,
#     'heart_disease_1' : 0,
#     'ever_married_Yes' : 1,
#     'work_type_Never_worked' : 0,
#     'work_type_Private' : 0,
#     'work_type_Self-employed' : 1,
#     'work_type_children' : 0,
#     'Residence_type_Urban' : 0,
#     'smoking_status_formerly smoked' : 0,
#     'smoking_status_never smoked' : 1,
#     'smoking_status_smokes' : 0

# }

# df = pd.DataFrame(dict, index=[0])
# for col in ['avg_glucose_level', 'bmi']:
#     df[col] = np.log(df[col])
# mean_age = 43.226614
# std_age = 22.612647
# mean_glucose = 4.592465
# std_glucose = 0.361985
# mean_bmi = 3.328423
# std_bmi = 0.265064
# df['age'] = (df['age'] - mean_age)/(std_age)
# df['avg_glucose_level'] = (df['avg_glucose_level'] - mean_glucose)/(std_glucose)
# df['bmi'] = (df['bmi'] - mean_bmi)/(std_bmi)
# y_pred = rf.predict(df)
# print(y_pred)

[1]
