In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.metrics import accuracy_score,classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve 
from sklearn.metrics import f1_score
from sklearn.metrics import auc

In [3]:
stroke=pd.read_csv('stroke.csv')
stroke

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [4]:
stroke.shape

(5110, 11)

In [5]:
stroke.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [6]:
#checking the null values
stroke.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [7]:
stroke[stroke==0].count()

gender                  0
age                     0
hypertension         4612
heart_disease        4834
ever_married            0
work_type               0
Residence_type          0
avg_glucose_level       0
bmi                     0
smoking_status          0
stroke               4861
dtype: int64

In [8]:
#filling null values with the mean
stroke['bmi'].fillna(stroke['bmi'].mean(), inplace= True)
#filling null values with mode
stroke['smoking_status'].fillna(stroke['smoking_status'].mode()[0], inplace=True)

In [9]:
#checking the data
stroke.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [10]:
stroke.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0
mean,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,22.612647,0.296607,0.226063,45.28356,7.698018,0.21532
min,0.08,0.0,0.0,55.12,10.3,0.0
25%,25.0,0.0,0.0,77.245,23.8,0.0
50%,45.0,0.0,0.0,91.885,28.4,0.0
75%,61.0,0.0,0.0,114.09,32.8,0.0
max,82.0,1.0,1.0,271.74,97.6,1.0


In [11]:
stroke.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                5110 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


In [12]:
# Get the counts of each unique value in the 'gender' column
gender_counts = stroke['gender'].value_counts()

# Display the count of females
num_females = gender_counts['Female']
print('Number of females:', num_females)

Number of females: 2994


In [13]:

# Display the count of males
num_males = gender_counts['Male']
print('Number of males:', num_males)

Number of males: 2115


In [14]:
#married females have more chances of heart stroke than married males
pd.pivot_table(stroke, index= 'stroke', columns='gender', values='ever_married', aggfunc= 'count')

gender,Female,Male,Other
stroke,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2853.0,2007.0,1.0
1,141.0,108.0,


In [15]:
#females with hypertension has more chance of heart stroke than males having hypertension problem
pd.pivot_table(stroke, index= 'stroke', columns='gender', values='hypertension', aggfunc= 'count')

gender,Female,Male,Other
stroke,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2853.0,2007.0,1.0
1,141.0,108.0,


In [16]:
#females with heart disease has more chances of stroke
pd.pivot_table(stroke, index= 'stroke', columns='gender', values='heart_disease', aggfunc= 'count')

gender,Female,Male,Other
stroke,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2853.0,2007.0,1.0
1,141.0,108.0,


In [17]:
#people having private jobs and has a habit of smoking has more chance of heart stroke 
pd.pivot_table(stroke, index= 'stroke', columns='work_type', values='smoking_status', aggfunc= 'count')

work_type,Govt_job,Never_worked,Private,Self-employed,children
stroke,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,624.0,22.0,2776.0,754.0,685.0
1,33.0,,149.0,65.0,2.0


In [18]:
#as age incraeses gender does not play any role in heart stroke
#can't say that marriage plays a role in heart stroke as people generally marry after the age of 25years
#with age glucose level increases which increases the chances of stroke

In [19]:
stroke['stroke'].value_counts()

stroke
0    4861
1     249
Name: count, dtype: int64

In [20]:
#ENCODING THE DATASETS
from sklearn import preprocessing 
encoder = preprocessing.LabelEncoder()
for i in stroke.columns:
    if isinstance(stroke[i][0], str):
        stroke[i] = encoder.fit_transform(stroke[i])

In [21]:
#STANDARDIZING
from sklearn.preprocessing import StandardScaler 
scalar = StandardScaler() 
scalar.fit(stroke) 
scaled_data = scalar.transform(stroke)

In [22]:
#creating a model
X= stroke.drop('stroke', axis=1)
X.shape

(5110, 10)

In [23]:
y= stroke['stroke']
y.shape

(5110,)

In [24]:
import warnings
warnings.filterwarnings('ignore')

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
np.random.seed(0)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.3, random_state = 100)

In [26]:
log = LogisticRegression()

In [27]:
log.fit(X_train,y_train)

In [28]:
log.score(X_train, y_train)

0.9496785015376014

In [29]:
#this model shows that it is overfit which is considering the high data so that we have to balance the data by some models(i.e) logistic regression

In [30]:
#to retain the original data, we craeted a copy of the dataset
stroke_copy= stroke.copy()

In [31]:
stroke_copy.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,0,61.0,0,0,1,3,0,202.21,28.893237,2,1
2,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,0,79.0,1,0,1,3,0,174.12,24.0,2,1


In [32]:
#creating a list of data values which is more in number to make a balance data
li = list(stroke_copy[stroke_copy.stroke == 0].sample(n=2000).index)
stroke_copy['stroke'].value_counts() 

stroke
0    4861
1     249
Name: count, dtype: int64

In [33]:
stroke = pd.DataFrame(stroke)
stroke.reset_index()

Unnamed: 0,index,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,1,67.0,0,1,1,2,1,228.69,36.600000,1,1
1,1,0,61.0,0,0,1,3,0,202.21,28.893237,2,1
2,2,1,80.0,0,1,1,2,0,105.92,32.500000,2,1
3,3,0,49.0,0,0,1,2,1,171.23,34.400000,3,1
4,4,0,79.0,1,0,1,3,0,174.12,24.000000,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,5105,0,80.0,1,0,1,2,1,83.75,28.893237,2,0
5106,5106,0,81.0,0,0,1,3,1,125.20,40.000000,2,0
5107,5107,0,35.0,0,0,1,3,0,82.99,30.600000,2,0
5108,5108,1,51.0,0,0,1,2,0,166.29,25.600000,1,0


In [34]:
X = stroke.drop(['stroke'], axis=1).values 
y = stroke['stroke'].values
X.shape

(5110, 10)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3,random_state=1000)


In [36]:
log.fit(X_train, y_train)

In [37]:
#the accuracy has dropped
log.score(X_test, y_test)

0.9478147423352903

In [38]:
#predicting the output with Logistic
y_underlog = log.predict(X)

In [39]:
print('The accuracy score of the model is:', accuracy_score(y,y_underlog)*100)
print('The F1 score of the model is:', f1_score(y, y_underlog)*100)
print('The recall score of the model is:', recall_score(y, y_underlog)*100)
print('The confusion matrix of the model is:', confusion_matrix(y, y_underlog))
print('The classification report of logistic model is:', classification_report(y, y_underlog))

The accuracy score of the model is: 95.14677103718199
The F1 score of the model is: 0.7999999999999998
The recall score of the model is: 0.4016064257028112
The confusion matrix of the model is: [[4861    0]
 [ 248    1]]
The classification report of logistic model is:               precision    recall  f1-score   support

           0       0.95      1.00      0.98      4861
           1       1.00      0.00      0.01       249

    accuracy                           0.95      5110
   macro avg       0.98      0.50      0.49      5110
weighted avg       0.95      0.95      0.93      5110



In [40]:
import pickle
fname='strokeNew.pkl'
pickle.dump(log,open(fname,'wb'))

In [41]:
load_model=pickle.load(open(fname,'rb'))

In [42]:
score=load_model.score(X_test,y_test)
score

0.9478147423352903