In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly as px
%matplotlib inline

In [2]:
stroke_data = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [3]:
stroke_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
stroke_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [5]:
stroke_data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [6]:
stroke_data.shape

(5110, 12)

In [7]:
stroke_data['stroke'].value_counts()

stroke
0    4861
1     249
Name: count, dtype: int64

In [8]:
stroke_data.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [9]:
column_name = 'bmi'
Average = stroke_data[column_name].mean()
stroke_data[column_name].fillna(Average, inplace=True)

In [10]:
print(stroke_data[column_name].isnull().sum())

0


In [11]:
stroke_data['gender'].replace({
    "Male":1,
    "Female":0,
     "Other":2}, inplace=True)

In [12]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
stroke_data['smoking_status'] = encoder.fit_transform(stroke_data['smoking_status'])

# 0 - NO STROKE
 
# 1 - STROKE 

In [13]:
X = stroke_data.drop(columns=['stroke','work_type','Residence_type', 'ever_married'], axis=1)
Y = stroke_data['stroke']

In [14]:
X


Unnamed: 0,id,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status
0,9046,1,67.0,0,1,228.69,36.600000,1
1,51676,0,61.0,0,0,202.21,28.893237,2
2,31112,1,80.0,0,1,105.92,32.500000,2
3,60182,0,49.0,0,0,171.23,34.400000,3
4,1665,0,79.0,1,0,174.12,24.000000,2
...,...,...,...,...,...,...,...,...
5105,18234,0,80.0,1,0,83.75,28.893237,2
5106,44873,0,81.0,0,0,125.20,40.000000,2
5107,19723,0,35.0,0,0,82.99,30.600000,2
5108,37544,1,51.0,0,0,166.29,25.600000,1


In [15]:
Y

0       1
1       1
2       1
3       1
4       1
       ..
5105    0
5106    0
5107    0
5108    0
5109    0
Name: stroke, Length: 5110, dtype: int64

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=42)

In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
scaler = StandardScaler()

In [20]:
scaler.fit(X)

In [21]:
Standardized_data = scaler.transform(X)

In [22]:
X = Standardized_data

In [23]:
from sklearn.svm import SVC

In [24]:
svm = SVC(kernel='linear', random_state=42)

In [25]:
svm.fit(X_train, Y_train)

In [27]:
y_pred = svm.predict(X_test)

In [28]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [33]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [34]:
accuracy_score(Y_test, y_pred)

0.9324853228962818

In [35]:
confusion_matrix(Y_test, y_pred)

array([[952,   8],
       [ 61,   1]], dtype=int64)

In [32]:
import pickle as pkl

In [36]:
filename = 'Stroke_pred_model.pkl'
pkl.dump(svm, open(filename, 'wb'))