In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [7]:
df = pd.read_csv("/content/stroke-data.csv")
df.head()      # displayes some of the rows from the top

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [8]:
df.shape

(5110, 12)

In [9]:
df.dropna(axis=0,inplace = True) # We are droppin rows with nulls values
df = df.drop('id', axis = 1)

In [10]:
df.drop(['Residence_type'], axis = 1, inplace = True)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,186.21,29.0,formerly smoked,1


In [11]:
gender = pd.get_dummies(df['gender'])   # one hot encoding
gender.drop(['Male'], axis = 1, inplace = True)
df.drop('gender',axis = 1, inplace = True)
df = pd.concat([gender,df], axis = 1)
df.head()

Unnamed: 0,Female,Other,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,0,67.0,0,1,Yes,Private,228.69,36.6,formerly smoked,1
2,0,0,80.0,0,1,Yes,Private,105.92,32.5,never smoked,1
3,1,0,49.0,0,0,Yes,Private,171.23,34.4,smokes,1
4,1,0,79.0,1,0,Yes,Self-employed,174.12,24.0,never smoked,1
5,0,0,81.0,0,0,Yes,Private,186.21,29.0,formerly smoked,1


In [12]:
df[['ever_married', 'work_type', 'smoking_status']] = df[['ever_married', 'work_type', 'smoking_status']].astype("category")

In [13]:
def oneHot(name, X, output_features, dropFirst):
  dataframe = df[name]
  encoded = pd.get_dummies(dataframe, drop_first = dropFirst)
  print(encoded.head())
  encoded.columns = output_features
  X.drop(name, axis = 1, inplace = True)
  X = pd.concat([encoded,X], axis  = 1)
  return X

In [14]:
featureCol = df['ever_married'].cat.categories
print(featureCol)

Index(['No', 'Yes'], dtype='object')


In [15]:
df = oneHot('ever_married',df,['Married'],True)
df.head()

   Yes
0    1
2    1
3    1
4    1
5    1


Unnamed: 0,Married,Female,Other,age,hypertension,heart_disease,work_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,0,0,67.0,0,1,Private,228.69,36.6,formerly smoked,1
2,1,0,0,80.0,0,1,Private,105.92,32.5,never smoked,1
3,1,1,0,49.0,0,0,Private,171.23,34.4,smokes,1
4,1,1,0,79.0,1,0,Self-employed,174.12,24.0,never smoked,1
5,1,0,0,81.0,0,0,Private,186.21,29.0,formerly smoked,1


In [16]:
featureCol = df['work_type'].cat.categories[1:]
df = oneHot('work_type',df,featureCol,True)

   Never_worked  Private  Self-employed  children
0             0        1              0         0
2             0        1              0         0
3             0        1              0         0
4             0        0              1         0
5             0        1              0         0


In [17]:
df.head()

Unnamed: 0,Never_worked,Private,Self-employed,children,Married,Female,Other,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,0,1,0,0,1,0,0,67.0,0,1,228.69,36.6,formerly smoked,1
2,0,1,0,0,1,0,0,80.0,0,1,105.92,32.5,never smoked,1
3,0,1,0,0,1,1,0,49.0,0,0,171.23,34.4,smokes,1
4,0,0,1,0,1,1,0,79.0,1,0,174.12,24.0,never smoked,1
5,0,1,0,0,1,0,0,81.0,0,0,186.21,29.0,formerly smoked,1


In [19]:
featureCol = df['smoking_status'].cat.categories
print(featureCol)
df = oneHot('smoking_status',df,featureCol,False)

Index(['Unknown', 'formerly smoked', 'never smoked', 'smokes'], dtype='object')
   Unknown  formerly smoked  never smoked  smokes
0        0                1             0       0
2        0                0             1       0
3        0                0             0       1
4        0                0             1       0
5        0                1             0       0


In [20]:
df.head()

Unnamed: 0,Unknown,formerly smoked,never smoked,smokes,Never_worked,Private,Self-employed,children,Married,Female,Other,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,0,1,0,0,0,1,0,0,1,0,0,67.0,0,1,228.69,36.6,1
2,0,0,1,0,0,1,0,0,1,0,0,80.0,0,1,105.92,32.5,1
3,0,0,0,1,0,1,0,0,1,1,0,49.0,0,0,171.23,34.4,1
4,0,0,1,0,0,0,1,0,1,1,0,79.0,1,0,174.12,24.0,1
5,0,1,0,0,0,1,0,0,1,0,0,81.0,0,0,186.21,29.0,1


In [21]:
df = df[df['Unknown'] == 0]

In [22]:
df.head()

Unnamed: 0,Unknown,formerly smoked,never smoked,smokes,Never_worked,Private,Self-employed,children,Married,Female,Other,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,0,1,0,0,0,1,0,0,1,0,0,67.0,0,1,228.69,36.6,1
2,0,0,1,0,0,1,0,0,1,0,0,80.0,0,1,105.92,32.5,1
3,0,0,0,1,0,1,0,0,1,1,0,49.0,0,0,171.23,34.4,1
4,0,0,1,0,0,0,1,0,1,1,0,79.0,1,0,174.12,24.0,1
5,0,1,0,0,0,1,0,0,1,0,0,81.0,0,0,186.21,29.0,1


In [24]:
df.shape

(3426, 17)

In [26]:
df.drop(['Unknown', 'formerly smoked'], axis=1, inplace = True)

In [27]:
df.head()

Unnamed: 0,never smoked,smokes,Never_worked,Private,Self-employed,children,Married,Female,Other,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,0,0,0,1,0,0,1,0,0,67.0,0,1,228.69,36.6,1
2,1,0,0,1,0,0,1,0,0,80.0,0,1,105.92,32.5,1
3,0,1,0,1,0,0,1,1,0,49.0,0,0,171.23,34.4,1
4,1,0,0,0,1,0,1,1,0,79.0,1,0,174.12,24.0,1
5,0,0,0,1,0,0,1,0,0,81.0,0,0,186.21,29.0,1


In [28]:
df['bmi'].describe()

count    3426.000000
mean       30.290047
std         7.295958
min        11.500000
25%        25.300000
50%        29.100000
75%        34.100000
max        92.000000
Name: bmi, dtype: float64

In [29]:
df['age'].describe()
df['age'] /= 100

In [31]:
standard = StandardScaler()
standardized_features = standard.fit_transform(df.iloc[:, [12,13]])
df.iloc[:, 12:14] = standardized_features
df.head()

Unnamed: 0,never smoked,smokes,Never_worked,Private,Self-employed,children,Married,Female,Other,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,0,0,0,1,0,0,1,0,0,0.67,0,1,2.523621,0.864982,1
2,1,0,0,1,0,0,1,0,0,0.8,0,1,-0.050358,0.302945,1
3,0,1,0,1,0,0,1,1,0,0.49,0,0,1.318923,0.563401,1
4,1,0,0,0,1,0,1,1,0,0.79,1,0,1.379514,-0.862253,1
5,0,0,0,1,0,0,1,0,0,0.81,0,0,1.632992,-0.176842,1


In [33]:
X = df.drop('stroke', axis = 1)
y = df['stroke']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [35]:
model = LogisticRegression()

In [36]:
model.fit(X_train,y_train)

In [37]:
predictions = model.predict(X_test)

In [38]:
cnf_matrix = confusion_matrix(y_test, predictions)
cnf_matrix

array([[969,   0],
       [ 59,   0]])

In [39]:
score = accuracy_score(y_test,predictions)
print(score)

0.9426070038910506
