In [13]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import os

In [14]:
all_stroke_data = 'dataset/healthcare-dataset-stroke-data.csv'
stroke_df = pd.read_csv(all_stroke_data)
stroke_df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [15]:
# drop rows with empty data points
clean_file = stroke_df.dropna(how = 'any')
clean_file = clean_file[clean_file.gender != "Other"]
clean_file = clean_file[clean_file.smoking_status != "Unknown"]
clean_file = clean_file[clean_file.age >= 32]
clean_file = clean_file[clean_file.bmi >= 24]
clean_file = clean_file[clean_file.bmi <= 42]
clean_file = clean_file.drop(columns=['work_type', 'Residence_type'])
clean_file 

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...
5096,41512,Male,57.0,0,0,Yes,76.62,28.2,never smoked,0
5100,68398,Male,82.0,1,0,Yes,71.97,28.3,never smoked,0
5106,44873,Female,81.0,0,0,Yes,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,82.99,30.6,never smoked,0


In [16]:
# gave columns more meaningful names
clean_file = clean_file.rename(columns={"id":"ID", "gender":"Gender", "age":"Age", "hypertension":"Hypertension", "heart_disease":"HeartDisease", "ever_married":"EverMarried", "work_type":"WorkType", "Residence_type":"ResidenceType", "avg_glucose_level":"AvgGlucoseLevel", "bmi":"BMI", "smoking_status":"SmokingStatus", "stroke":"Stroke"})
clean_file

Unnamed: 0,ID,Gender,Age,Hypertension,HeartDisease,EverMarried,AvgGlucoseLevel,BMI,SmokingStatus,Stroke
0,9046,Male,67.0,0,1,Yes,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...
5096,41512,Male,57.0,0,0,Yes,76.62,28.2,never smoked,0
5100,68398,Male,82.0,1,0,Yes,71.97,28.3,never smoked,0
5106,44873,Female,81.0,0,0,Yes,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,82.99,30.6,never smoked,0


In [17]:
numbered_file = clean_file
Gender = {'Male': 1,'Female': 2}
numbered_file.Gender = [Gender[item] for item in numbered_file.Gender]
EverMarried = {'Yes': 1,'No': 0}
numbered_file.EverMarried = [EverMarried[item] for item in numbered_file.EverMarried]
SmokingStatus = {'never smoked': 1,'formerly smoked': 2, 'smokes': 3}
numbered_file.SmokingStatus = [SmokingStatus[item] for item in numbered_file.SmokingStatus]
numbered_file

Unnamed: 0,ID,Gender,Age,Hypertension,HeartDisease,EverMarried,AvgGlucoseLevel,BMI,SmokingStatus,Stroke
0,9046,1,67.0,0,1,1,228.69,36.6,2,1
2,31112,1,80.0,0,1,1,105.92,32.5,1,1
3,60182,2,49.0,0,0,1,171.23,34.4,3,1
4,1665,2,79.0,1,0,1,174.12,24.0,1,1
5,56669,1,81.0,0,0,1,186.21,29.0,2,1
...,...,...,...,...,...,...,...,...,...,...
5096,41512,1,57.0,0,0,1,76.62,28.2,1,0
5100,68398,1,82.0,1,0,1,71.97,28.3,1,0
5106,44873,2,81.0,0,0,1,125.20,40.0,1,0
5107,19723,2,35.0,0,0,1,82.99,30.6,1,0


In [18]:
#clean_file.to_csv(r'C:\Users\garre\OneDrive\Desktop\strokepredictor\dataset\cleanedstrokedata.csv', encoding='utf-8', index=False)

In [19]:
y = numbered_file["Stroke"]
target_names = ["negative", "positive"]

In [20]:
X = numbered_file.drop("Stroke", axis=1)
X.head()

Unnamed: 0,ID,Gender,Age,Hypertension,HeartDisease,EverMarried,AvgGlucoseLevel,BMI,SmokingStatus
0,9046,1,67.0,0,1,1,228.69,36.6,2
2,31112,1,80.0,0,1,1,105.92,32.5,1
3,60182,2,49.0,0,0,1,171.23,34.4,3
4,1665,2,79.0,1,0,1,174.12,24.0,1
5,56669,1,81.0,0,0,1,186.21,29.0,2


In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [22]:
# Support vector machine linear classifier
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train, y_train)

SVC(kernel='linear')

In [23]:
# Model Accuracy
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.929


In [24]:
# Calculate classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

    negative       0.94      0.99      0.96       496
    positive       0.33      0.06      0.10        36

    accuracy                           0.93       532
   macro avg       0.63      0.52      0.53       532
weighted avg       0.89      0.93      0.90       532

