# Stroke prediction using model SVM

### Importing libraries

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

### Importing Dataset

In [2]:
df = pd.read_csv("D:\\Learn\\Uni\\ML\\CapstoneProject\\data\\healthcare-dataset-stroke-data.csv")

In [3]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5104 entries, 0 to 5103
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5104 non-null   int64  
 1   gender             5104 non-null   object 
 2   age                5104 non-null   float64
 3   hypertension       5104 non-null   int64  
 4   heart_disease      5104 non-null   int64  
 5   ever_married       5104 non-null   object 
 6   work_type          5104 non-null   object 
 7   Residence_type     5104 non-null   object 
 8   avg_glucose_level  5104 non-null   float64
 9   bmi                4903 non-null   float64
 10  smoking_status     5104 non-null   object 
 11  stroke             5104 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 478.6+ KB


### Filling null values

In [5]:
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5104.0,5104.0,5104.0,5104.0,5104.0,4903.0,5104.0
mean,36496.701215,43.213754,0.097375,0.054075,106.13758,28.891862,0.048393
std,21161.207853,22.621771,0.296496,0.226188,45.282155,7.858238,0.214617
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17737.5,25.0,0.0,0.0,77.2375,23.5,0.0
50%,36898.5,45.0,0.0,0.0,91.865,28.0,0.0
75%,54625.75,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [6]:
df['bmi'].fillna(df['bmi'].mean(), inplace = True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5104 entries, 0 to 5103
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5104 non-null   int64  
 1   gender             5104 non-null   object 
 2   age                5104 non-null   float64
 3   hypertension       5104 non-null   int64  
 4   heart_disease      5104 non-null   int64  
 5   ever_married       5104 non-null   object 
 6   work_type          5104 non-null   object 
 7   Residence_type     5104 non-null   object 
 8   avg_glucose_level  5104 non-null   float64
 9   bmi                5104 non-null   float64
 10  smoking_status     5104 non-null   object 
 11  stroke             5104 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 478.6+ KB


### Drop data unnecessary columms

In [8]:
df = df.drop('id', axis=1)

In [9]:
from sklearn.calibration import LabelEncoder


le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])
df['ever_married'] = le.fit_transform(df['ever_married'])
df['work_type'] = le.fit_transform(df['work_type'])
df['Residence_type'] = le.fit_transform(df['Residence_type'])
df['smoking_status'] = le.fit_transform(df['smoking_status'])

### Dividing data into x and y

In [10]:
x = df.iloc[:,:-1]
y = df.iloc[:,-1]

### Handle imbalance dataset

In [11]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
x, y = oversample.fit_resample(x, y)

### Splitting dataset into training and testing data

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size= 0.2, random_state=1)

# Training

In [14]:
from sklearn.svm import SVC

In [15]:
model = SVC(kernel='rbf', C=3, gamma=0.2)

In [16]:
model.fit(x_train, y_train)

In [17]:
y_pred = model.predict(x_test)

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, f1_score

In [19]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96       964
           1       0.96      0.97      0.97       979

    accuracy                           0.97      1943
   macro avg       0.97      0.96      0.96      1943
weighted avg       0.97      0.97      0.97      1943



In [20]:
print("Training accuracy on XGBoost: ", model.score(x_train, y_train)*100)

Training accuracy on XGBoost:  99.90992150302407


In [21]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score

cm = confusion_matrix(y_test, model.predict(x_test))
print("Confusion Matrix: \n", cm)
print("Model testing accuracy:", ((cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1]))*100, "%", sep=' ')
print(f'Precision Score: {precision_score(y_test, model.predict(x_test))*100:.2f}%')
print(f'Recall Score: {recall_score(y_test, model.predict(x_test))*100:.2f}%')
print(f'F1 Score: {f1_score(y_test, model.predict(x_test))*100:.2f}%')
print()

Confusion Matrix: 
 [[927  37]
 [ 31 948]]
Model testing accuracy: 96.50025733401955 %
Precision Score: 96.24%
Recall Score: 96.83%
F1 Score: 96.54%

