    Predict the Stroke in the given data set and apply all available kernels in SVC model
    1- prepare the data set according to need (numeric)
    2- let us know the which kernel is best for such application

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split ,GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler


In [12]:
data = pd.read_csv("../DataSets/healthcare-dataset-stroke-data.csv")
data.drop("id",axis=1,inplace=True)
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
## check for missing values to Clean and preprocess data

In [13]:
print("Duplicated Values",data.duplicated().sum())
data.fillna(data["bmi"].mean(),inplace = True)

Duplicated Values 0


In [14]:
df = data.dropna()
#now check for duplicates
df.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [15]:
df["stroke"].value_counts()

0    4861
1     249
Name: stroke, dtype: int64

In [16]:
data.head(1)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1


In [17]:
# Separate features and target
X = df.drop("stroke", axis=1)
Y = df["stroke"]

category = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]
numerical = ["age", "avg_glucose_level", "bmi"]

categ_trans = Pipeline(steps=[("encoder", OneHotEncoder(handle_unknown='ignore'))])
numeri_trans = Pipeline(steps=[("scaling", StandardScaler())])

preprocessor = ColumnTransformer([
    ("numerical", numeri_trans, numerical),
    ("category", categ_trans, category)
], remainder="passthrough")

# Apply oversampling to handle imbalanced data
oversampler = RandomOverSampler(random_state=42)
X_resampled, Y_resampled = oversampler.fit_resample(X, Y)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("estimator", SVC())
])

x_train, x_test, y_train, y_test = train_test_split(X_resampled, Y_resampled, test_size=0.2, random_state=42)
pipeline.fit(x_train, y_train)

# Check accuracy and F1 score
print("Accuracy:", pipeline.score(x_test, y_test))
y_preds = pipeline.predict(x_test)
print("F1 Score:", f1_score(y_test, y_preds))




Accuracy: 0.8457583547557841
F1 Score: 0.85781990521327


In [50]:
params = {
    "estimator__kernel": ["rbf","linear","poly","sigmoid"],
    "estimator__C":[1,10],
    "estimator__gamma":["scale","auto"]
}

best_grid = GridSearchCV(pipeline , params ,cv = 5 ,verbose=2)
best_grid.fit(x_train ,y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END estimator__C=1, estimator__gamma=scale, estimator__kernel=rbf; total time=   2.0s
[CV] END estimator__C=1, estimator__gamma=scale, estimator__kernel=rbf; total time=   2.0s
[CV] END estimator__C=1, estimator__gamma=scale, estimator__kernel=rbf; total time=   1.8s
[CV] END estimator__C=1, estimator__gamma=scale, estimator__kernel=rbf; total time=   1.9s
[CV] END estimator__C=1, estimator__gamma=scale, estimator__kernel=rbf; total time=   1.8s
[CV] END estimator__C=1, estimator__gamma=scale, estimator__kernel=linear; total time=   1.7s
[CV] END estimator__C=1, estimator__gamma=scale, estimator__kernel=linear; total time=   1.6s
[CV] END estimator__C=1, estimator__gamma=scale, estimator__kernel=linear; total time=   1.6s
[CV] END estimator__C=1, estimator__gamma=scale, estimator__kernel=linear; total time=   1.8s
[CV] END estimator__C=1, estimator__gamma=scale, estimator__kernel=linear; total time=   1.6s
[CV] END estim

In [54]:
model = best_grid.best_estimator_
# Check accuracy and F1 score
print("Accuracy:", model.score(x_test, y_test))
y_preds = model.predict(x_test)
print("F1 Score:", f1_score(y_test, y_preds))

Accuracy: 0.9148936170212766
F1 Score: 0.9211822660098522


In [38]:
Y_resampled.value_counts()

1    4700
0    4700
Name: stroke, dtype: int64

In [39]:
# Prompt the user for input
gender = input("Enter the Gender: ")
age = float(input("Enter the Age: "))
hypertension = int(input("Hypertension (1 for Yes, 0 for No): "))
heart_disease = int(input("Heart Disease (1 for Yes, 0 for No): "))
ever_married = input("Ever Married (yes/no): ")
work_type = input("Enter the Work Type: ")
residence_type = input("Residence Type (urban/rural): ")
avg_glucose_level = float(input("Enter the Average Glucose Level: "))
bmi = float(input("Enter the BMI: "))
smoking_status = input("Smoking Status: ")


# Create a DataFrame with the user input
data = pd.DataFrame({
    'gender': [gender],
    'age': [age],
    'hypertension': [hypertension],
    'heart_disease': [heart_disease],
    'ever_married': [ever_married],
    'work_type': [work_type],
    'Residence_type': [residence_type],
    'avg_glucose_level': [avg_glucose_level],
    'bmi': [bmi],
    'smoking_status': [smoking_status]
    
})

# Predict the data
transformed_data = pipeline["preprocessor"].transform(data)
prediction = pipeline["estimator"].predict(transformed_data)

print("The Label is:", prediction[0])

Enter the Gender: Male
Enter the Age: 67
Hypertension (1 for Yes, 0 for No): 0
Heart Disease (1 for Yes, 0 for No): 1
Ever Married (yes/no): Yes
Enter the Work Type: Private
Residence Type (urban/rural): Urban
Enter the Average Glucose Level: 228.69
Enter the BMI: 36.6
Smoking Status: formerly smoked
The Label is: 1
