In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pickle

In [None]:
# Read the stroke records from the CSV file into the working DataFrame
df = pd.read_csv(filepath_or_buffer='stroke1.csv')

In [None]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [None]:
# Pre-processing: summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,6624.0,6624.0,6624.0,6624.0,6624.0,6378.0,6624.0
mean,36596.990036,42.816202,0.093297,0.050725,105.621046,28.903888,0.037591
std,21119.832392,22.57274,0.29087,0.219451,44.915845,7.865558,0.190218
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17891.0,25.0,0.0,0.0,77.12,23.5,0.0
50%,37089.5,44.0,0.0,0.0,91.55,28.1,0.0
75%,54620.0,60.0,0.0,0.0,113.355,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [None]:
# Categorical Data to Numeric
category_data = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# Keep every fitted encoder so the label -> code mapping can be inspected
# (and reused later) instead of being thrown away after the transform.
label_encoders = {}
for column in category_data:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Print the mapping that was actually learned. LabelEncoder numbers the
# classes in sorted order of the raw labels, so the printed mapping is the
# authoritative one.
for column, encoder in label_encoders.items():
    print(column, '->', {str(label): code for code, label in enumerate(encoder.classes_)})

# Codes confirmed by comparing the raw and encoded previews of this notebook:
#   Smoking status  -> formerly smoked = 1, never smoked = 2, smokes = 3
#   Residence       -> Urban = 1, Rural = 0
#   Gender          -> Male = 1, Female = 0
#   Work type       -> Private = 2, Self-employed = 3
#   Marriage status -> Yes = 1
# NOTE(review): the codes for the remaining labels (e.g. Unknown, Other,
# Govt job, children, never worked) are not visible in the previews -- read
# them from the mapping printed above rather than assuming them.

# Last expression of the cell, so the encoded frame is what gets displayed
df.head()

' Smoking Status -> formerly smoked = 1, never smoked = 2, smokes = 3, unknown = 0\n    Residence -> Urban = 1, Rural = 0\n    Gender -> Male = 1, Female = 0, Other = 2\n    Work Type -> Private = 2, Self-Employed = 3, Govtjob = 0, children = 1, never_worked = 4\n    marriage status -> Married = 1, Single = 0\n '

In [None]:
# Check for null values: show the number of missing entries per column
# instead of dumping a True/False cell for every row of the frame
df.isnull().sum()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
6619,False,False,False,False,False,False,False,False,False,False,False,False
6620,False,False,False,False,False,False,False,False,False,False,False,False
6621,False,False,False,False,False,False,False,False,False,False,False,False
6622,False,False,False,False,False,False,False,False,False,False,False,False


In [None]:
# Replace missing BMI values with the column mean.
# Series.mean() skips NaN by default, so the mean is computed over the
# observed values only.
mean = df['bmi'].mean()
# Assign the filled column back to the frame: calling fillna(inplace=True) on
# the df['bmi'] selection is chained assignment, which newer pandas versions
# warn about and which does not update df under Copy-on-Write.
df['bmi'] = df['bmi'].fillna(mean)
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,51676,0,61.0,0,0,1,3,0,202.21,28.903888,2,1
2,31112,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,60182,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,1665,0,79.0,1,0,1,3,0,174.12,24.0,2,1


In [None]:
# Remove the 'id' column from the frame
df = df.drop('id', axis=1)

In [None]:
# Min-max scale the selected columns into the 0-1 range.
# Note: the scaler is fitted on the whole frame, i.e. before the
# train/test split performed further down.
cols_to_normalize = ['age', 'work_type', 'avg_glucose_level', 'bmi', 'smoking_status']
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(df[cols_to_normalize])
df[cols_to_normalize] = scaler.transform(df[cols_to_normalize])

In [None]:
# Separate the predictors (every column except 'stroke') from the target
x = df.drop('stroke', axis=1)
y = df.loc[:, 'stroke']

In [None]:
# Split train-dataset (80%) and test-data (20%).
# random_state makes the split (and every score below) reproducible across
# kernel restarts; stratify=y keeps the rare positive class (about 3.8% of
# the rows according to the describe() output) equally represented in both
# partitions.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Build a logistic-regression classifier with default settings and fit it
# on the training partition
logistic_model = LogisticRegression()
logistic_model.fit(X=x_train, y=y_train)

In [None]:
# Class predictions of the logistic-regression model for the held-out rows
y_predict_logistic = logistic_model.predict(X=x_test)

In [None]:
# Share of held-out rows the logistic-regression model classified correctly.
# Keep in mind that only about 3.8% of all rows are stroke cases (see the
# describe() output), so a high accuracy alone says little about how well
# strokes are detected.
accuracy_logistic = accuracy_score(y_true=y_test, y_pred=y_predict_logistic)
print("Logistic Regression Accuracy:", accuracy_logistic)

Logistic Regression Accuracy: 0.9584905660377359


In [None]:
# Random Forest Model
# A random forest draws bootstrap samples and feature subsets at random;
# fixing random_state makes the fitted model and its scores reproducible.
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(x_train, y_train)

In [None]:
# Class predictions of the random-forest model for the held-out rows
y_predict_rf = random_forest_model.predict(X=x_test)

In [None]:
# Share of held-out rows the random-forest model classified correctly
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_predict_rf)
print("Random Forest Accuracy:", accuracy_rf)

Random Forest Accuracy: 0.9562264150943396


In [None]:
# Support-vector classifier with default settings, trained on the training
# partition
svm_model = SVC()
svm_model.fit(X=x_train, y=y_train)

# Class predictions of the SVM for the held-out rows
y_predict_svm = svm_model.predict(X=x_test)

# Share of held-out rows the SVM classified correctly
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_predict_svm)
print("SVM Accuracy:", accuracy_svm)

SVM Accuracy: 0.9584905660377359


In [None]:
# Create a new DataFrame for input reference.
# This is an encoded but *un-normalised* copy of the data; the prediction
# code reads the raw min/max/mean of its columns to rescale user input.
datas = pd.read_csv('stroke1.csv')

# Change categorical-data to numerical-data (same columns as for training)
for data in category_data:
    datas[data] = LabelEncoder().fit_transform(datas[data])

# Replace null BMI values with the column mean. The result is assigned back
# to the frame because fillna(inplace=True) on the datas['bmi'] selection is
# chained assignment, which newer pandas versions warn about and which does
# not update the frame under Copy-on-Write.
mean = datas['bmi'].mean()
datas['bmi'] = datas['bmi'].fillna(mean)

datas = datas.drop(columns='id')
# Show the first few rows of the DataFrame to check the changes
datas.head()

   gender   age  hypertension  heart_disease  ever_married  work_type  \
0       1  67.0             0              1             1          2   
1       0  61.0             0              0             1          3   
2       1  80.0             0              1             1          2   
3       0  49.0             0              0             1          2   
4       0  79.0             1              0             1          3   

   Residence_type  avg_glucose_level        bmi  smoking_status  stroke  
0               1             228.69  36.600000               1       1  
1               0             202.21  28.903888               2       1  
2               0             105.92  32.500000               2       1  
3               1             171.23  34.400000               3       1  
4               0             174.12  24.000000               2       1  


In [None]:
# Largest BMI value in the reference frame
max(datas['bmi'].tolist())

97.6

In [None]:
# Function for prediction
def prediction(g, a, hyt, ht, m, w, r, gl, b, s):
    """Predict stroke risk for one person with the two trained models.

    The answers are encoded and scaled like the training frame: categorical
    answers become label codes, and the columns that were min-max
    normalised for training (age, work_type, avg_glucose_level, bmi,
    smoking_status) are rescaled to the 0-1 range using the column ranges of
    the un-normalised reference frame ``datas``.

    Relies on the notebook globals ``datas``, ``logistic_model`` and
    ``random_forest_model``.

    Parameters
    ----------
    g : str
        Gender; "male" -> 1, "female" -> 0, anything else -> 2.
    a : int or float
        Age.
    hyt, ht, m : str
        "yes"/"no" answers for hypertension, heart disease and ever married
        (the numbers 1/0 are accepted as well).
    w : str
        Work type: "government", "student", "private", "self-employed";
        any other value is treated as the remaining category.
    r : str
        Residence type; "urban" -> 1, anything else -> 0.
    gl : str, int or float
        Average glucose level, or "i do not know" to use the dataset mean.
    b : int or float
        BMI.
    s : str
        Smoking status: "unknown", "formerly smoked", "never smoked" or
        "smokes".

    Returns
    -------
    tuple
        ``(p, label)`` where ``p`` is the logistic-regression probability of
        class 1 (stroke) and ``label`` is the random-forest class prediction.

    Raises
    ------
    ValueError
        If a yes/no answer or the smoking status is not recognised, or the
        glucose level cannot be converted to a number.
    """
    # Column order of the training features (the frame without id / stroke)
    feature_names = [
        'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
        'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
        'smoking_status',
    ]

    def scale(value, column):
        """Min-max scale ``value`` with the range of ``datas[column]``."""
        low = datas[column].min()
        high = datas[column].max()
        return (value - low) / (high - low)

    def yes_no(answer, field):
        """Translate a yes/no answer into 1/0, rejecting anything else."""
        if answer in ("yes", 1):
            return 1
        if answer in ("no", 0):
            return 0
        raise ValueError(f"{field} must be 'yes' or 'no', got {answer!r}")

    # Gender (not normalised during training, so the raw code is used)
    g = {"male": 1, "female": 0}.get(g, 2)

    # Age
    a = scale(a, 'age')

    # Hypertension / heart issues / married
    hyt = yes_no(hyt, "hypertension")
    ht = yes_no(ht, "heart disease")
    m = yes_no(m, "married")

    # Work-type: label code first, then the same 0-1 scaling the training
    # column received (passing the raw code 0-4 would be out of range).
    # NOTE(review): codes 1 ("student") and 4 (everything else) follow the
    # mapping documented in this notebook; confirm them against the fitted
    # LabelEncoder classes.
    work_codes = {"government": 0, "student": 1, "private": 2, "self-employed": 3}
    w = scale(work_codes.get(w, 4), 'work_type')

    # Residence-type
    r = 1 if r == "urban" else 0

    # Glucose-levels: fall back to the dataset mean when unknown. Either way
    # the raw value is scaled afterwards, and float() accepts decimal input
    # such as "105.9".
    if gl == "i do not know":
        gl = datas['avg_glucose_level'].mean()
    else:
        gl = float(gl)
    gl = scale(gl, 'avg_glucose_level')

    # BMI
    b = scale(b, 'bmi')

    # Smoking: codes as produced by the label encoding of the training data
    # (formerly smoked = 1, never smoked = 2), then scaled like the
    # training column.
    smoking_codes = {"unknown": 0, "formerly smoked": 1, "never smoked": 2, "smokes": 3}
    if s not in smoking_codes:
        raise ValueError(
            "smoking status must be one of "
            f"{sorted(smoking_codes)}, got {s!r}"
        )
    s = scale(smoking_codes[s], 'smoking_status')

    # A one-row frame with the training column names keeps the feature order
    # explicit and matches the named columns the models were fitted on.
    features = pd.DataFrame([[g, a, hyt, ht, m, w, r, gl, b, s]], columns=feature_names)

    pred_logistic_prob = logistic_model.predict_proba(features)[0, 1]
    pred_rf = random_forest_model.predict(features)

    return pred_logistic_prob, pred_rf[0]

# Input from user.
# Text answers are lower-cased because prediction() compares them against
# lower-case keywords. Age and BMI are parsed as floats so that decimal
# values such as 27.5 are accepted.
g = input("Enter your gender: ").lower()
a = float(input("Enter your age: "))
hyt = input("Do you have hypertension? yes or no: ").lower()
ht = input("Do you have any heart disease? yes or no: ").lower()
m = input("Have you been married? yes or no: ").lower()
w = input("Work type? private/self-employed/student/government/others: ").lower()
r = input("Residency type? rural or urban? ").lower()
gl = input('Enter glucose levels. Enter value or type "i do not know": ').lower()
b = float(input("Enter BMI: "))
s = input("Smoking Status => unknown/never smoked/formerly smoked/smokes: ").lower()

# Call prediction function
op_logistic_prob, op_rf = prediction(g, a, hyt, ht, m, w, r, gl, b, s)

# Thresholds on the predicted probability of the stroke class (class 1):
# a HIGHER probability means a HIGHER risk.
risk_threshold = 0.1
significant_risk_threshold = 0.5

# Classify predictions based on thresholds, from highest risk to lowest
if op_logistic_prob >= significant_risk_threshold:
    print("Logistic Regression Prediction: Patient has a significant risk of stroke")
elif op_logistic_prob >= risk_threshold:
    print("Logistic Regression Prediction: Person has chances of having a stroke")
else:
    print("Logistic Regression Prediction: Patient has no risk of stroke")

if op_rf == 1:
    print("Random Forest Prediction: Person has chances of having a stroke")
else:
    print("Random Forest Prediction: Patient has no risk of stroke")

Logistic Regression Prediction: Person has chances of having a stroke
Random Forest Prediction: Patient has no risk of stroke




In [None]:
# Persist both fitted classifiers as pickle files.
# NOTE(review): only the models are written here; whoever loads them has to
# reproduce the encoding and 0-1 scaling applied to the training features.
model_files = {
    'logistic_model.pkl': logistic_model,
    'random_forest_model.pkl': random_forest_model,
}
for path, model in model_files.items():
    with open(path, 'wb') as handle:
        pickle.dump(model, handle)