In [38]:
import pandas as pd 

stroke_dft  = pd.read_csv('healthcare_dataset.csv')
stroke_dft.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [39]:
stroke_dft.dtypes


id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [40]:
#  fill in  missing value using the median value of the column
stroke_dft['bmi'] = stroke_dft['bmi'].fillna(stroke_dft['bmi'].median())
stroke_dft['bmi_category'] = stroke_dft['bmi'].apply(lambda bmi: 'Underweight' if bmi < 20   else 'Normal' if bmi < 25 
    else 'Overweight' if bmi < 30     else 'Obese' )
stroke_dft['age_catergory'] = stroke_dft['age'].apply( lambda age: 'Child' if age < 18   else 'Young_Adult' if age < 35 
    else 'Adult' if age < 60    else 'Senior')
stroke_dft['glucose_category'] = stroke_dft['avg_glucose_level'].apply( lambda gl: 'Low' if gl < 70  else 'Normal' if gl <= 140 
    else 'Prediabetic' if gl<= 200    else 'Diabetic' )
stroke_dft['ever_married_bool'] = stroke_dft['ever_married'].map({'Yes': 1, 'No': 0})
stroke_dft['residence_type_bool'] = stroke_dft['Residence_type'].map({'Urban': 1, 'Rural': 0})
stroke_dft.drop(columns=['id'], inplace=True)


stroke_dft_cleaned = pd.get_dummies(stroke_dft, columns=['gender','age_catergory','bmi_category','work_type','glucose_category','smoking_status'], prefix=['g','age','bmi','work_type','gl_catergory','smoking_status'])

# Save cleaned dataset for testing purposes 
stroke_dft_cleaned.to_csv("stroke_data_cleaned.csv", index=False)


In [41]:
print("age_catergory",stroke_dft['age_catergory'].unique().tolist())
print("age_catergory",stroke_dft['age_catergory'].nunique())
print("bmi_category",stroke_dft['bmi_category'].unique().tolist())
print("bmi_category",stroke_dft['bmi_category'].nunique())
print("gender",stroke_dft['gender'].unique().tolist())
print("gender",stroke_dft['gender'].nunique())
print("ever_married",stroke_dft['ever_married'].unique().tolist())
print("ever_married",stroke_dft['ever_married'].nunique())
print("work_type",stroke_dft['work_type'].unique().tolist())
print("work_type",stroke_dft['work_type'].nunique())
print("Residence_type",stroke_dft['Residence_type'].unique().tolist())
print("Residence_type",stroke_dft['Residence_type'].nunique())
print("smoking_status",stroke_dft['smoking_status'].unique().tolist())
print("smoking_status",stroke_dft['smoking_status'].nunique())


age_catergory ['Senior', 'Adult', 'Child', 'Young_Adult']
age_catergory 4
bmi_category ['Obese', 'Overweight', 'Normal', 'Underweight']
bmi_category 4
gender ['Male', 'Female', 'Other']
gender 3
ever_married ['Yes', 'No']
ever_married 2
work_type ['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked']
work_type 5
Residence_type ['Urban', 'Rural']
Residence_type 2
smoking_status ['formerly smoked', 'never smoked', 'smokes', 'Unknown']
smoking_status 4


Multiple Variable Linear Regression testing .

In [42]:
avoid_columns_regression  = [ 
 'ever_married',
 'Residence_type']
reg_analysis_dft  = stroke_dft_cleaned
reg_analysis_dft.drop(columns=avoid_columns_regression, inplace=True)
reg_analysis_dft.head(5)



Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,ever_married_bool,residence_type_bool,g_Female,g_Male,...,work_type_Self-employed,work_type_children,gl_catergory_Diabetic,gl_catergory_Low,gl_catergory_Normal,gl_catergory_Prediabetic,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,1,1,False,True,...,False,False,True,False,False,False,False,True,False,False
1,61.0,0,0,202.21,28.1,1,1,0,True,False,...,True,False,True,False,False,False,False,False,True,False
2,80.0,0,1,105.92,32.5,1,1,0,False,True,...,False,False,False,False,True,False,False,False,True,False
3,49.0,0,0,171.23,34.4,1,1,1,True,False,...,False,False,False,False,False,True,False,False,False,True
4,79.0,1,0,174.12,24.0,1,1,0,True,False,...,True,False,False,False,False,True,False,False,True,False


In [43]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, classification_report
from sklearn.metrics import mean_absolute_error, r2_score

y_axis  = reg_analysis_dft['stroke']
reg_analysis_dft.drop(columns=['stroke'], inplace=True)
x_axis = reg_analysis_dft.copy()

scaler = StandardScaler()
num_cols = ['age', 'avg_glucose_level', 'bmi']
x_axis[num_cols] = scaler.fit_transform(x_axis[num_cols])

# Split data: train, validation, test (70/15/15)
X_train, X_temp, y_train, y_temp = train_test_split(x_axis,y_axis, test_size=0.3, stratify=y_axis, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

mlr_model = LinearRegression()
mlr_model.fit(X_train, y_train)


val_preds = mlr_model.predict(X_val)
test_preds = mlr_model.predict(X_test)

# Convert to binary classification
val_preds_binary = np.round(val_preds)
test_preds_binary = np.round(test_preds)

val_rmse = mean_squared_error(y_val, val_preds, squared=False)
test_rmse = mean_squared_error(y_test, test_preds, squared=False)

print(f"Validation RMSE: {val_rmse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")


print("Validation Report:")
print(classification_report(y_val, val_preds_binary))

print("Test Report:")
print(classification_report(y_test, test_preds_binary))

#  Overfitting Check
if abs(val_rmse - test_rmse) < 0.02:
    print(" No overfitting detected: validation and test performance are consistent.")
else:
    print(" Potential overfitting: consider regularization or more data.")


Validation RMSE: 0.2040
Test RMSE: 0.2089
Validation Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98       729
           1       0.00      0.00      0.00        37

    accuracy                           0.95       766
   macro avg       0.48      0.50      0.49       766
weighted avg       0.91      0.95      0.93       766

Test Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       729
           1       0.00      0.00      0.00        38

    accuracy                           0.95       767
   macro avg       0.48      0.50      0.49       767
weighted avg       0.90      0.95      0.93       767

 No overfitting detected: validation and test performance are consistent.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [44]:
new_person_raw = {
    'age': 67,
    'avg_glucose_level': 228.69,
    'bmi': 36.6,
    'hypertension': 0,
    'heart_disease': 1,
    'ever_married_bool': 1,
    'residence_type_bool': 1,
    'age_Adult': 0,
    'age_Child': 0,
    'age_Senior': 1,
    'age_Young_Adult': 0,
    'bmi_Normal': 0,
    'bmi_Obese': 1,
    'bmi_Overweight': 0,
    'bmi_Underweight': 0,
    'work_type_Govt_job': 0,
    'work_type_Never_worked': 0,
    'work_type_Private': 1,
    'work_type_Self-employed': 0,
    'work_type_children': 0,
    'gl_catergory_Diabetic': 1,
    'gl_catergory_Low': 0,
    'gl_catergory_Normal': 0,
    'gl_catergory_Prediabetic': 0,
    'smoking_status_Unknown': 0,
    'smoking_status_formerly smoked': 1,
    'smoking_status_never smoked': 0,
    'smoking_status_smokes': 0,
    'g_Male': 1  ,
    'g_Female': 0,
    'g_Other': 0 
}

# ✅ Prepare input array
raw_input = pd.DataFrame([new_person_raw])
raw_input[['age', 'avg_glucose_level', 'bmi']] = scaler.transform(raw_input[['age', 'avg_glucose_level', 'bmi']])

# 🔮 Predict stroke
pred =mlr_model.predict(raw_input)[0]
risk = "⚠️ High risk of stroke" if pred >= 0.5 else "✅ Low risk of stroke"
print(f"🔍 Predicted stroke risk score: {pred:.4f} → {risk}")

ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.
