In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

Preparing data set

In [3]:
df = pd.read_csv("stroke_risk_dataset.csv")

In [4]:
df.head()

Unnamed: 0,Chest Pain,Shortness of Breath,Irregular Heartbeat,Fatigue & Weakness,Dizziness,Swelling (Edema),Pain in Neck/Jaw/Shoulder/Back,Excessive Sweating,Persistent Cough,Nausea/Vomiting,High Blood Pressure,Chest Discomfort (Activity),Cold Hands/Feet,Snoring/Sleep Apnea,Anxiety/Feeling of Doom,Age,Stroke Risk (%),At Risk (Binary)
0,0,1,1,1,0,0,0,1,1,1,0,1,1,0,0,54,58.0,1
1,0,0,1,0,0,1,0,0,0,0,1,0,1,1,0,49,40.5,0
2,1,0,0,1,1,1,0,0,1,0,0,0,0,1,0,62,52.0,1
3,1,0,1,1,0,1,1,1,1,1,1,0,0,0,0,48,60.0,1
4,0,0,1,0,0,1,0,1,0,1,1,0,0,1,1,61,56.5,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Chest Pain                      70000 non-null  int64  
 1   Shortness of Breath             70000 non-null  int64  
 2   Irregular Heartbeat             70000 non-null  int64  
 3   Fatigue & Weakness              70000 non-null  int64  
 4   Dizziness                       70000 non-null  int64  
 5   Swelling (Edema)                70000 non-null  int64  
 6   Pain in Neck/Jaw/Shoulder/Back  70000 non-null  int64  
 7   Excessive Sweating              70000 non-null  int64  
 8   Persistent Cough                70000 non-null  int64  
 9   Nausea/Vomiting                 70000 non-null  int64  
 10  High Blood Pressure             70000 non-null  int64  
 11  Chest Discomfort (Activity)     70000 non-null  int64  
 12  Cold Hands/Feet                 

In [6]:
df.describe()

Unnamed: 0,Chest Pain,Shortness of Breath,Irregular Heartbeat,Fatigue & Weakness,Dizziness,Swelling (Edema),Pain in Neck/Jaw/Shoulder/Back,Excessive Sweating,Persistent Cough,Nausea/Vomiting,High Blood Pressure,Chest Discomfort (Activity),Cold Hands/Feet,Snoring/Sleep Apnea,Anxiety/Feeling of Doom,Age,Stroke Risk (%),At Risk (Binary)
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,0.501871,0.496471,0.498886,0.5001,0.503043,0.500557,0.499186,0.503671,0.5008,0.501943,0.500643,0.499557,0.498871,0.500686,0.499871,54.056429,55.558771,0.6492
std,0.5,0.499991,0.500002,0.500004,0.499994,0.500003,0.500003,0.49999,0.500003,0.5,0.500003,0.500003,0.500002,0.500003,0.500004,21.071567,14.300898,0.477224
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,5.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,45.5,0.0
50%,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,54.0,55.5,1.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,72.0,66.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,90.0,100.0,1.0


In [7]:
df.isnull().sum()

Chest Pain                        0
Shortness of Breath               0
Irregular Heartbeat               0
Fatigue & Weakness                0
Dizziness                         0
Swelling (Edema)                  0
Pain in Neck/Jaw/Shoulder/Back    0
Excessive Sweating                0
Persistent Cough                  0
Nausea/Vomiting                   0
High Blood Pressure               0
Chest Discomfort (Activity)       0
Cold Hands/Feet                   0
Snoring/Sleep Apnea               0
Anxiety/Feeling of Doom           0
Age                               0
Stroke Risk (%)                   0
At Risk (Binary)                  0
dtype: int64

In [8]:
# Based on our previous analysis, most columns contain binary data.
# We now inspect each column to confirm it's data is valid.
# Columns such as 'Age' and 'Stroke Risk' are skipped, as they contain values with different numerical ranges.

binary_columns = [col for col in df.columns if col not in ['Age', 'Stroke Risk (%)']]
non_binary_columns = []

for col in binary_columns:
    unique_values = set(df[col].unique())
    if unique_values != {0, 1}:
        non_binary_columns.append({
            'column': col,
            'values': sorted(unique_values)
        })

# Results
if len(non_binary_columns) == 0:
    print("All symptom columns contain only binary values (0 and 1)")
else:
    print(f"Found {len(non_binary_columns)} columns with non-binary values:")
    for issue in non_binary_columns:        
        print(f"   - {issue['column']}: {[int(val) for val in issue['values']]}")

All symptom columns contain only binary values (0 and 1)


In [9]:
# Check Age range
age_min = df['Age'].min()
age_max = df['Age'].max()
print(f"Age range: {age_min} to {age_max} years")

Age range: 18 to 90 years


In [10]:
# Check for outliers in Age column

Q1 = df['Age'].quantile(0.25)
Q3 = df['Age'].quantile(0.75)
IQR = Q3 - Q1

# Define outlier boundaries
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
outliers = df[(df['Age'] < lower_bound) | (df['Age'] > upper_bound)]
print(outliers)

Empty DataFrame
Columns: [Chest Pain, Shortness of Breath, Irregular Heartbeat, Fatigue & Weakness, Dizziness, Swelling (Edema), Pain in Neck/Jaw/Shoulder/Back, Excessive Sweating, Persistent Cough, Nausea/Vomiting, High Blood Pressure, Chest Discomfort (Activity), Cold Hands/Feet, Snoring/Sleep Apnea, Anxiety/Feeling of Doom, Age, Stroke Risk (%), At Risk (Binary)]
Index: []


In [11]:
# Check Stroke Risk percentage range
risk_min = df['Stroke Risk (%)'].min()
risk_max = df['Stroke Risk (%)'].max()
print(f"Stroke Risk range: {risk_min:.2f}% to {risk_max:.2f}%")

Stroke Risk range: 5.00% to 100.00%


In [12]:
# Check for outliers in Stroke Risk (%) column

Q1 = df['Stroke Risk (%)'].quantile(0.25)
Q3 = df['Stroke Risk (%)'].quantile(0.75)
IQR = Q3 - Q1

# Define outlier boundaries
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
outliers = df[(df['Stroke Risk (%)'] < lower_bound) | (df['Stroke Risk (%)'] > upper_bound)]
print(outliers[['Stroke Risk (%)']])


       Stroke Risk (%)
2498             100.0
6423              98.5
8544              98.0
8772             100.0
8962              97.5
...                ...
61361             13.0
64253             13.0
65189             99.5
66866             12.0
68195             11.5

[72 rows x 1 columns]


In [13]:
# Remove outliers found in Stroke Risk (%) column

df = df[(df['Stroke Risk (%)'] >= lower_bound) & (df['Stroke Risk (%)'] <= upper_bound)]

In [14]:
# Scale the Age column to a range between 0 and 1 to make learning easier for the model.  

scaler = MinMaxScaler()
df['Age_Scaled'] = scaler.fit_transform(df[['Age']])
df = df.drop(columns=['Age'])  # Drop the original 'Age' column

In [15]:
# Check for imbalance to decide on splitting method.
print(df['At Risk (Binary)'].value_counts())
print(df['At Risk (Binary)'].value_counts(normalize=True) * 100) # Percentage

At Risk (Binary)
1    45406
0    24522
Name: count, dtype: int64
At Risk (Binary)
1    64.932502
0    35.067498
Name: proportion, dtype: float64


In [16]:
# Stratified splitting to ensure that the proportion of classes in all subsets

# Use Stroke Risk (%) as the target
y = df['Stroke Risk (%)']

# Drop Stroke Risk (%) from the features to avoid data leakage
X = df.drop(columns=['Stroke Risk (%)', 'At Risk (Binary)'])

# Split the data into training (80%) and test (20%) using stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Split the training set into training (80%) and validation (20%) using stratified sampling
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)


# X_train, y_train: Training data
# X_val, y_val: Validation data
# X_test, y_test: Test data

We used stratified sampling when splitting the data, which ensures that the proportions of both classes (1 and 0) in the target column "At Risk (Binary)" stays the same in all splits. This is helpful because the dataset is imbalanced.

Picture this:
Your model learns from one distribution (15/85)
But is tested on a different distribution (30/70)
This makes performance evaluation unreliable

Stratification helps prevent bias toward the majority class by maintaining consistent class proportions across all data splits.

In [17]:
#TODO fix note on this
#3 models: LinearRegression, RandomForestRegressor, GradientBoostingRegressor
# LinearRegression has problem idk

In [18]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define models to evaluate
models = {
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1),
    "Random Forest": RandomForestRegressor(n_estimators=100),
    "Linear Regression": LinearRegression()
}

def evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test):
    # Train
    model.fit(X_train, y_train)
    
    # Validate
    y_val_pred = model.predict(X_val)
    val_mae = mean_absolute_error(y_val, y_val_pred)
    val_mse = mean_squared_error(y_val, y_val_pred)
    val_r2 = r2_score(y_val, y_val_pred)
    
    # Test
    y_test_pred = model.predict(X_test)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    
    return {
        "Validation MAE": val_mae,
        "Validation MSE": val_mse,
        "Validation R²": val_r2,
        "Test MAE": test_mae,
        "Test MSE": test_mse,
        "Test R²": test_r2
    }

# Evaluate all models and store results
results = {}
for name, model in models.items():
    print(f"\nEvaluating {name}:")
    results[name] = evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test)
    
    # Print results for current model
    for metric, value in results[name].items():
        print(f"{metric}: {value:.4f}")

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results).T
print("\nSummary Table:")
print(results_df)


Evaluating Gradient Boosting:
Validation MAE: 1.6041
Validation MSE: 4.0391
Validation R²: 0.9801
Test MAE: 1.5911
Test MSE: 4.0101
Test R²: 0.9802

Evaluating Random Forest:
Validation MAE: 2.2824
Validation MSE: 8.2631
Validation R²: 0.9592
Test MAE: 2.2800
Test MSE: 8.2909
Test R²: 0.9591

Evaluating Linear Regression:
Validation MAE: 0.0000
Validation MSE: 0.0000
Validation R²: 1.0000
Test MAE: 0.0000
Test MSE: 0.0000
Test R²: 1.0000

Summary Table:
                   Validation MAE  Validation MSE  Validation R²  \
Gradient Boosting    1.604135e+00    4.039142e+00       0.980064   
Random Forest        2.282426e+00    8.263078e+00       0.959215   
Linear Regression    1.725630e-13    4.508407e-26       1.000000   

                       Test MAE      Test MSE   Test R²  
Gradient Boosting  1.591073e+00  4.010091e+00  0.980222  
Random Forest      2.280020e+00  8.290874e+00  0.959108  
Linear Regression  1.721832e-13  4.481388e-26  1.000000  


1. MAE (Mean Absolute Error)
What it is: How "wrong" your guesses are on average.
Example:
Your score: 1.59
This means your model is usually 1.59% off (e.g., guessing "58%" when the truth is "56.41%").

2. MSE (Mean Squared Error)
What it is: Punishes big mistakes MORE than small ones.
Example:
Your score: 4.01 (but √4.01 ≈ 2.0)
Your guesses are typically ±2% away from the truth.

3. R² (R-Squared)
How good is the model.

Overfitting
In machine learning, overfitting occurs when an algorithm fits too closely or even exactly to its training data, resulting in a model that can't make accurate predictions or conclusions from any data other than the training data.

Your model is overfitting your training data when you see that the model performs well on the training data but does not perform well on the evaluation data. This is because the model is memorizing the data it has seen and is unable to generalize to unseen examples.