In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

In [2]:
# --- 1. Load the Dataset ---
try:
    df = pd.read_csv('/content/healthcare-dataset-stroke-data.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'healthcare-dataset-stroke-data.csv' not found. Please ensure the file is in the correct directory.")
    exit()


print(df.head())

Dataset loaded successfully.
      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never smoked   
2        Private          Rural             105.92  32.5     never smoked   
3        Private          Urban             171.23  34.4           smokes   
4  Self-employed          Rural             174.12  24.0     never smoked   

   stroke  
0       1  
1       1  
2       1  
3       1  
4       1  


In [3]:
# --- 2. Initial Data Cleaning ---
df = df.drop('id', axis=1)
df = df[df['gender']!= 'Other']
print("Initial data cleaning complete.")

Initial data cleaning complete.


In [4]:
# --- 3. Define Features and Target ---
X = df.drop('stroke', axis=1)
y = df['stroke']
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

In [5]:
# --- 4. Create Preprocessing Pipelines ---
numerical_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(random_state=42)),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])
print("Preprocessing pipelines created.")

Preprocessing pipelines created.


In [6]:
# --- 5. Split Data into Training and Testing Sets ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Data split into training ({X_train.shape} rows) and testing ({X_test.shape} rows) sets.")

Data split into training ((4087, 10) rows) and testing ((1022, 10) rows) sets.


In [7]:
# --- 6. Define the Model and Final Pipeline with SMOTE ---
model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42))
])
print("Model pipeline with SMOTE defined.")

Model pipeline with SMOTE defined.


In [8]:
# --- 7. Train the Model ---
print("Training the Random Forest model...")
model_pipeline.fit(X_train, y_train)
print("Model training complete.")

Training the Random Forest model...
Model training complete.


In [9]:
# --- 8. Evaluate the Model ---
print("\n--- Model Evaluation ---")
y_pred = model_pipeline.predict(X_test)


--- Model Evaluation ---


In [10]:
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%\n")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['no stroke', 'stroke']))

Accuracy: 93.44%

Confusion Matrix:
[[950  22]
 [ 45   5]]

Classification Report:
              precision    recall  f1-score   support

   no stroke       0.95      0.98      0.97       972
      stroke       0.19      0.10      0.13        50

    accuracy                           0.93      1022
   macro avg       0.57      0.54      0.55      1022
weighted avg       0.92      0.93      0.93      1022



In [11]:
df_stroke_0 = df[df['stroke'] == 0]
df_stroke_1 = df[df['stroke'] == 1]
n_samples_0 = min(5, len(df_stroke_0))
n_samples_1 = min(5, len(df_stroke_1))
random_samples_0 = df_stroke_0.sample(n=n_samples_0, random_state=42)
random_samples_1 = df_stroke_1.sample(n=n_samples_1, random_state=42)
random_samples = pd.concat([random_samples_0, random_samples_1])
X_random = random_samples.drop('stroke', axis=1)
y_random = random_samples['stroke']
predictions_random = model_pipeline.predict(X_random)
print("Actual vs Predicted values for random samples from the original dataset:")
for i in range(len(random_samples)):
    print(f"Sample {i+1}: Actual = {y_random.iloc[i]}, Predicted = {predictions_random[i]}")

Actual vs Predicted values for random samples from the original dataset:
Sample 1: Actual = 0, Predicted = 0
Sample 2: Actual = 0, Predicted = 0
Sample 3: Actual = 0, Predicted = 0
Sample 4: Actual = 0, Predicted = 0
Sample 5: Actual = 0, Predicted = 0
Sample 6: Actual = 1, Predicted = 0
Sample 7: Actual = 1, Predicted = 1
Sample 8: Actual = 1, Predicted = 1
Sample 9: Actual = 1, Predicted = 1
Sample 10: Actual = 1, Predicted = 1


In [13]:
# --- Get Input from User and Predict ---
user_input = {}
print("Please provide the following information for the prediction:")
features_info = {
    'gender': ' (Male, Female, Other - Note: "Other" was removed during cleaning, so use Male or Female)',
    'age': ' (e.g., 65.0)',
    'hypertension': ' (0 for no, 1 for yes)',
    'heart_disease': ' (0 for no, 1 for yes)',
    'ever_married': ' (Yes or No)',
    'work_type': ' (Govt_job, Never_worked, Private, Self-employed, children)',
    'Residence_type': ' (Rural or Urban)',
    'avg_glucose_level': ' (e.g., 200.0)',
    'bmi': ' (e.g., 35.0)',
    'smoking_status': ' (formerly smoked, never smoked, smokes, Unknown)'
}
for feature, info in features_info.items():
    while True:
        input_value = input(f"Enter value for {feature}{info}: ")
        if feature in ['age', 'avg_glucose_level', 'bmi']:
            try:
                user_input[feature] = float(input_value)
                break
            except ValueError:
                print("Invalid input. Please enter a numerical value.")
        elif feature in ['hypertension', 'heart_disease']:
            if input_value in ['0', '1']:
                user_input[feature] = int(input_value)
                break
            else:
                print("Invalid input. Please enter 0 or 1.")
        else:
            user_input[feature] = input_value
            break
new_input_data_user = pd.DataFrame([user_input])
user_prediction = model_pipeline.predict(new_input_data_user)
user_prediction_proba = model_pipeline.predict_proba(new_input_data_user)
print("\nPrediction for the provided input data:")
if user_prediction[0] == 1:
    print("Predicted: Stroke")
else:
    print("Predicted: No Stroke")

print("\nPrediction probabilities (No Stroke, Stroke):")
print(user_prediction_proba)

Please provide the following information for the prediction:
Enter value for gender (Male, Female, Other - Note: "Other" was removed during cleaning, so use Male or Female): Male
Enter value for age (e.g., 65.0): 67
Enter value for hypertension (0 for no, 1 for yes): 0
Enter value for heart_disease (0 for no, 1 for yes): 1
Enter value for ever_married (Yes or No): Yes
Enter value for work_type (Govt_job, Never_worked, Private, Self-employed, children): Private
Enter value for Residence_type (Rural or Urban): Urban
Enter value for avg_glucose_level (e.g., 200.0): 228.69
Enter value for bmi (e.g., 35.0): 36.6
Enter value for smoking_status (formerly smoked, never smoked, smokes, Unknown): formerly smoked

Prediction for the provided input data:
Predicted: Stroke

Prediction probabilities (No Stroke, Stroke):
[[0.02 0.98]]
