In [26]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the source CSV into a DataFrame
dataset_path = 'dataset.csv'  # Adjust path if necessary
data = pd.read_csv(filepath_or_buffer=dataset_path)

In [None]:
# Data Preprocessing
# Missing-value policy, applied column by column: gaps in 'bmi' take the
# column median, gaps in 'smoking_status' become the explicit label 'Unknown'.
fill_values = {'bmi': data['bmi'].median(), 'smoking_status': 'Unknown'}
for column_name, fill_value in fill_values.items():
    data[column_name] = data[column_name].fillna(fill_value)

In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, so that level is represented by all-zero dummies.
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Any NaN still left in a numeric column is replaced by that column's median
numeric_medians = data_encoded.median(numeric_only=True)
data_encoded = data_encoded.fillna(numeric_medians)

In [None]:
# Separate features and target variable.
# 'id' is dropped together with the target so it is not used as a predictor.
X = data_encoded.drop(columns=['id', 'stroke'])
y = data_encoded['stroke']

# Split data into training and testing sets.
# stratify=y keeps the class ratio of 'stroke' the same in both partitions;
# the classes are imbalanced (hence SMOTE below), so an unstratified split
# could leave the test set with a skewed share of positive cases.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fill any remaining NaN values before SMOTE. The medians are computed on the
# training split only and then applied to both splits, so train and test are
# imputed consistently without using test-set statistics.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Apply SMOTE to balance the classes in the training set only; the test set
# is left at its original class ratio.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Train a Random Forest Classifier on the balanced training data
model = RandomForestClassifier(random_state=42)
model.fit(X_train_balanced, y_train_balanced)

# Predict on the (untouched, non-resampled) test set
y_pred = model.predict(X_test)

In [None]:
# Evaluation metrics.
# The label set is pinned to [0, 1] (0 = No Stroke, 1 = Stroke) so the
# confusion matrix is always 2x2 in that order, even if the test labels or the
# predictions happen to contain a single class. The heatmap cell further down
# hard-codes two tick labels and relies on this shape.
class_labels = [0, 1]
classification_rep = classification_report(y_test, y_pred, labels=class_labels)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
print(conf_matrix)


In [None]:
# Display Classification Report.
# The heading and the report are printed separately: passing both to a single
# print() call inserts the separator space in front of the report's first
# line, which shifts its column header out of alignment with the rows.
print("Classification Report:")
print(classification_rep)


In [None]:
# Confusion-matrix heatmap: rows are the actual classes, columns the predicted ones
class_names = ['No Stroke', 'Stroke']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix for Stroke Prediction with SMOTE')
plt.show()

In [None]:
# Pair each feature name with the fitted model's importance score and chart
# the ten highest as a horizontal bar plot
feature_importances = pd.Series(data=model.feature_importances_, index=X.columns)
top_ten = feature_importances.nlargest(10)
ax = top_ten.plot.barh()
ax.set_title('Top 10 Feature Importances')
plt.show()

In [None]:
# Example input for a single prediction.
# One record keyed by feature name; categorical fields are given as 0/1
# dummy columns in the same style as data_encoded.
patient_record = {
    # Numeric / binary clinical fields
    'age': 45,
    'hypertension': 1,
    'heart_disease': 0,
    'avg_glucose_level': 105.3,
    'bmi': 28.7,
    # Dummy-encoded categorical fields
    'gender_Male': 1,
    'ever_married_Yes': 1,
    'work_type_Self-employed': 0,
    'work_type_Private': 1,
    'work_type_Govt_job': 0,
    'work_type_children': 0,
    'Residence_type_Urban': 1,
    'smoking_status_formerly smoked': 0,
    'smoking_status_never smoked': 1,
    'smoking_status_smokes': 0,
}
# A one-element list of records yields a single-row DataFrame
new_data = pd.DataFrame([patient_record])

In [None]:
# Align new_data with the feature layout of X in one step: columns of X that
# are absent here are added and filled with 0, columns that X does not have
# are dropped, and the result follows X's column order.
new_data = new_data.reindex(columns=X.columns, fill_value=0)

# Predict the class label
prediction = model.predict(new_data)
print("Prediction for new data (1 = Stroke, 0 = No Stroke):", prediction[0])

# Predict the stroke probability as well; the hard label alone does not show
# how confident the model is. predict_proba columns follow model.classes_, so
# the column for class 1 is looked up rather than assumed to be the last one.
stroke_column = list(model.classes_).index(1)
stroke_probability = model.predict_proba(new_data)[0, stroke_column]
print(f"Estimated stroke probability: {stroke_probability:.3f}")