In [32]:
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

theme_colors = ['#E9EAEC', '#FAD02C', '#90ADC6', '#333652']

In [33]:
!pip install imbalanced-learn





[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [34]:
df=pd.read_csv(r"C:\COLLEGE\last sem\dataset\healthcare-dataset-stroke-data.csv")

In [35]:
df = df.drop(['id'], axis=1) # removes the specified column of 'id'
df['age'] = df['age'].astype('int') # cast the panda object 'age' to a specified data type 'int'

In [36]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,Female,81,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [37]:
df.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,43.215264,0.097456,0.054012,106.147677,28.893237,0.048728
std,22.633866,0.296607,0.226063,45.28356,7.854067,0.21532
min,0.0,0.0,0.0,55.12,10.3,0.0
25%,25.0,0.0,0.0,77.245,23.5,0.0
50%,45.0,0.0,0.0,91.885,28.1,0.0
75%,61.0,0.0,0.0,114.09,33.1,0.0
max,82.0,1.0,1.0,271.74,97.6,1.0


In [38]:
df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [39]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,Female,81,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [40]:
df = df[df['gender'] != 'Other']

# CORRELATION MATRIX

In [41]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [42]:
print(df.dtypes)


gender                object
age                    int64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object


In [43]:
import pandas as pd
import plotly.graph_objects as go

# Convert binary categorical columns
df_encoded = df.copy()
df_encoded['ever_married'] = df_encoded['ever_married'].map({'Yes': 1, 'No': 0})
df_encoded['Residence_type'] = df_encoded['Residence_type'].map({'Urban': 1, 'Rural': 0})

# one-hot encoding
df_encoded = pd.get_dummies(df_encoded, columns=['work_type', 'smoking_status'], drop_first=True)

df_encoded = df_encoded.apply(pd.to_numeric, errors='coerce')

df_encoded = df_encoded.fillna(0)

#correlation matrix
corr = df_encoded.corr()

#heatmap using Plotly
fig = go.Figure(data=go.Heatmap(
    z=corr.values, 
    x=corr.index, 
    y=corr.columns, 
    colorscale='Viridis', 
    colorbar=dict(title="Correlation")
))

# Update layout
fig.update_layout(
    title="<b>Correlation Matrix Heatmap</b>",
    title_x=0.5,
    titlefont=dict(size=20, family="Arial"),
    template="plotly_dark",
    autosize=True
)

# Show the heatmap
fig.show()


# STROKE SUNBURSTS

In [44]:
## Grouping Datasets
gender_stroke_df = df.groupby(['gender', 'stroke']).size().reset_index().rename(columns={0: 'count'})

hypertension_stroke_df = df.groupby(['hypertension', 'stroke']).size().reset_index().rename(columns={0: 'count'})

married_stroke_df = df.groupby(['ever_married', 'stroke']).size().reset_index().rename(columns={0: 'count'})

work_type_stroke_df = df.groupby(['work_type', 'stroke']).size().reset_index().rename(columns={0: 'count'})

residence_stroke_df = df.groupby(['Residence_type', 'stroke']).size().reset_index().rename(columns={0: 'count'})

smoking_stroke_df = df.groupby(['smoking_status', 'stroke']).size().reset_index().rename(columns={0: 'count'})

## Creating Sunburst Figures
sb1 = px.sunburst(gender_stroke_df, values='count', path=['gender', 'stroke'])
sb2 = px.sunburst(hypertension_stroke_df, values='count', path=['hypertension', 'stroke'])

sb3 = px.sunburst(married_stroke_df, values='count', path=['ever_married', 'stroke'])
sb4 = px.sunburst(work_type_stroke_df, values='count', path=['work_type', 'stroke'])

sb5 = px.sunburst(residence_stroke_df, values='count', path=['Residence_type', 'stroke'])
sb6 = px.sunburst(smoking_stroke_df, values='count', path=['smoking_status', 'stroke'])

## Subplots
fig = make_subplots(rows=3, cols=2, specs=[
    [{"type": "sunburst"}, {"type": "sunburst"}],
    [{"type": "sunburst"}, {"type": "sunburst"}],
    [{"type": "sunburst"}, {"type": "sunburst"}]
], subplot_titles=("Gender and Stroke", "Hypertension and Stroke",
                   "Married and Stroke", "Work Type and Stroke",
                   "Residence and Stroke", "Smoking and Stroke"))

## Plotting Figures
fig.add_trace(sb1.data[0], row=1, col=1)
fig.add_trace(sb2.data[0], row=1, col=2)
fig.add_trace(sb3.data[0], row=2, col=1)
fig.add_trace(sb4.data[0], row=2, col=2)
fig.add_trace(sb5.data[0], row=3, col=1)
fig.add_trace(sb6.data[0], row=3, col=2)

fig.update_traces(textinfo="label+percent parent")

# Update title and height
fig.update_layout(title_text="Stroke Sunbursts", title_x=0.5, height=1300, template='plotly_dark', showlegend=False,
        font=dict(
            family="Rubik",
            size=14)
)

fig.show()

# DATA PREPROCESSING

In [45]:
# Converting gender, ever_married and Residence_type into 0's and 1's
df['gender']=df['gender'].apply(lambda x : 1 if x=='Male' else 0) 
df["ever_married"] = df["ever_married"].apply(lambda x: 1 if x=="Yes" else 0)
df["Residence_type"] = df["Residence_type"].apply(lambda x: 1 if x=="Urban" else 0)

In [46]:
df = pd.get_dummies(data=df, columns=['smoking_status'])
df = pd.get_dummies(data=df, columns=['work_type'])

In [47]:
df = df.dropna()

In [48]:
X = df.drop(['stroke'], axis=1).values 
y = df['stroke'].values

# MACHINE LEARNING MODELS

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) #t80-20 train rtest split

In [51]:
len(X_test) #train-3926    test-982

982

In [52]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Check the new class distribution
pd.Series(y_train_balanced).value_counts()


0    3744
1    3744
Name: count, dtype: int64

# RANDOM FOREST

In [67]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, classification_report
import pandas as pd

# Initialize and train the model using the balanced dataset
rfc = RandomForestClassifier(n_estimators=100, random_state=42, class_weight={0: 1, 1: 100})
rfc.fit(X_train_balanced, y_train_balanced)

# Model evaluation
rfc_score = rfc.score(X_train_balanced, y_train_balanced)
rfc_test = rfc.score(X_test, y_test)

# Predictions with probability threshold adjustment
y_pred_proba = rfc.predict_proba(X_test)[:, 1]  # Get probability scores for class 1
y_pred = (y_pred_proba > 0.15).astype(int)   # Lower threshold to 0.3

# Evaluate model performance
print(classification_report(y_test, y_pred))

# Confusion matrix
rfc_cm = confusion_matrix(y_test, y_pred)
conf_matrix = pd.DataFrame(data=rfc_cm, columns=['Predicted:0', 'Predicted:1'], index=['Actual:0', 'Actual:1'])

# Precision
precision = precision_score(y_test, y_pred, average='binary')  # Adjust average as needed

# Print results with correct rounding
print(f'Training Score: {round(rfc_score, 3)}')
print(f'Testing Score: {round(rfc_test, 3)}')
print(f'Accuracy Score: {round(accuracy_score(y_test, y_pred), 3)}')
print(f'Precision Score: {round(precision, 3)}')

# Display confusion matrix
print("\nConfusion Matrix:")
print(conf_matrix)



# Function to make predictions from user input
def predict_stroke(user_input):
    # Ensure input is a DataFrame with the same structure as training data
    input_df = pd.DataFrame([user_input])
    
    # Apply the same preprocessing steps as in training
    categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
    input_df = pd.get_dummies(input_df, columns=categorical_columns)
    
    # Align columns with the trained model
    missing_cols = set(feature_columns) - set(input_df.columns)
    for col in missing_cols:
        input_df[col] = 0  # Add missing columns with default value 0

    input_df = input_df[feature_columns]  # Ensure correct column order

    # Make prediction
    prediction_prob = model.predict_proba(input_df)[0][1]  # Probability of stroke
    prediction = "Stroke" if prediction_prob > 0.5 else "No Stroke"

    return {"prediction": prediction, "confidence": round(prediction_prob, 2)}

# Example usage (for testing)
user_input_example = {
    "gender": "Male", "age": 67, "hypertension": 0, "heart_disease": 1,
    "ever_married": "Yes", "work_type": "Private", "Residence_type": "Urban",
    "avg_glucose_level": 228.69, "bmi": 36.6, "smoking_status": "formerly smoked"
}

print(predict_stroke(user_input_example))


              precision    recall  f1-score   support

           0       0.98      0.89      0.93       955
           1       0.09      0.37      0.14        27

    accuracy                           0.87       982
   macro avg       0.53      0.63      0.54       982
weighted avg       0.96      0.87      0.91       982

Training Score: 1.0
Testing Score: 0.965
Accuracy Score: 0.875
Precision Score: 0.086

Confusion Matrix:
          Predicted:0  Predicted:1
Actual:0          849          106
Actual:1           17           10


In [54]:
import pandas as pd

# Convert y_test to a Pandas Series
pd.Series(y_test).value_counts()


0    955
1     27
Name: count, dtype: int64

In [55]:
import numpy as np

# Check unique values in predictions
np.unique(y_pred, return_counts=True)


(array([0]), array([982]))

In [56]:
rfc_conf = confusion_matrix(y_test, y_pred)
rfc_report = classification_report(y_test, y_pred)
rfc_acc = round(accuracy_score(y_test, y_pred)*100, ndigits = 2)
print(f"Confusion Matrix : \n\n{rfc_conf}")
print(f"\nClassification Report : \n\n{rfc_report}")
print(f"\nThe Accuracy of Random Forest Classifier is {rfc_acc} %")

Confusion Matrix : 

[[955   0]
 [ 27   0]]

Classification Report : 

              precision    recall  f1-score   support

           0       0.97      1.00      0.99       955
           1       0.00      0.00      0.00        27

    accuracy                           0.97       982
   macro avg       0.49      0.50      0.49       982
weighted avg       0.95      0.97      0.96       982


The Accuracy of Random Forest Classifier is 97.25 %



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [57]:
z=rfc_cm
fig = ff.create_annotated_heatmap(z,
                                  name=True, 
                                  x=['Actual No Stroke','Actual Stroke'], 
                                  y=['Predicted No Stroke','Predicted Stroke'], 
                                  colorscale='RdPu',
                                  xgap=3,ygap=3)
fig['data'][0]['showscale'] = True

fig.update_layout(title_text='<b>Confusion Matrix<b>',
                  title_x=0.5,
                  titlefont={'size': 24, 'family': 'Courier New'},
                  width=700, height=400,
                  xaxis_showgrid=False,
                  yaxis_showgrid=False,
                  yaxis_autorange='reversed', 
                  paper_bgcolor=None,
                  template='plotly_dark'
                  )

fig.show()