# Potential Risk Prediction

## Model training

### Import libraries


In [1]:
import pandas as pd
import numpy as np
import os
import json
import joblib
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# define the path to the save model
models_dir = os.path.join(r'C:\\Users\\echo\\Documents\\fyp\\backend', 'models')
os.makedirs(models_dir, exist_ok=True)

### Load datasets

In [3]:
diabetes_df = pd.read_csv('data/diabetes.csv')
heart_df = pd.read_csv('data/heart.csv')
stroke_df = pd.read_csv('data/stroke.csv')
symptoms_df = pd.read_csv('data/symptoms.csv')

### Diabetes Model

In [4]:
diabetes_df.drop(columns=['DiabetesPedigreeFunction'], inplace=True)
diabetes_df.drop(columns=['Insulin'], inplace=True)
diabetes_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,Age,Outcome
0,6,148,72,35,33.6,50,1
1,1,85,66,29,26.6,31,0
2,8,183,64,0,23.3,32,1
3,1,89,66,23,28.1,21,0
4,0,137,40,35,43.1,33,1
...,...,...,...,...,...,...,...
763,10,101,76,48,32.9,63,0
764,2,122,70,27,36.8,27,0
765,5,121,72,23,26.2,30,0
766,1,126,60,0,30.1,47,1


In [5]:
diabetes_df.drop_duplicates(inplace=True)
diabetes_df.duplicated().sum()

np.int64(0)

In [6]:
# Check for missing values in the dataframe
diabetes_df.dropna(inplace=True)
diabetes_df.isna().sum()

Pregnancies      0
Glucose          0
BloodPressure    0
SkinThickness    0
BMI              0
Age              0
Outcome          0
dtype: int64

In [7]:
# Check for negative values in the relevant columns and remove them\n",
diabetes_df = diabetes_df[diabetes_df['Glucose'] > 0]
diabetes_df = diabetes_df[diabetes_df['BloodPressure'] > 0]
diabetes_df = diabetes_df[diabetes_df['BMI'] > 0]

In [8]:
# Separate features and target
x_diabetes = diabetes_df.drop('Outcome', axis=1) #Features
y_diabetes = diabetes_df['Outcome'] #Target

#### Pipeline Preprocessing

In [9]:
# Identify numeric columns
num_features = diabetes_df.columns.drop('Outcome')

In [10]:
# Create numerical transformer
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

In [11]:
# ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features)
])

In [12]:
# Final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [13]:
# Parameter grid for GridSearchCV
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5],
    'classifier__class_weight': ['balanced', None]
}

#### Split and scale

In [14]:
# Split the data into training and testing sets
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(x_diabetes, y_diabetes, test_size=0.2, random_state=42)

In [15]:
# Grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train_d, y_train_d)


In [16]:
# Best model & score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

Best parameters: {'classifier__class_weight': None, 'classifier__max_depth': 10, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
Best cross-validation accuracy: 0.6378551771585828


In [17]:
# Evaluate on test set
test_score = grid_search.score(X_test_d, y_test_d)
print("Test set accuracy:", test_score)

Test set accuracy: 0.6588235294117647


In [18]:
# Generate predictions for the test set
y_pred_d = grid_search.predict(X_test_d)

# Evaluate accuracy
accuracy = accuracy_score(y_test_d, y_pred_d)
print(f'Diabetes Model Accuracy: {accuracy * 100:.2f}%')

# More detailed evaluation (classification report)
print("Classification Report:\n", classification_report(y_test_d, y_pred_d))

Diabetes Model Accuracy: 80.00%
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.86      0.86       102
           1       0.67      0.65      0.66        43

    accuracy                           0.80       145
   macro avg       0.76      0.76      0.76       145
weighted avg       0.80      0.80      0.80       145



#### Save Model

In [19]:
# Save diabetes model
diabetes_model_path = os.path.join(models_dir, 'diabetes_model.joblib')
joblib.dump(grid_search.best_estimator_, diabetes_model_path)
print(f"Diabetes model saved: {os.path.exists(diabetes_model_path)}")

Diabetes model saved: True


#### Feature Importance

In [20]:
# Access the RandomForestClassifier from the pipeline and get feature importances
diabetes_feature_importance = grid_search.best_estimator_.named_steps['classifier'].feature_importances_

# Print feature importance for model
print("Diabetes Model Feature Importance:", diabetes_feature_importance)

feature_names = x_diabetes.columns

# Create a dictionary to store feature importance
importance_dict = {feature_names[i]: round(float(diabetes_feature_importance[i]), 2)  # Format to 2 decimal places
                   for i in range(len(feature_names))}

# Save the feature importance to a JSON file
with open('diabetes_model_feature_importance.json', 'w') as f:
    json.dump(importance_dict, f)

Diabetes Model Feature Importance: [0.08859982 0.34271107 0.09225911 0.08743824 0.21541678 0.17357498]


### Heart Model

In [21]:
# Prepare data
heart_df = heart_df[['age', 'sex', 'cp', 'chol', 'thalch', 'num']]

In [22]:
# Rename the columns for better readability
heart_df.columns = ['Age', 'Gender', 'ChestPainType', 'Cholesterol','MaxHR','Outcome']

In [23]:
heart_df['ChestPainType'].unique()

array(['typical angina', 'asymptomatic', 'non-anginal', 'atypical angina'],
      dtype=object)

In [24]:
# Cleaning data
heart_df = heart_df.dropna()
heart_df = heart_df.drop_duplicates()
heart_df

Unnamed: 0,Age,Gender,ChestPainType,Cholesterol,MaxHR,Outcome
0,63,Male,typical angina,233.0,150.0,0
1,67,Male,asymptomatic,286.0,108.0,2
2,67,Male,asymptomatic,229.0,129.0,1
3,37,Male,non-anginal,250.0,187.0,0
4,41,Female,atypical angina,204.0,172.0,0
...,...,...,...,...,...,...
913,62,Male,asymptomatic,170.0,138.0,1
914,46,Male,asymptomatic,310.0,126.0,2
915,54,Female,asymptomatic,333.0,154.0,1
917,55,Male,asymptomatic,223.0,100.0,2


In [25]:
# Separate features and target
x_heart = heart_df.drop('Outcome', axis=1) #Features
# binarize the target variable
# Convert the target variable to binary (0 or 1), instead of 0, 1, 2, 3
y_heart = heart_df['Outcome'].apply(lambda x: 1 if x > 0 else 0) #Target

#### Pipeline Preprocessing

In [26]:
# Identify numeric columns
num_features = heart_df.columns.drop(['Outcome', 'Gender', 'ChestPainType'])
cat_features = ['Gender', 'ChestPainType']

In [27]:
# Create numerical transformer
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

In [28]:
# create categorical transformer
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False))
])

In [29]:
# ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
])

In [30]:
# Final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [31]:
# Parameter grid for GridSearchCV
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [15, 20, None],
    'classifier__min_samples_split': [3, 5],
    'classifier__class_weight': ['balanced', None]
}

#### Split and scale

In [32]:
# Split the data into training and testing sets
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(x_heart, y_heart, test_size=0.2, random_state=42)

In [33]:
# Grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train_h, y_train_h)


In [34]:
# Best model & score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

Best parameters: {'classifier__class_weight': None, 'classifier__max_depth': 20, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Best cross-validation accuracy: 0.7953407462317765


In [35]:
# Evaluate on test set
test_score = grid_search.score(X_test_h, y_test_h)
print("Test set accuracy:", test_score)

Test set accuracy: 0.8349514563106796


In [36]:
# Generate predictions for the test set
y_pred_h = grid_search.predict(X_test_h)

# Evaluate accuracy
accuracy = accuracy_score(y_test_h, y_pred_h)
print(f'Heart Model Accuracy: {accuracy * 100:.2f}%')

# More detailed evaluation (classification report)
print("Classification Report:\n", classification_report(y_test_h, y_pred_h))

Heart Model Accuracy: 79.76%
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.75      0.74        64
           1       0.84      0.83      0.83       104

    accuracy                           0.80       168
   macro avg       0.79      0.79      0.79       168
weighted avg       0.80      0.80      0.80       168



#### Save Model

In [37]:
# Save heart model
heart_model_path = os.path.join(models_dir, 'heart_model.joblib')
joblib.dump(grid_search.best_estimator_, heart_model_path)
print(f"Heart model saved: {os.path.exists(heart_model_path)}")

Heart model saved: True


#### Feature Importance

In [38]:
# Access the RandomForestClassifier from the pipeline and get feature importances
heart_feature_importance = grid_search.best_estimator_.named_steps['classifier'].feature_importances_

# Print feature importance for model
print("Heart Model Feature Importance:", heart_feature_importance)

feature_names = x_heart.columns

# Create a dictionary to store feature importance
importance_dict = {feature_names[i]: round(float(heart_feature_importance[i]), 2)  # Format to 2 decimal places
                   for i in range(len(feature_names))}

with open('heart_model_feature_importance.json', 'w') as f:
    json.dump(importance_dict, f)

Heart Model Feature Importance: [0.20031686 0.24610667 0.25070505 0.06642831 0.13913765 0.0812106
 0.01609485]


### Stroke Model

In [39]:
# Prepare data
stroke_df = stroke_df[['gender', 'age', 'hypertension', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']]

In [40]:
# Rename the columns for better readability
stroke_df.columns = ['Gender', 'Age', 'Hypertension', 'GlucoseLevel', 'BMI', 'SmokingStatus', 'Outcome']

In [41]:
# Cleaning data
stroke_df = stroke_df.dropna()
stroke_df = stroke_df.drop_duplicates()
stroke_df = stroke_df[~stroke_df['SmokingStatus'].isin(["Unknown", "formerly smoked"])]
stroke_df = stroke_df[stroke_df['Gender'] != "Other"]
stroke_df

Unnamed: 0,Gender,Age,Hypertension,GlucoseLevel,BMI,SmokingStatus,Outcome
2,Male,80.0,0,105.92,32.5,never smoked,1
3,Female,49.0,0,171.23,34.4,smokes,1
4,Female,79.0,1,174.12,24.0,never smoked,1
6,Male,74.0,1,70.09,27.4,never smoked,1
7,Female,69.0,0,94.39,22.8,never smoked,1
...,...,...,...,...,...,...,...
5096,Male,57.0,0,76.62,28.2,never smoked,0
5100,Male,82.0,1,71.97,28.3,never smoked,0
5102,Female,57.0,0,77.93,21.7,never smoked,0
5106,Female,81.0,0,125.20,40.0,never smoked,0


In [42]:
stroke_df['SmokingStatus'].unique()

array(['never smoked', 'smokes'], dtype=object)

In [43]:
x_stroke = stroke_df.drop('Outcome', axis=1) #Features
y_stroke = stroke_df['Outcome'] #Target

#### Pipeline Preprocessing

In [44]:
# Identify numeric columns
num_features = stroke_df.columns.drop(['SmokingStatus', 'Gender', 'Outcome'])
cat_features = ['Gender', 'SmokingStatus']
bin_features = ['Hypertension']

In [45]:
# Create numerical transformer
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

In [46]:
# create categorical transformer
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False))
])

In [47]:
# create binary transformer
bin_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    # ('label', LabelEncoder())
])

In [48]:
# ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
    ('bin', bin_transformer, bin_features)
])

In [49]:
# Final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [50]:
# Parameter grid for GridSearchCV
param_grid = {
    'classifier__n_estimators': [200, 300],
    'classifier__max_depth': [20, 30, None],
    'classifier__min_samples_split': [2, 4],
    'classifier__class_weight': ['balanced', None]
}

#### Split and scale

In [51]:
# Split the data into training and testing sets
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    x_stroke, y_stroke, test_size=0.2, random_state=42)

In [52]:
# Grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train_s, y_train_s)


In [53]:
# Best model & score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

Best parameters: {'classifier__class_weight': 'balanced', 'classifier__max_depth': 30, 'classifier__min_samples_split': 4, 'classifier__n_estimators': 200}
Best cross-validation accuracy: 0.07723809523809524


In [54]:
# Evaluate on test set
test_score = grid_search.score(X_test_s, y_test_s)
print("Test set accuracy:", test_score)

Test set accuracy: 0.0


In [55]:
# Generate predictions for the test set
y_pred_s = grid_search.predict(X_test_s)

# Evaluate accuracy
accuracy = accuracy_score(y_test_s, y_pred_s)
print(f'Stroke Model Accuracy: {accuracy * 100:.2f}%')

# More detailed evaluation (classification report)
print("Classification Report:\n", classification_report(y_test_s, y_pred_s))

Stroke Model Accuracy: 95.56%
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98       499
           1       0.00      0.00      0.00        19

    accuracy                           0.96       518
   macro avg       0.48      0.50      0.49       518
weighted avg       0.93      0.96      0.94       518



#### Save Model

In [56]:
# Save stroke model
stroke_model_path = os.path.join(models_dir, 'stroke_model.joblib')
joblib.dump(grid_search.best_estimator_, stroke_model_path)
print(f"Stroke model saved: {os.path.exists(stroke_model_path)}")

Stroke model saved: True


In [57]:
# Apply SMOTE to the stroke dataset
# smote = SMOTE(random_state=42)

# # Apply SMOTE to the stroke dataset
# X_train_s_resampled, y_train_s_resampled = smote.fit_resample(X_train_s, y_train_s)

# # Print the class distribution after applying SMOTE
# # Convert categorical columns to numeric using one-hot encoding
# # Convert categorical columns to numeric using one-hot encoding
# X_train_s_resampled = pd.get_dummies(X_train_s_resampled, columns=cat_features, drop_first=True)
# X_train_s_resampled = pd.get_dummies(X_train_s_resampled, columns=cat_features, drop_first=True)

# # Print the class distribution after applying SMOTE
# print("Stroke dataset class distribution after SMOTE:", y_train_s_resampled.value_counts())

#### Feature Importance

In [58]:
# Access the RandomForestClassifier from the pipeline and get feature importances
stroke_feature_importance = grid_search.best_estimator_.named_steps['classifier'].feature_importances_

# Print feature importance for model
print("Stroke Model Feature Importance:", stroke_feature_importance)

feature_names = x_stroke.columns

# Create a dictionary to store feature importance
importance_dict = {feature_names[i]: round(float(stroke_feature_importance[i]), 2)  # Format to 2 decimal places
                   for i in range(len(feature_names))}

with open('stroke_model_feature_importance.json', 'w') as f:
    json.dump(importance_dict, f)

Stroke Model Feature Importance: [0.45331539 0.02863866 0.24512516 0.2011941  0.02040351 0.02497272
 0.02635045]


### Symptom Model (not used)

In [59]:
# symptoms_df

In [60]:
# # Cleaning data
# symptoms_df = symptoms_df.dropna()
# # symptoms_df = symptoms_df.drop_duplicates()

In [61]:
# symptoms_df['prognosis'].unique()

In [62]:
# # Separate features and target
# x_symptoms = symptoms_df.drop('prognosis', axis=1) #Features
# # binarize the target variable
# # Convert the target variable to binary (0 or 1), instead of 0, 1, 2, 3
# y_symptoms = symptoms_df['prognosis'] #Target

#### Pipeline Preprocessing

In [63]:
# Identify numeric columns
# num_features = symptoms_df.columns.drop('prognosis')

In [64]:
# # Create numerical transformer
# num_transformer = Pipeline([
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', RobustScaler())
# ])

In [65]:
# ColumnTransformer
# preprocessor = ColumnTransformer([
#     ('num', num_transformer, num_features)
# ])

In [66]:
# # Final pipeline
# pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('classifier', RandomForestClassifier(random_state=42))
# ])

In [67]:
# # Parameter grid for GridSearchCV
# param_grid = {
#     'classifier__n_estimators': [150, 200],
#     'classifier__max_depth': [15, 20, None],
#     'classifier__min_samples_split': [3, 5],
#     'classifier__class_weight': ['balanced', None]
# }

#### Split and scale

In [68]:
# Split the data into training and testing sets
# X_train_sp, X_test_sp, y_train_sp, y_test_sp = train_test_split(x_symptoms, y_symptoms, test_size=0.2, random_state=42)

In [69]:
# # Grid search
# grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
# grid_search.fit(X_train_sp, y_train_sp)


In [70]:
# # Best model & score
# print("Best parameters:", grid_search.best_params_)
# print("Best cross-validation accuracy:", grid_search.best_score_)

In [71]:
# # Evaluate on test set
# test_score = grid_search.score(X_test_sp, y_test_sp)
# print("Test set accuracy:", test_score)

In [72]:
# # Generate predictions for the test set
# y_pred_sp = grid_search.predict(X_test_sp)

# # Evaluate accuracy
# accuracy = accuracy_score(y_test_sp, y_pred_sp)
# print(f'Accuracy: {accuracy * 100:.2f}%')

# # More detailed evaluation (classification report)
# print("Classification Report:\n", classification_report(y_test_sp, y_pred_sp))

#### Feature Importance

In [73]:
# # For each prognosis
# for prognosis in symptoms_df['prognosis'].unique():
#     # Create binary target (1 for this prognosis, 0 for others)
#     y_binary = (symptoms_df['prognosis'] == prognosis).astype(int)
    
#     # Train a model
#     X = symptoms_df.drop('prognosis', axis=1)
#     model = RandomForestClassifier(random_state=42)
#     model.fit(X, y_binary)
    
#     # Get feature importance
#     feature_importance = model.feature_importances_
    
#     # Store results
#     prognosis_feature_importance[prognosis] = {
#         X.columns[i]: round(float(feature_importance[i]), 3) for i in range(len(feature_importance))
#     }

# # Print the feature importance for each prognosis
# for prognosis, importance in prognosis_feature_importance.items():
#     print(f"Feature importance for {prognosis}:")
#     print(json.dumps(importance, indent=4))

In [74]:
# # Access the RandomForestClassifier from the pipeline and get feature importances
# symptoms_feature_importance = grid_search.best_estimator_.named_steps['classifier'].feature_importances_

# # Print feature importance for model
# print("Symptoms Model Feature Importance:", symptoms_feature_importance)

# feature_names = x_symptoms.columns

# # Create a dictionary to store feature importance
# importance_dict = {feature_names[i]: round(float(symptoms_feature_importance[i]), 2)  # Format to 2 decimal places
#                    for i in range(len(feature_names))}

# with open('Symptoms_model_feature_importance.json', 'w') as f:
#     json.dump(importance_dict, f)