In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import joblib

In [2]:
df = pd.read_csv("static_health_risk_dataset_200_rows.csv")

df.head()

Unnamed: 0,BMI,Genetic_Risk,Age_Risk_Multiplier,Baseline_Risk,Risk_Class
0,27.0,0.89,1.76,0.1,High
1,24.4,0.77,1.17,0.12,Within Range
2,27.6,0.73,1.09,0.16,Within Range
3,31.1,0.73,1.67,0.12,High
4,24.1,0.42,1.02,0.13,Within Range


In [4]:
# Check missing values
print(df.isnull().sum())

# Drop duplicates if any
df = df.drop_duplicates()

# Ensure no negative or invalid BMI
df = df[df["BMI"] > 0]

df.shape

BMI                    0
Genetic_Risk           0
Age_Risk_Multiplier    0
Baseline_Risk          0
Risk_Class             0
dtype: int64


(200, 5)

In [5]:
X = df[["BMI", "Genetic_Risk", "Age_Risk_Multiplier", "Baseline_Risk"]]
y = df["Risk_Class"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [7]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
model = GaussianNB()

model.fit(X_train_scaled, y_train)

In [9]:
y_pred = model.predict(X_test_scaled)

In [11]:
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy*100)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Accuracy: 82.5

Classification Report:

              precision    recall  f1-score   support

        High       0.77      0.83      0.80        12
         Low       1.00      0.33      0.50         3
Within Range       0.85      0.88      0.86        25

    accuracy                           0.82        40
   macro avg       0.87      0.68      0.72        40
weighted avg       0.83      0.82      0.82        40



In [12]:
print("Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

Confusion Matrix:

[[10  0  2]
 [ 0  1  2]
 [ 3  0 22]]


In [13]:
new_patient = np.array([[26.5, 0.7, 1.2, 0.3]])

new_patient_scaled = scaler.transform(new_patient)

probabilities = model.predict_proba(new_patient_scaled)

print("Class Probabilities:", probabilities)
print("Predicted Class:", model.predict(new_patient_scaled))

Class Probabilities: [[4.44116124e-01 6.16127130e-44 5.55883876e-01]]
Predicted Class: ['Within Range']




In [14]:
if accuracy >= 0.80:
    joblib.dump(model, "static_risk_model.pkl")
    joblib.dump(scaler, "scaler.pkl")
    print("Model Saved Successfully!")
else:
    print("Accuracy below 95%, model not saved.")

Model Saved Successfully!


In [15]:
from google.colab import files

files.download("static_risk_model.pkl")
files.download("scaler.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
!pip install imbalanced-learn



In [17]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE

import joblib

In [18]:
df = pd.read_csv("static_health_risk_dataset_5000_rows.csv")

print(df.shape)
df.head()

(5000, 5)


Unnamed: 0,BMI,Genetic_Risk,Age_Risk_Multiplier,Baseline_Risk,Risk_Class
0,28.0,0.21,1.18,0.24,Within Range
1,25.4,0.23,1.01,0.24,Low
2,28.6,0.49,1.83,0.35,Within Range
3,32.1,0.32,1.28,0.26,Within Range
4,25.1,0.29,1.28,0.05,Low


In [19]:
# Remove duplicates
df = df.drop_duplicates()

# Remove invalid BMI
df = df[(df["BMI"] > 15) & (df["BMI"] < 45)]

print(df.isnull().sum())

BMI                    0
Genetic_Risk           0
Age_Risk_Multiplier    0
Baseline_Risk          0
Risk_Class             0
dtype: int64


In [20]:
# Composite Risk Score
df["Composite_Risk"] = (
    df["BMI"] * 0.3 +
    df["Genetic_Risk"] * 0.3 +
    df["Age_Risk_Multiplier"] * 0.2 +
    df["Baseline_Risk"] * 0.2
)

# Interaction Features
df["BMI_Genetic"] = df["BMI"] * df["Genetic_Risk"]
df["Age_Baseline"] = df["Age_Risk_Multiplier"] * df["Baseline_Risk"]

In [21]:
X = df.drop("Risk_Class", axis=1)
y = df["Risk_Class"]

In [22]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.2,
    random_state=42,
    stratify=y_resampled
)

In [24]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf = RandomForestClassifier(random_state=42)

grid = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train_scaled, y_train)

print("Best Parameters:", grid.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [26]:
best_model = grid.best_estimator_

In [27]:
cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=5)

print("Cross Validation Accuracy:", cv_scores.mean())

Cross Validation Accuracy: 0.9868705526484776


In [28]:
y_pred = best_model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)

print("Test Accuracy:", accuracy)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

Test Accuracy: 0.9922547332185886

Classification Report:

              precision    recall  f1-score   support

        High       0.99      1.00      0.99       774
         Low       0.99      1.00      0.99       775
Within Range       1.00      0.98      0.99       775

    accuracy                           0.99      2324
   macro avg       0.99      0.99      0.99      2324
weighted avg       0.99      0.99      0.99      2324


Confusion Matrix:

[[774   0   0]
 [  0 775   0]
 [  9   9 757]]


In [29]:
new_patient = np.array([[26.5, 0.7, 1.3, 0.25,
                         26.5*0.3 + 0.7*0.3 + 1.3*0.2 + 0.25*0.2,
                         26.5*0.7,
                         1.3*0.25]])

new_patient_scaled = scaler.transform(new_patient)

probabilities = best_model.predict_proba(new_patient_scaled)

print("Class Probabilities:", probabilities)
print("Predicted Class:", best_model.predict(new_patient_scaled))

Class Probabilities: [[0. 0. 1.]]
Predicted Class: ['Within Range']




In [30]:
if accuracy >= 0.95:
    joblib.dump(best_model, "static_risk_model.pkl")
    joblib.dump(scaler, "scaler.pkl")
    print("Model saved successfully!")
else:
    print("Accuracy below 95%, model not saved.")

Model saved successfully!


In [31]:
from google.colab import files

files.download("static_risk_model.pkl")
files.download("scaler.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>