# Precious Kings
## Project Title: Time Series Predictive Modeling Diabetes Progression and Health Risk Stratification Using Electronic Health Records

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split # Though we might not split for simplicity here
from sklearn.ensemble import RandomForestClassifier # Import RandomForest
from sklearn.preprocessing import StandardScaler     # For scaling features
import pickle                                       # For saving the model and scaler
import os                                           # To check if the file exists

print("Libraries imported successfully Precious")

Libraries imported successfully Precious


### Load dataset

In [2]:
# Load the dataset
data = pd.read_csv('diabetes_dataset.csv')
display(data.head())
print('Columns in dataset:', list(data.columns))

Unnamed: 0,Age,Gender,BMI,Family_History,Physical_Activity,Diet_Type,Smoking_Status,Alcohol_Intake,Stress_Level,Hypertension,...,Health_Insurance,Regular_Checkups,Medication_For_Chronic_Conditions,Pregnancies,Polycystic_Ovary_Syndrome,Glucose_Tolerance_Test_Result,Vitamin_D_Level,C_Protein_Level,Thyroid_Condition,Diabetes_Status
0,48,Male,35.5,No,High,Non-Vegetarian,Never,,Medium,Yes,...,No,No,No,0,0,124.3,31.5,7.46,Yes,Yes
1,18,Other,28.7,Yes,Medium,Non-Vegetarian,Current,Moderate,High,No,...,Yes,Yes,No,0,0,151.4,12.5,5.64,Yes,No
2,21,Other,30.0,Yes,High,Non-Vegetarian,Current,Moderate,High,Yes,...,No,No,Yes,0,0,106.1,35.8,7.2,No,Yes
3,25,Female,25.6,No,Medium,Vegetarian,Former,Moderate,High,Yes,...,No,No,Yes,1,No,85.6,15.4,6.53,Yes,No
4,78,Male,38.8,No,High,Non-Vegetarian,Current,High,High,No,...,No,No,Yes,0,0,77.0,28.6,0.58,No,Yes


Columns in dataset: ['Age', 'Gender', 'BMI', 'Family_History', 'Physical_Activity', 'Diet_Type', 'Smoking_Status', 'Alcohol_Intake', 'Stress_Level', 'Hypertension', 'Cholesterol_Level', 'Fasting_Blood_Sugar', 'Postprandial_Blood_Sugar', 'HBA1C', 'Heart_Rate', 'Waist_Hip_Ratio', 'Urban_Rural', 'Health_Insurance', 'Regular_Checkups', 'Medication_For_Chronic_Conditions', 'Pregnancies', 'Polycystic_Ovary_Syndrome', 'Glucose_Tolerance_Test_Result', 'Vitamin_D_Level', 'C_Protein_Level', 'Thyroid_Condition', 'Diabetes_Status']


###  Define Features (X) and Target (y)

In [3]:
# Define the list of feature column names
# Make sure these match your CSV column names EXACTLY!
feature_cols = ['Age', 'BMI', 'Blood Glucose', 'Blood Pressure', 'HbA1c', 'Insulin Level', 'Skin thickness', 'Pregnancies', 'Family history', 'Physical Activity', 'Smoking status', 'Alcohol Intake', 'Diet Qualtiy', 'Cholesterol', 'Triglycerides', 'Waiste ratio']
target_col = 'Outcome' # Assuming 'Outcome' is still the target column

# Check for missing columns before selecting
missing_features = [col for col in feature_cols if col not in data.columns]
missing_target = target_col not in data.columns

if missing_features:
    print(f"Missing feature columns in data: {missing_features}")
if missing_target:
    print(f"Missing target column in data: {target_col}")

if not missing_features and not missing_target:
    # Create the features DataFrame (X)
    # Using .get() with a default empty list to handle missing columns
    X = data[[col for col in feature_cols if col in data.columns]]

    # Create the target Series (y)
    y = data[target_col]

    print("Features (X) and Target (y) defined.")
    print(f"Shape of X: {X.shape}")
    print(f"Shape of y: {y.shape}")
else:
    print("Please check your column names above and update feature_cols and target_col to match exactly.")

Missing feature columns in data: ['Blood Glucose', 'Blood Pressure', 'HbA1c', 'Insulin Level', 'Skin thickness', 'Family history', 'Physical Activity', 'Smoking status', 'Alcohol Intake', 'Diet Qualtiy', 'Cholesterol', 'Triglycerides', 'Waiste ratio']
Missing target column in data: Outcome
Please check your column names above and update feature_cols and target_col to match exactly.


###  Preprocess Data: Feature Scaling

In [4]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Check if X is defined before scaling
if 'X' in globals():
    # Fit the scaler to the features (X) and transform X
    X_scaled = scaler.fit_transform(X)
    print("Features scaled using StandardScaler.")
    # Optionally convert X_scaled back to a DataFrame to view it nicely
    # X_scaled_df = pd.DataFrame(X_scaled, columns=feature_cols)
    # print("\nFirst 5 rows of scaled features:")
    # print(X_scaled_df.head())
    print("\nScaler is fitted and ready to be saved.")
else:
    print("Error: 'X' is not defined. Please run the cell that defines your features (X) and target (y) before this step.")

Error: 'X' is not defined. Please run the cell that defines your features (X) and target (y) before this step.


### Train the Random Forest Model

In [5]:
# Initialize the Random Forest Classifier model
# n_estimators is the number of trees in the forest
model = RandomForestClassifier(n_estimators=100, random_state=42) # random_state for reproducibility

# Train the model only if X_scaled and y are defined
if 'X_scaled' in globals() and 'y' in globals():
    model.fit(X_scaled, y)
    print("Random Forest Classifier model trained successfully!")
else:
    print("Error: X_scaled or y is not defined. Please ensure previous cells ran successfully and column names match your data.")

Error: X_scaled or y is not defined. Please ensure previous cells ran successfully and column names match your data.


### Save the Model and Scaler

In [6]:
# Define filenames for the saved files
model_filename = 'diabetes_rf_model.pkl' # Updated filename
scaler_filename = 'scaler.pkl'

# Save the trained model
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)
print(f"Model saved successfully as '{model_filename}'")

# Save the fitted scaler
with open(scaler_filename, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
print(f"Scaler saved successfully as '{scaler_filename}'")

Model saved successfully as 'diabetes_rf_model.pkl'
Scaler saved successfully as 'scaler.pkl'


### Predict the model

In [7]:
# Make predictions using the trained model
try:
    predictions = model.predict(X_scaled)
    # Display the first 10 predictions
    print("First 10 predictions:", predictions[:10])
except Exception as e:
    print(f"Prediction error: {e}\nMake sure the model has been trained (fit) before calling predict. Run the training cell above and check for errors.")

Prediction error: name 'X_scaled' is not defined
Make sure the model has been trained (fit) before calling predict. Run the training cell above and check for errors.


### Calculate the F1 Score

In [8]:
# Import the f1_score function
from sklearn.metrics import f1_score

# Calculate the F1 score only if y and predictions are defined
if 'y' in globals() and 'predictions' in globals():
    try:
        f1 = f1_score(y, predictions)
        # Display the F1 score
        print(f'F1 Score: {f1:.2f}')
    except ValueError as ve:
        print(f"ValueError: {ve}\nCheck that y and predictions have the same length and are both defined. If you changed the number of features or retrained the model, make sure you reran all previous cells and that your scaler/model files are up to date.")
else:
    print("Error: y or predictions is not defined. Please ensure previous cells ran successfully and the model was trained and used for prediction.")

Error: y or predictions is not defined. Please ensure previous cells ran successfully and the model was trained and used for prediction.


# Save the fitted scaler using joblib (do not overwrite scaler.pkl)
This cell saves the fitted StandardScaler to a separate file (`scaler_joblib.pkl`) to avoid interfering with the main app's scaler.

In [None]:
import joblib

# Assuming `scaler` is your fitted StandardScaler object
joblib.dump(scaler, 'scaler_joblib.pkl')
print("Scaler saved as scaler_joblib.pkl (joblib format).")