# Kingsley and Ossai
## Project Title: Time Series Predictive Modeling of Diabetes Progression and Health Risk Stratification Using Electronic Health Records

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.preprocessing import StandardScaler    
import pickle                                       
import os                                           

# Confirmation output showing that the import cell ran to completion
print("Libraries imported successfully Kingsley-Ossai")

### Load dataset

In [None]:
# Read the diabetes records from the CSV file in the working directory
data = pd.read_csv('diabetes_dataset.csv')

# Preview the first rows of the table
display(data.head())

# List the column names so they can be compared with the feature and
# target names used in the next cell
print(
    'Columns in dataset:',
    list(data.columns),
)

###  Define Features (X) and Target (y)

In [None]:
# Predictor columns expected in the dataset (names must match the CSV exactly)
feature_cols = [
    'Age',
    'BMI',
    'Blood Glucose',
    'Blood Pressure',
    'HbA1c',
    'Insulin Level',
    'Skin thickness',
    'Pregnancies',
    'Family history',
    'Physical Activity',
    'Smoking status',
    'Alcohol Intake',
    'Diet Type',
    'Cholesterol',
    'Triglycerides',
    'Waiste ratio',
]
# Label column the model will learn to predict
target_col = 'Outcome'

# Determine which expected columns are absent from the loaded data
missing_features = [name for name in feature_cols if name not in data.columns]
missing_target = target_col not in data.columns

# Report each kind of problem separately so the user sees exactly what to fix
if missing_features:
    print(f"Missing feature columns in data: {missing_features}")
if missing_target:
    print(f"Missing target column in data: {target_col}")

if missing_features or missing_target:
    # X and y are intentionally left undefined here; later cells check for them
    print("Please check your column names above and update feature_cols and target_col to match exactly.")
else:
    # Every expected column is present, so select them directly
    X = data[feature_cols]
    y = data[target_col]

    print("Features (X) and Target (y) defined.")
    print(f"Shape of X: {X.shape}")
    print(f"Shape of y: {y.shape}")

###  Preprocess Data: Feature Scaling

In [None]:
# Create the scaler unconditionally so the name `scaler` exists for later
# cells even when the feature matrix is missing
scaler = StandardScaler()

if 'X' not in globals():
    # The feature-definition cell did not produce X, so there is nothing to scale
    print("Error: 'X' is not defined. Please run the cell that defines your features (X) and target (y) before this step.")
else:
    # Fit the scaler on every row of X and standardise X in the same call.
    # NOTE: no train/test split is applied in this cell, so the scaling
    # statistics come from the full dataset.
    X_scaled = scaler.fit_transform(X)
    print("Features scaled using StandardScaler.")
    print("\nScaler is fitted and ready to be saved.")

### Train the Random Forest Model

In [None]:
# Build a 100-tree random forest; the fixed random_state makes runs reproducible
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Training needs both the scaled features and the target from earlier cells
if not all(name in globals() for name in ('X_scaled', 'y')):
    print("Error: X_scaled or y is not defined. Please ensure previous cells ran successfully and column names match your data.")
else:
    # Fit on the complete scaled feature matrix (no rows are held out here)
    model.fit(X_scaled, y)
    print("Random Forest Classifier model trained successfully!")

### Save the Model and Scaler

In [None]:
# Persist the trained model and the fitted scaler as pickle files.
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted

# Define filenames for the saved files
model_filename = 'diabetes_rf_model.pkl'
scaler_filename = 'scaler.pkl'

# The scaling and training cells only print an error when their inputs are
# missing, which leaves `scaler` / `model` defined but unfitted. Verify both
# are fitted first so an unusable object never overwrites a good artifact.
try:
    check_is_fitted(model)
    check_is_fitted(scaler)
except NotFittedError as not_fitted:
    print(
        f"Error: nothing was saved because the model or scaler is not fitted. {not_fitted}\n"
        "Run the scaling and training cells successfully before saving."
    )
else:
    # Save the trained model
    with open(model_filename, 'wb') as model_file:
        pickle.dump(model, model_file)
    print(f"Model saved successfully as '{model_filename}'")

    # Save the fitted scaler
    with open(scaler_filename, 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)
    print(f"Scaler saved successfully as '{scaler_filename}'")

### Generate Predictions with the Model

In [None]:
# Generate class predictions for X_scaled, the same array the training cell
# fits the model on. Any failure (for example an unfitted model or a missing
# X_scaled) is reported as a message instead of stopping the notebook.
try:
    predictions = model.predict(X_scaled)
    # Show a short sample of the output as a quick sanity check
    print("First 10 predictions:", predictions[:10])
except Exception as err:
    print(
        f"Prediction error: {err}\n"
        "Make sure the model has been trained (fit) before calling predict. "
        "Run the training cell above and check for errors."
    )

### Calculate the F1 Score

In [None]:
# Import the metric plus the helpers needed for a held-out estimate
from sklearn.base import clone
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# --- Score on the training data -------------------------------------------
# `predictions` were produced for X_scaled, the same rows the model was fitted
# on, so this number is an optimistic upper bound and is labelled as such.
if 'y' in globals() and 'predictions' in globals():
    try:
        f1 = f1_score(y, predictions)
        print(f'F1 Score (training data): {f1:.2f}')
    except ValueError as ve:
        print(f"ValueError: {ve}\nCheck that y and predictions have the same length and are both defined. If you changed the number of features or retrained the model, make sure you reran all previous cells and that your scaler/model files are up to date.")
else:
    print("Error: y or predictions is not defined. Please ensure previous cells ran successfully and the model was trained and used for prediction.")

# --- Cross-validated score on held-out folds ------------------------------
# Unfitted copies of the scaler and model are combined in a pipeline so the
# scaler is re-fitted inside each training fold; the objects that get saved
# to disk are left untouched.
if all(name in globals() for name in ('X', 'y', 'model', 'scaler')):
    try:
        cv_pipeline = make_pipeline(clone(scaler), clone(model))
        cv_scores = cross_val_score(cv_pipeline, X, y, cv=5, scoring='f1')
        print(f'Cross-validated F1 Score (5-fold): {cv_scores.mean():.2f} +/- {cv_scores.std():.2f}')
    except ValueError as ve:
        print(f"Cross-validation error: {ve}")
else:
    print("Error: X, y, model or scaler is not defined, so the cross-validated F1 score could not be computed.")

# Save the fitted scaler using joblib (do not overwrite scaler.pkl)
This cell saves the fitted StandardScaler to a separate file (`scaler_joblib.pkl`) to avoid interfering with the main app's scaler.

In [None]:
import joblib
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted

# The scaling cell only prints an error when X is missing, leaving `scaler`
# defined but unfitted. Confirm it is fitted before writing so an unusable
# scaler is never persisted.
try:
    check_is_fitted(scaler)
except NotFittedError as not_fitted:
    print(f"Error: scaler_joblib.pkl was not written because the scaler is not fitted. {not_fitted}")
else:
    # Written to its own file so the pickle-based scaler.pkl is not overwritten
    joblib.dump(scaler, 'scaler_joblib.pkl')
    print("Scaler saved as scaler_joblib.pkl (joblib format).")