In [2]:
# --- save_artifacts.py ---
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
import joblib 
import os

# Every artifact this script writes (models, scaler, feature list) goes
# under this directory; it is created up front and reused if already present.
ARTIFACTS_DIR = 'model_artifacts'
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

print("Loading data...")
# IMPORTANT: this must be the *exact* dataset used in the final notebook,
# otherwise the saved scaler/models will not match the notebook's results.
DATASET_PATH = r"C:\Users\ASUS\Downloads\V5_Capstone_Final_Dataset.xlsx"
data = pd.read_excel(DATASET_PATH)

print("Preprocessing data...")
# The preprocessing below must stay identical to the final notebook's.

# Expose REF_DATE under the feature name 'Year' used in selected_features.
data['Year'] = data['REF_DATE']

# A population of 0 cannot be used as a divisor: turn it into NaN and drop
# every row whose population is missing before computing any ratio.
population_col = 'Actual Population'
data[population_col] = data[population_col].replace(0, np.nan)
data.dropna(subset=[population_col], inplace=True)

# Target column -> source column. Each target is the source value divided
# by the population and multiplied by 1000.
rate_sources = {
    'Diabetes_per_capita': 'Diabetes',
    'HBP_per_capita': 'High Blood Pressure',
}
for rate_col, count_col in rate_sources.items():
    data[rate_col] = (data[count_col] / data[population_col]) * 1000

# Any +/-inf anywhere in the frame becomes NaN so the dropna calls below
# remove the affected rows.
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(subset=list(rate_sources), inplace=True)

# Feature columns fed to the scaler and the models.
# The ORDER MUST MATCH the order used for training/scaling.
selected_features = [
    'Sugar and confectionery',
    'Eggs',
    'Bakery products',
    'Butter',
    'Dairy products',
    'Cheese',
    'Fresh vegetables',
    'Preserved fruit and fruit preparations',
    'Fish',
    'Non-alcoholic beverages',
    'Preserved vegetables and vegetable preparations',
    'Actual Population',
    'Year',
]

# Final guard: no NaN may remain in any feature or target column.
data.dropna(subset=selected_features + list(rate_sources), inplace=True)

print(f"Final data shape for training: {data.shape}")
if data.empty:
    # Fail loudly rather than fitting a scaler/model on zero rows.
    raise ValueError("Data is empty after preprocessing and NaN removal. Check your data and preprocessing steps.")


# Feature matrix (columns in selected_features order) and the two targets.
X = data[selected_features]
y_diabetes = data['Diabetes_per_capita']
y_hbp = data['HBP_per_capita']

print("Scaling features...")
# The scaler is fitted ONCE, on every row of X (this script makes no
# train/test split), and the same fitted scaler is saved for inference.
scaler = StandardScaler()
scaler.fit(X)
# Transform once and reuse the result for both models instead of calling
# scaler.transform(X) separately before each fit.
X_scaled = scaler.transform(X)

print("Training Diabetes model...")
# Use your FINAL chosen parameters here. These are the initial example
# values; if you tuned the model (e.g. with a grid search), pass the tuned
# parameters to XGBRegressor instead.
model_d = XGBRegressor(n_estimators=100, learning_rate=0.05, max_depth=6, random_state=42)
model_d.fit(X_scaled, y_diabetes)  # Train on scaled data

print("Training HBP model...")
# Same note as above: replace with the tuned HBP parameters if available.
model_hbp = XGBRegressor(n_estimators=100, learning_rate=0.05, max_depth=6, random_state=42)
model_hbp.fit(X_scaled, y_hbp)  # Train on scaled data

print("Saving artifacts...")
# File name -> object to persist. Both models, the fitted scaler and the
# ordered feature list are written with joblib into ARTIFACTS_DIR, in the
# order listed here.
artifacts_to_save = {
    'diabetes_model.joblib': model_d,
    'hbp_model.joblib': model_hbp,
    'scaler.joblib': scaler,
    'selected_features.pkl': selected_features,
}
for artifact_filename, artifact in artifacts_to_save.items():
    joblib.dump(artifact, os.path.join(ARTIFACTS_DIR, artifact_filename))

print(f"Artifacts saved successfully in '{ARTIFACTS_DIR}' directory.")

Loading data...
Preprocessing data...
Final data shape for training: (800, 32)
Scaling features...
Training Diabetes model...
Training HBP model...
Saving artifacts...
Artifacts saved successfully in 'model_artifacts' directory.
