# 🤖 Intelligent Predictor PRO++++ — Model Experiments Notebook

**Purpose:** Development and experimentation environment for ML models, hyperparameter tuning, and model comparison

**Author:** Data Science Team  
**Date:** 2025-10-09  
**Version:** 2.0.1

---

## 📋 Table of Contents

1. [Setup & Imports](#setup)
2. [Data Loading & Preprocessing](#data-loading)
3. [Baseline Models](#baseline)
4. [AutoML Pipeline](#automl)
5. [Hyperparameter Tuning](#hpo)
6. [Model Comparison](#comparison)
7. [Ensemble Methods](#ensemble)
8. [Feature Importance & SHAP](#shap)
9. [Model Registry](#registry)
10. [Time Series Forecasting](#timeseries)
11. [Anomaly Detection](#anomaly)
12. [Model Deployment](#deployment)

---

## 1. Setup & Imports

In [None]:
import os, sys, json, time, warnings
from pathlib import Path
from typing import Dict, List, Any
from datetime import datetime
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from scipy import stats

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
                             mean_squared_error, mean_absolute_error, r2_score, confusion_matrix)
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler

try:
    import lightgbm as lgb
except Exception: lgb = None
try:
    import xgboost as xgb
except Exception: xgb = None
try:
    import catboost as cb
    _HAS_CAT = True
except Exception:
    _HAS_CAT = False

try:
    import shap
    _HAS_SHAP = True
except Exception:
    _HAS_SHAP = False

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Optional project modules (safe import)
def _safe_import(mod, attr):
    """Return attribute ``attr`` of module ``mod``, or ``None`` if unavailable.

    Any exception raised while importing the module or while looking up the
    attribute is swallowed, so a missing optional project module never breaks
    the notebook; callers test the result against ``None`` and install a
    fallback implementation.
    """
    try:
        # A non-empty fromlist makes __import__ return the leaf module of a
        # dotted path instead of the top-level package.
        module = __import__(mod, fromlist=[attr])
    except Exception:
        return None
    try:
        return getattr(module, attr)
    except Exception:
        return None

# Put the working directory and its parent at the front of sys.path so the
# `src.*` package can be resolved from either location.
sys.path.insert(0, str(Path.cwd()))
sys.path.insert(0, str(Path.cwd().parent))

# Each name below is either the project's implementation or None when the
# import failed; the "Fallbacks" section replaces every None with a local
# stand-in of the same name.
clean_data = _safe_import('src.data_processing.data_cleaner','clean_data')
engineer_features = _safe_import('src.data_processing.feature_engineering','engineer_features')
train_automl = _safe_import('src.ml_models.automl_pipeline','train_automl')
detect_problem_type = _safe_import('src.ml_models.automl_pipeline','detect_problem_type')
register_model = _safe_import('src.ml_models.model_registry','register_model')
list_models = _safe_import('src.ml_models.model_registry','list_models')
get_best_model = _safe_import('src.ml_models.model_registry','get_best_model')
load_model = _safe_import('src.ml_models.model_registry','load_model')
get_shap_values = _safe_import('src.ml_models.explainability','get_shap_values')
plot_shap_summary = _safe_import('src.ml_models.explainability','plot_shap_summary')
forecast = _safe_import('src.ml_models.forecasting','forecast')
detect_anomalies = _safe_import('src.ml_models.anomaly_detection','detect_anomalies')

# Fallbacks
if clean_data is None:
    def clean_data(df, remove_duplicates=True, handle_missing='auto'):
        """Return a cleaned copy of ``df`` (the input is not modified).

        Args:
            df: Input DataFrame.
            remove_duplicates: Drop exact duplicate rows when true.
            handle_missing: Only ``'auto'`` triggers imputation: numeric
                columns get their median, object/category columns get their
                most frequent value (``'NA'`` if the column has no mode).

        Returns:
            The cleaned DataFrame.
        """
        cleaned = df.copy()
        if remove_duplicates:
            cleaned = cleaned.drop_duplicates()
        if handle_missing != 'auto':
            return cleaned

        numeric_cols = cleaned.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            cleaned[col] = cleaned[col].fillna(cleaned[col].median())

        text_cols = cleaned.select_dtypes(include=['object', 'category']).columns
        for col in text_cols:
            modes = cleaned[col].mode()
            fill_value = 'NA' if modes.empty else modes.iloc[0]
            cleaned[col] = cleaned[col].fillna(fill_value)
        return cleaned

if engineer_features is None:
    def engineer_features(df, **kwargs):
        """One-hot encode every object/category column of ``df``.

        The first level of each encoded column is dropped
        (``drop_first=True``). Extra keyword arguments are accepted for
        signature compatibility and ignored.
        """
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        return pd.get_dummies(df, columns=list(categorical_cols), drop_first=True)

if detect_problem_type is None:
    def detect_problem_type(df, target):
        """Guess the task from the target column.

        Returns ``'regression'`` when the target has an integer, float or
        unsigned dtype and more than 20 distinct values; otherwise
        ``'classification'``.
        """
        column = df[target]
        is_numeric = column.dtype.kind in 'ifu'
        if is_numeric and column.nunique() > 20:
            return 'regression'
        return 'classification'

if train_automl is None:
    def train_automl(df, target, test_size=0.2, random_state=42, **kwargs):
        """Fit a few candidate models and return the best one on a holdout.

        The frame is split into train/holdout (stratified for classification).
        LightGBM and XGBoost are tried when installed, a random forest always.

        Args:
            df: DataFrame containing the features and the ``target`` column.
            target: Name of the target column.
            test_size: Fraction of rows held out for model selection.
            random_state: Seed used for the split and every candidate.
            **kwargs: Accepted for signature compatibility; ignored.

        Returns:
            ``(best_model, best_metrics, problem_type)``. Classification is
            ranked by weighted F1 (higher wins), regression by RMSE (lower
            wins). ``best_model`` is ``None`` only if no candidate produced a
            comparable score.
        """
        prob = detect_problem_type(df, target)
        is_clf = prob == 'classification'
        X = df.drop(columns=[target])
        y = df[target]
        Xtr, Xte, Ytr, Yte = train_test_split(
            X, y, test_size=test_size, random_state=random_state,
            stratify=y if is_clf else None)

        cands = []
        if is_clf:
            if lgb: cands.append(lgb.LGBMClassifier(random_state=random_state, verbose=-1))
            if xgb: cands.append(xgb.XGBClassifier(random_state=random_state, eval_metric='logloss', verbosity=0))
            cands.append(RandomForestClassifier(n_estimators=300, random_state=random_state, n_jobs=-1))
        else:
            if lgb: cands.append(lgb.LGBMRegressor(random_state=random_state, verbose=-1))
            if xgb: cands.append(xgb.XGBRegressor(random_state=random_state, verbosity=0))
            cands.append(RandomForestRegressor(n_estimators=300, random_state=random_state, n_jobs=-1))

        # A single "higher is better" score is tracked for both task types;
        # regression uses the negated RMSE.
        best, best_metrics, best_score = None, {}, -np.inf
        for m in cands:
            m.fit(Xtr, Ytr)
            yp = m.predict(Xte)
            if is_clf:
                met = {'accuracy': accuracy_score(Yte, yp),
                       'precision': precision_score(Yte, yp, average='weighted', zero_division=0),
                       'recall': recall_score(Yte, yp, average='weighted', zero_division=0),
                       'f1': f1_score(Yte, yp, average='weighted', zero_division=0)}
                score = met['f1']
            else:
                met = {'rmse': float(np.sqrt(mean_squared_error(Yte, yp))),
                       'mae': float(mean_absolute_error(Yte, yp)),
                       'r2': float(r2_score(Yte, yp))}
                score = -met['rmse']
            if score > best_score:
                best, best_metrics, best_score = m, met, score
        return best, best_metrics, prob

if get_shap_values is None or plot_shap_summary is None:
    def get_shap_values(model, X):
        """Compute SHAP values for the rows of ``X``, in the order given.

        ``shap.TreeExplainer`` is tried first. If it fails for any reason the
        model-agnostic ``shap.KernelExplainer`` is used on ``model.predict``
        with a background sample of at most 100 rows.

        Raises:
            RuntimeError: If the ``shap`` package is not installed.
        """
        if not _HAS_SHAP: raise RuntimeError("SHAP not installed")
        try:
            explainer = shap.TreeExplainer(model)
            return explainer(X)
        except Exception:
            # The background set only needs to be representative, so a random
            # sample is fine here.
            background = X.sample(min(100, len(X)), random_state=42)
            ke = shap.KernelExplainer(model.predict, background)
            # Explain X itself rather than a shuffled sample of it: the caller
            # plots these values against X, so row i of the result must
            # describe row i of X. Kernel SHAP cost grows with len(X); pass a
            # smaller X to bound the runtime.
            return ke.shap_values(X)
    def plot_shap_summary(shap_values, X):
        """Draw a SHAP summary plot and return the current matplotlib figure.

        Objects exposing ``.values`` are drawn with the beeswarm plot; plain
        arrays are drawn with ``shap.summary_plot`` against ``X``.
        """
        if hasattr(shap_values,'values'):
            shap.plots.beeswarm(shap_values, max_display=20, show=False); return plt.gcf()
        shap.summary_plot(shap_values, X, show=False, max_display=20); return plt.gcf()

if forecast is None:
    def forecast(df, target, horizon, freq='D', **kwargs):
        """Forecast ``target`` for ``horizon`` future periods.

        Prophet (``prophet`` or the older ``fbprophet``) is used when it can be
        imported and fitted. Otherwise a naive forecast repeats the last
        observed value, with a +/-1.96 sigma band taken from the last 30
        observations.

        Args:
            df: Frame with a ``'date'`` column and the ``target`` column.
            target: Name of the column to forecast.
            horizon: Number of future periods.
            freq: Pandas frequency string for the future dates.
            **kwargs: Accepted for signature compatibility; ignored.

        Returns:
            ``(model, forecast_df)``. With Prophet, ``forecast_df`` is the
            output of ``predict`` on history plus future. With the naive
            fallback it holds only the future rows with columns ``ds``,
            ``yhat``, ``yhat_lower``, ``yhat_upper``, and ``model`` is an
            empty placeholder object.

        Raises:
            ValueError: If the naive fallback is reached with an empty ``df``.
        """
        try:
            try:
                from prophet import Prophet
            except Exception:
                from fbprophet import Prophet  # type: ignore
            tmp = df.rename(columns={'date':'ds', target:'y'})[['ds','y']]
            m = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=False)
            m.fit(tmp)
            fut = m.make_future_dataframe(periods=horizon, freq=freq)
            return m, m.predict(fut)
        except Exception:
            # Best effort: any Prophet import or fit problem falls through to
            # the naive forecast below.
            pass

        from pandas.tseries.frequencies import to_offset

        if len(df) == 0:
            raise ValueError("forecast() needs at least one observation")
        series = df[target].astype(float).to_numpy()
        last = float(series[-1])
        # Slicing clamps at the start, so this is "up to the last 30 points".
        std = float(np.std(series[-30:]))
        # Step by the frequency's own offset so calendar frequencies that are
        # not fixed-length time units also work.
        first_future = pd.Timestamp(df['date'].iloc[-1]) + to_offset(freq)
        dates = pd.date_range(first_future, periods=horizon, freq=freq)
        fc = pd.DataFrame({'ds': dates, 'yhat': np.full(horizon, last),
                           'yhat_lower': last - 1.96*std, 'yhat_upper': last + 1.96*std})
        class N: pass
        return N(), fc

if detect_anomalies is None:
    def detect_anomalies(df, method='isolation_forest', contamination=0.05):
        """Flag anomalous rows using the numeric columns of ``df``.

        Args:
            df: Input DataFrame; only its numeric columns are analysed.
            method: ``'isolation_forest'``, ``'lof'``, or anything else for
                the z-score rule (any column with ``|z| > 3``).
            contamination: Expected anomaly fraction for the two model-based
                methods; unused by the z-score rule.

        Returns:
            A copy of ``df`` with an integer ``is_anomaly`` column (1 = anomaly).
        """
        from sklearn.ensemble import IsolationForest
        from sklearn.neighbors import LocalOutlierFactor

        numeric = df.select_dtypes(include=[np.number])
        result = df.copy()
        if method == 'isolation_forest':
            detector = IsolationForest(contamination=contamination, random_state=42)
            flags = detector.fit_predict(numeric) == -1
        elif method == 'lof':
            detector = LocalOutlierFactor(n_neighbors=20, contamination=contamination)
            flags = detector.fit_predict(numeric) == -1
        else:
            abs_z = np.abs(stats.zscore(numeric, nan_policy='omit'))
            flags = (abs_z > 3).any(axis=1)
        result['is_anomaly'] = flags.astype(int)
        return result

# Simple registry fallback
_REGISTRY = Path('models/registry.json')
if register_model is None:
    def register_model(model_path, target, problem_type, metrics, tags, extra):
        """Append a model entry to the JSON registry and return the entry.

        Args:
            model_path: Path of the serialized model; its stem is stored as
                the ``algorithm`` field.
            target: Name of the target column.
            problem_type: Task type string stored with the entry.
            metrics: Dict of metric values (must be JSON-serializable).
            tags: List of tag strings.
            extra: Free-form JSON-serializable metadata.

        Returns:
            The entry dict that was appended. Its ``id`` is derived from the
            current Unix time in whole seconds.
        """
        _REGISTRY.parent.mkdir(parents=True, exist_ok=True)
        if _REGISTRY.exists():
            with open(_REGISTRY) as fh:
                reg = json.load(fh)
        else:
            reg = []
        entry = {'id': f"mdl_{int(time.time())}", 'path': model_path, 'target': target, 'problem_type': problem_type,
                 'algorithm': Path(model_path).stem, 'timestamp': datetime.now().isoformat(), 'metrics': metrics,
                 'tags': tags, 'extra': extra}
        reg.append(entry)
        # Serialize before opening the file for writing so a non-serializable
        # value cannot leave a truncated registry behind.
        payload = json.dumps(reg, indent=2)
        with open(_REGISTRY, 'w') as fh:
            fh.write(payload)
        return entry
if list_models is None:
    def list_models():
        """Return every registry entry, or an empty list if no registry file exists."""
        if not _REGISTRY.exists():
            return []
        # The context manager closes the handle as soon as parsing is done.
        with open(_REGISTRY) as fh:
            return json.load(fh)
if get_best_model is None:
    def get_best_model(problem_type, metric='f1'):
        """Return the registry entry with the best ``metric`` for a task type.

        Error metrics (``rmse``, ``mae``, ``mse``) are minimised; every other
        metric is maximised. Entries that lack the metric, or store ``None``
        for it, are ranked last. Ties keep the earliest registered entry.

        Returns:
            The best entry dict, or ``None`` when no entry matches
            ``problem_type``.
        """
        candidates = [m for m in list_models() if m['problem_type'] == problem_type]
        if not candidates:
            return None

        lower_is_better = metric in ('rmse', 'mae', 'mse')
        worst = np.inf if lower_is_better else -np.inf

        def _score(entry):
            value = entry['metrics'].get(metric)
            return worst if value is None else value

        pick = min if lower_is_better else max
        return pick(candidates, key=_score)
if load_model is None:
    def load_model(model_id):
        """Load the model registered under ``model_id``.

        Returns the deserialized object for the first registry entry whose
        ``id`` matches, or ``None`` when no entry matches.
        """
        import joblib
        match = next((e for e in list_models() if e['id'] == model_id), None)
        if match is None:
            return None
        return joblib.load(match['path'])

# Seed NumPy's global generator once; the synthetic-data helpers draw their
# categorical columns from it via np.random.choice.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
print("✅ Imports OK")

### Helper Functions

In [None]:
def evaluate_classification(y_true, y_pred, y_pred_proba=None):
    """Compute weighted classification metrics.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        y_pred_proba: Optional scores. May be a 1-D array of positive-class
            scores, or the 2-D output of ``predict_proba``.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1`` (the last
        three weighted by support). When ``y_pred_proba`` is given, a
        ``roc_auc`` key is added; it is ``None`` if the score cannot be
        computed.
    """
    metrics = {'accuracy': accuracy_score(y_true, y_pred),
               'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
               'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
               'f1': f1_score(y_true, y_pred, average='weighted', zero_division=0)}
    if y_pred_proba is not None:
        try:
            scores = np.asarray(y_pred_proba)
            if scores.ndim == 2 and scores.shape[1] == 2:
                # Binary case: roc_auc_score expects 1-D scores for the class
                # with the greater label, i.e. the second predict_proba column.
                metrics['roc_auc'] = roc_auc_score(y_true, scores[:, 1])
            else:
                metrics['roc_auc'] = roc_auc_score(y_true, scores, multi_class='ovr', average='weighted')
        except Exception:
            # ROC AUC is a best-effort extra; never fail the whole evaluation.
            metrics['roc_auc'] = None
    return metrics

def evaluate_regression(y_true, y_pred):
    """Return RMSE, MAE and R² for a set of predictions as plain floats."""
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {'rmse': float(np.sqrt(mse)), 'mae': float(mae), 'r2': float(r2)}

def print_metrics(metrics: Dict[str, float], title: str = "Model Metrics"):
    """Print a metrics dict as an aligned, titled table.

    Numeric values are shown with four decimals and ``None`` as ``N/A``.
    Values that cannot be formatted as a float (for example the string
    ``model_name`` entry the notebook stores next to the scores) are printed
    as-is instead of raising.

    Args:
        metrics: Mapping of metric name to value.
        title: Heading printed above the table.
    """
    rule = "=" * 60
    print("\n" + rule); print(f"📊 {title}"); print(rule)
    for key, value in metrics.items():
        if value is None:
            text = "N/A"
        else:
            try:
                text = format(value, ".4f")
            except (TypeError, ValueError):
                # Not a number (e.g. a label string): show it verbatim.
                text = str(value)
        print(f"{key.upper():12s}: {text}")
    print(rule)

def compare_models(results: List[Dict[str, Any]], metric: str='accuracy'):
    """Rank models by ``metric``, display the table and a bar chart.

    Args:
        results: One metrics dict per model; each must contain ``model_name``
            and ``metric``.
        metric: Column to rank by. Error metrics (``rmse``, ``mae``, ``mse``)
            are sorted ascending so the best model is first; every other
            metric is sorted descending.

    Returns:
        The sorted DataFrame (best model in the first row).
    """
    lower_is_better = metric in ('rmse', 'mae', 'mse')
    df = pd.DataFrame(results).sort_values(metric, ascending=lower_is_better)
    display(df)
    fig = go.Figure()
    fig.add_trace(go.Bar(x=df['model_name'], y=df[metric], text=df[metric].round(4), textposition='auto'))
    fig.update_layout(title=f"Model Comparison: {metric.upper()}", template="plotly_white")
    fig.show()
    return df

def plot_confusion_matrix(y_true, y_pred, labels=None):
    """Show the confusion matrix as an annotated heatmap.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        labels: Optional class labels. They are used both to order the matrix
            rows/columns and as axis tick labels, so the two always agree.
            When omitted, scikit-learn's default ordering is used.
    """
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    fig = px.imshow(cm, labels=dict(x="Predicted", y="Actual", color="Count"), x=labels, y=labels, text_auto=True)
    fig.update_layout(title="Confusion Matrix", template="plotly_white"); fig.show()

print("✅ Helper functions loaded!")

## 2. Data Loading & Preprocessing

In [None]:
def generate_classification_data(n_samples=1000, n_features=12, n_classes=2, random_state=42):
    """Build a synthetic classification dataset.

    Numeric features come from ``sklearn.datasets.make_classification``
    (70% informative, 20% redundant, 5% label noise). Two random categorical
    columns, ``category_1`` (A/B/C) and ``category_2`` (X/Y), are appended.

    Args:
        n_samples: Number of rows.
        n_features: Number of numeric feature columns.
        n_classes: Number of target classes.
        random_state: Seed for both the numeric features and the categorical
            columns, so identical arguments give an identical frame.

    Returns:
        DataFrame with ``feature_0..feature_{n-1}``, ``target``,
        ``category_1`` and ``category_2``.
    """
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=n_samples, n_features=n_features, n_informative=int(n_features*0.7),
                               n_redundant=int(n_features*0.2), n_classes=n_classes, random_state=random_state, flip_y=0.05)
    df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(n_features)])
    df['target'] = y
    # Use a local generator so the categorical columns honour random_state
    # instead of depending on the global NumPy RNG state.
    rng = np.random.RandomState(random_state)
    df['category_1'] = rng.choice(['A','B','C'], n_samples)
    df['category_2'] = rng.choice(['X','Y'], n_samples)
    return df

def generate_regression_data(n_samples=1000, n_features=12, noise=10, random_state=42):
    """Build a synthetic regression dataset.

    Numeric features come from ``sklearn.datasets.make_regression`` (70%
    informative). Two random categorical columns, ``category_1`` (A/B/C) and
    ``category_2`` (X/Y), are appended.

    Args:
        n_samples: Number of rows.
        n_features: Number of numeric feature columns.
        noise: Standard deviation of the Gaussian noise added to the target.
        random_state: Seed for both the numeric features and the categorical
            columns, so identical arguments give an identical frame.

    Returns:
        DataFrame with ``feature_0..feature_{n-1}``, ``target``,
        ``category_1`` and ``category_2``.
    """
    from sklearn.datasets import make_regression
    X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=int(n_features*0.7),
                           noise=noise, random_state=random_state)
    df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(n_features)])
    df['target'] = y
    # Use a local generator so the categorical columns honour random_state
    # instead of depending on the global NumPy RNG state.
    rng = np.random.RandomState(random_state)
    df['category_1'] = rng.choice(['A','B','C'], n_samples)
    df['category_2'] = rng.choice(['X','Y'], n_samples)
    return df

# TASK_TYPE selects the synthetic dataset here and is read by later cells to
# choose estimators, metrics and stratification.
TASK_TYPE = 'classification'  # or 'regression'
df = generate_classification_data() if TASK_TYPE=='classification' else generate_regression_data()
print("📊 Dataset generated:", df.shape)
# Last expression of the cell: the notebook renders the first rows.
df.head()

In [None]:
# Clean the raw frame, then derive the model-ready frame used by every later cell.
print("🧹 Preprocessing...")
df_clean = clean_data(df)
df_engineered = engineer_features(df_clean)
print("✅ Preprocessed:", df_engineered.shape)

In [None]:
# Two-stage split: first carve off TEST_SIZE + VALIDATION_SIZE of the rows,
# then divide that remainder into validation and test. Both stages are
# stratified on the target for classification.
TARGET='target'; TEST_SIZE=0.2; VALIDATION_SIZE=0.1
X = df_engineered.drop(columns=[TARGET]); y = df_engineered[TARGET]
X_train_full, X_temp, y_train_full, y_temp = train_test_split(X,y, test_size=TEST_SIZE+VALIDATION_SIZE, random_state=42,
                                                              stratify=y if TASK_TYPE=='classification' else None)
# Share of the held-out remainder that becomes the validation set; the rest
# (1 - val_ratio) becomes the test set.
val_ratio = VALIDATION_SIZE/(TEST_SIZE+VALIDATION_SIZE)
X_val, X_test, y_val, y_test = train_test_split(X_temp,y_temp, test_size=(1-val_ratio), random_state=42,
                                                stratify=y_temp if TASK_TYPE=='classification' else None)
print(f"Train/Val/Test: {len(X_train_full)}/{len(X_val)}/{len(X_test)} | Features: {X_train_full.shape[1]}")

## 3. Baseline Models

In [None]:
# `results` accumulates one metrics dict per experiment (baseline, AutoML, tuned).
results=[]
from sklearn.dummy import DummyClassifier, DummyRegressor

# Trivial reference model: majority class for classification, mean for regression.
if TASK_TYPE=='classification':
    base=DummyClassifier(strategy='most_frequent')
else:
    base=DummyRegressor(strategy='mean')

base.fit(X_train_full, y_train_full); yb=base.predict(X_test)

met = evaluate_classification(y_test,yb) if TASK_TYPE=='classification' else evaluate_regression(y_test,yb)

# Training time is recorded as 0.0 for the baseline rather than measured.
met['model_name']='Baseline'; met['training_time']=0.0; results.append(met)
print_metrics(met, 'Baseline')

## 4. AutoML Pipeline

In [None]:
ptype = detect_problem_type(df_engineered, TARGET); print("📊 Problem type:", ptype)
# Give AutoML only the training rows. It performs its own internal holdout
# split for model selection, so passing the full frame would let it train on
# rows that are in X_test and make the evaluation below optimistic.
automl_train_df = X_train_full.assign(**{TARGET: y_train_full})
st = time.time()
model, automl_holdout_metrics, _ = train_automl(automl_train_df, target=TARGET, test_size=TEST_SIZE, random_state=42)
tt = time.time() - st
# Score on the shared test set so this row is comparable with the other
# entries in `results`; the internal selection metrics stay available in
# `automl_holdout_metrics`.
y_pred_automl = model.predict(X_test)
metrics = (evaluate_classification(y_test, y_pred_automl) if TASK_TYPE=='classification'
           else evaluate_regression(y_test, y_pred_automl))
metrics['model_name']=f"AutoML ({type(model).__name__})"; metrics['training_time']=tt; results.append(metrics)
print_metrics(metrics, 'AutoML Best Model')

In [None]:
# Visual check of the AutoML model on the test split: confusion matrix for
# classification, actual-vs-predicted scatter for regression.
y_pred = model.predict(X_test)
if TASK_TYPE=='classification':
    plot_confusion_matrix(y_test, y_pred, labels=np.unique(y_test))
else:
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=y_test, y=y_pred, mode='markers', name='Predictions', opacity=0.6))
    # Diagonal y = x over the observed target range marks a perfect prediction.
    mn, mx = float(np.min(y_test)), float(np.max(y_test))
    fig.add_trace(go.Scatter(x=[mn,mx], y=[mn,mx], mode='lines', name='Perfect', line=dict(dash='dash')))
    fig.update_layout(title='Actual vs Predicted', template='plotly_white'); fig.show()

## 5. Hyperparameter Tuning

In [None]:
# Randomized search over the number of trees. LightGBM is tuned when it is
# installed, otherwise a random forest.
if TASK_TYPE=='classification':
    base = lgb.LGBMClassifier(random_state=42, verbose=-1) if lgb else RandomForestClassifier(n_estimators=300, random_state=42)
    scoring='f1_weighted'
else:
    base = lgb.LGBMRegressor(random_state=42, verbose=-1) if lgb else RandomForestRegressor(n_estimators=300, random_state=42)
    scoring='neg_root_mean_squared_error'
# NOTE(review): the search space holds 4 values while n_iter=6 is requested
# below — confirm whether more parameters were meant to be searched.
param_dist={'n_estimators':[100,200,300,500]}
rs = RandomizedSearchCV(base, param_distributions=param_dist, n_iter=6, cv=3, scoring=scoring, random_state=42, n_jobs=-1, verbose=1)
# tt is the wall-clock time of the whole search, not of a single fit.
st=time.time(); rs.fit(X_train_full, y_train_full); tt=time.time()-st
best = rs.best_estimator_; yp = best.predict(X_test)
met = evaluate_classification(y_test, yp) if TASK_TYPE=='classification' else evaluate_regression(y_test, yp)
met['model_name']=f'{type(best).__name__} (Tuned)'; met['training_time']=tt; results.append(met)
print_metrics(met, 'Tuned Model')

## 6. Model Comparison

In [None]:
# (name, estimator) pairs with default hyperparameters; LightGBM and XGBoost
# are included only when their import succeeded.
models_to_compare=[]
if TASK_TYPE=='classification':
    if lgb: models_to_compare.append(('LightGBM', lgb.LGBMClassifier(random_state=42, verbose=-1)))
    if xgb: models_to_compare.append(('XGBoost', xgb.XGBClassifier(random_state=42, eval_metric='logloss', verbosity=0)))
    models_to_compare.append(('Random Forest', RandomForestClassifier(n_estimators=300, random_state=42)))
else:
    if lgb: models_to_compare.append(('LightGBM', lgb.LGBMRegressor(random_state=42, verbose=-1)))
    if xgb: models_to_compare.append(('XGBoost', xgb.XGBRegressor(random_state=42, verbosity=0)))
    models_to_compare.append(('Random Forest', RandomForestRegressor(n_estimators=300, random_state=42)))
# comparison_results feeds the ranking table; trained_models_dict keeps the
# fitted estimators by name for later cells.
comparison_results=[]; trained_models_dict={}
for name,mdl in models_to_compare:
    st=time.time(); mdl.fit(X_train_full,y_train_full); tt=time.time()-st; yp=mdl.predict(X_test)
    met = evaluate_classification(y_test, yp) if TASK_TYPE=='classification' else evaluate_regression(y_test, yp)
    met['model_name']=name; met['training_time']=tt; comparison_results.append(met); trained_models_dict[name]=mdl
comparison_df = compare_models(comparison_results, metric=('f1' if TASK_TYPE=='classification' else 'rmse'))

## 7. Ensemble Methods

In [None]:
from sklearn.ensemble import VotingClassifier, VotingRegressor, StackingClassifier, StackingRegressor
from sklearn.linear_model import LogisticRegression, Ridge
# Ensembles are built from the first two comparison models (or from all of
# them when fewer than two are available).
base_models = models_to_compare[:2] if len(models_to_compare)>=2 else models_to_compare
if TASK_TYPE=='classification':
    ens = VotingClassifier(estimators=base_models, voting='soft'); stk = StackingClassifier(estimators=base_models, final_estimator=LogisticRegression(max_iter=1000))
else:
    ens = VotingRegressor(estimators=base_models); stk = StackingRegressor(estimators=base_models, final_estimator=Ridge())
ens.fit(X_train_full,y_train_full); stk.fit(X_train_full,y_train_full)
yp_e = ens.predict(X_test); yp_s = stk.predict(X_test)
met_e = evaluate_classification(y_test, yp_e) if TASK_TYPE=='classification' else evaluate_regression(y_test, yp_e)
met_s = evaluate_classification(y_test, yp_s) if TASK_TYPE=='classification' else evaluate_regression(y_test, yp_s)
# No training_time is recorded for the two ensemble rows.
met_e['model_name']='Voting Ensemble'; met_s['model_name']='Stacking Ensemble'
# final_df ranks the individual models together with both ensembles.
final_df = compare_models(comparison_results+[met_e, met_s], metric=('f1' if TASK_TYPE=='classification' else 'rmse'))

## 8. Feature Importance & SHAP

In [None]:
# Choose the best-ranked row whose fitted estimator was actually kept in
# trained_models_dict. final_df also lists models that are not stored there;
# taking its first row unconditionally could pair one model's name and
# metrics with a different estimator object.
retained_rows = final_df[final_df['model_name'].isin(list(trained_models_dict))]
best_model_result = (retained_rows if len(retained_rows) else final_df).iloc[0]
best_model_name = best_model_result['model_name']
# The AutoML model remains the last-resort fallback when nothing was retained.
best_model = trained_models_dict.get(best_model_name, model)
if hasattr(best_model,'feature_importances_'):
    imp = pd.DataFrame({'feature': X_train_full.columns, 'importance': best_model.feature_importances_}).sort_values('importance', ascending=False)
    display(imp.head(10))
if _HAS_SHAP:
    try:
        # Explain at most 300 test rows to keep SHAP affordable.
        X_shap = X_test.sample(n=min(300,len(X_test)), random_state=42)
        sv = get_shap_values(best_model, X_shap)
        fig = plot_shap_summary(sv, X_shap); plt.show()
    except Exception as e:
        # Explainability is optional; report the problem and carry on.
        print("SHAP failed:", e)
## 9. Model Registry

In [None]:
import joblib
# Persist the selected model under a timestamped file name, then record it in
# the registry together with its metrics row and the training feature names.
models_dir = Path('models/trained_models'); models_dir.mkdir(parents=True, exist_ok=True)
model_path = models_dir / f"{best_model_name.replace(' ','_').lower()}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.joblib"
joblib.dump(best_model, model_path)
entry = register_model(str(model_path), TARGET, TASK_TYPE, dict(best_model_result), ['best_model','experiment'], {'features': list(X_train_full.columns)})
print("Registered:", entry['id'])

## 10. Time Series Forecasting

In [None]:
def generate_timeseries_data(n_days=365, seed=42):
    """Build a synthetic daily sales series starting on 2023-01-01.

    The series is a linear trend (100 + 0.3 per day) plus a 7-day sine
    seasonality of amplitude 15 plus Gaussian noise (sigma 5).

    Args:
        n_days: Number of consecutive days to generate.
        seed: Seed for the noise. A private generator is used, so calling
            this function does not reseed NumPy's global RNG; the values are
            the same ones the legacy global generator yields for this seed.

    Returns:
        DataFrame with columns ``date`` and ``sales``.
    """
    rng = np.random.RandomState(seed)
    day = np.arange(n_days)
    dates = pd.date_range('2023-01-01', periods=n_days, freq='D')
    trend = 100 + 0.3*day
    weekly = 15*np.sin(2*np.pi*day/7)
    noise = rng.normal(0, 5, n_days)
    return pd.DataFrame({'date': dates, 'sales': trend + weekly + noise})
# Chronological split: first 400 days for fitting, remaining 100 for evaluation.
df_ts = generate_timeseries_data(500)
tr = df_ts.iloc[:400]; te = df_ts.iloc[400:]
m, fc = forecast(tr, target='sales', horizon=len(te), freq='D')
# Only the last len(te) forecast rows are compared with the held-out values.
yhat = fc['yhat'].tail(len(te)).values; ytrue = te['sales'].values
rmse = float(np.sqrt(mean_squared_error(ytrue, yhat))); mae=float(mean_absolute_error(ytrue, yhat))
print("TS Metrics -> RMSE:", rmse, " MAE:", mae)

## 11. Anomaly Detection

In [None]:
# Run each detection method on the feature columns (target removed) and
# tabulate how many rows each one flags.
df_an = df_engineered.drop(columns=[TARGET])
methods = ['isolation_forest','lof','statistical']; comp=[]
for mth in methods:
    tmp = detect_anomalies(df_an, method=mth, contamination=0.05)
    comp.append({'Method': mth.upper(), 'Anomalies': int(tmp['is_anomaly'].sum())})
disp = pd.DataFrame(comp); display(disp)

## 12. Model Deployment

In [None]:
from sklearn.pipeline import Pipeline
import joblib, zipfile
# Wrap scaling and the selected model in one pipeline and refit it on the
# training split so the exported artifact is self-contained.
pipe = Pipeline([('scaler', StandardScaler()), ('model', best_model)])
pipe.fit(X_train_full, y_train_full)
models_dir = Path('models/trained_models'); models_dir.mkdir(parents=True, exist_ok=True)
pipe_path = models_dir / "production_pipeline.joblib"; joblib.dump(pipe, pipe_path)
# Package
export_dir = Path('data/exports/deployment'); export_dir.mkdir(parents=True, exist_ok=True)
zip_path = export_dir / f"model_deployment_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"
meta = {'model_name': best_model_name, 'created': datetime.now().isoformat(), 'features': list(X_train_full.columns)}
meta_path = export_dir / "metadata.json"
# Write and close the metadata file before it is added to the archive, so
# the zip never picks up a partially flushed file.
meta_path.write_text(json.dumps(meta, indent=2))
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as z:
    z.write(pipe_path, arcname='pipeline.joblib')
    z.write(meta_path, arcname='metadata.json')
print("Deployment package:", zip_path.resolve())

## 🎯 Experiment Summary

End-to-end ML experimentation: baseline → AutoML → hyperparameter tuning → model comparison → ensembles → feature importance & SHAP → model registry → time series forecasting → anomaly detection → deployment.

In [None]:
# Closing banner marking the end of the notebook run.
print("\n" + "="*70); print("🎉 MODEL EXPERIMENTS NOTEBOOK COMPLETE!"); print("="*70)