In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import os
import joblib 
import warnings

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression, Ridge, Lasso 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 
from xgboost import XGBClassifier, XGBRegressor 
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, 
    mean_squared_error, mean_absolute_error, r2_score 
)
from sklearn.base import clone

# Silence the two warning families the modelling libraries emit most often.
for _warning_category in (FutureWarning, UserWarning):
    warnings.filterwarnings('ignore', category=_warning_category)

print("--- Part 1: Setup & Configuration ---")

# --- Data source and feature-engineering knobs ---
RAW_DATA_PATH = 'data/raw/afgh_may25.csv'
TAKEOVER_DATE_STR = "2021-08-15"       # events on/after this date get is_post_takeover = 1
GRID_CELL_SIZE_DEG = 1.0               # edge length (degrees) of the square spatial grid cells
HISTORY_LAGS_DAYS = [7, 30, 90]        # trailing windows used for the lag features
MIN_ACTOR_FREQ = 10                    # actors seen fewer times than this are pooled together
MAX_DAYS_SINCE_LAST_EVENT = 180        # cap (and fill value) for the time-since-last-event features

# --- Chronological split fractions ---
TEST_SET_FRAC = 0.15
VALIDATION_SET_FRAC_FROM_DEV = 0.15    # taken from what remains after the test split

# --- Cross-validation / model-selection settings ---
N_CV_SPLITS = 3
PRIMARY_METRIC_CLASSIFICATION = 'f1_weighted'
PRIMARY_METRIC_REGRESSION = 'neg_root_mean_squared_error'

# --- Reproducibility ---
seed = 42
np.random.seed(seed)

# --- Output locations (created on demand) ---
RESULTS_DIR = 'results/'
PLOTS_DIR = "visualizations/"
for _output_dir in (RESULTS_DIR, PLOTS_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# --- Display defaults ---
for _option_name, _option_value in (
    ('display.max_columns', 100),
    ('display.width', 1000),
    ('display.float_format', '{:.4f}'.format),
):
    pd.set_option(_option_name, _option_value)
sns.set_theme(style="whitegrid")

print("Configuration loaded.")

In [None]:
# --- Part 2: Load and Initial Clean ACLED Data ---
print("\n--- Part 2: Load and Initial Clean ACLED Data ---")
try:
    df = pd.read_csv(RAW_DATA_PATH, low_memory=False)
    print(f"Loaded ACLED data. Initial shape: {df.shape}")
except FileNotFoundError:
    # Downstream cells all check `df.empty`, so an empty frame acts as the "stop" signal.
    print(f"ERROR: ACLED data file '{RAW_DATA_PATH}' not found. Halting.")
    df = pd.DataFrame() 

if df.empty:
    print("DataFrame is empty after loading or initial cleaning. Skipping further steps.")
else:
    # Unparseable dates become NaT and those rows are discarded.
    df['event_date'] = pd.to_datetime(df['event_date'], errors='coerce')
    df.dropna(subset=['event_date'], inplace=True)

    # Coerce the numeric columns that are present; bad values become NaN.
    numeric_cols_convert = ['latitude', 'longitude', 'fatalities', 'geo_precision', 'time_precision']
    for col in (name for name in numeric_cols_convert if name in df.columns):
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Missing fatality counts are treated as zero; log1p gives the regression target.
    fatalities_clean = df['fatalities'].fillna(0).astype(int)
    df['fatalities'] = fatalities_clean
    df['log_fatalities'] = np.log1p(fatalities_clean)

    def classify_fatalities(x):
        """Bucket a fatality count: 0 -> 'none', 1 -> 'low', anything else -> 'serious'."""
        return {0: 'none', 1: 'low'}.get(x, 'serious')

    df['fatality_level'] = df['fatalities'].map(classify_fatalities)

    # Chronological order with a clean positional index for the later time-based split.
    df.sort_values('event_date', inplace=True)
    df.reset_index(drop=True, inplace=True)

    print(f"Data initially cleaned. Shape: {df.shape}")

In [None]:
# --- Part 3: Feature Engineering ---
print("\n--- Part 3: Feature Engineering ---")

if not df.empty:
    print("Engineering temporal features...")
    # Calendar components are read straight off the .dt accessor, one column per component.
    for _date_part in ('year', 'month', 'dayofweek', 'dayofyear'):
        df[_date_part] = getattr(df['event_date'].dt, _date_part)
    # 1 for events on or after the takeover date, 0 before it.
    _takeover_ts = pd.to_datetime(TAKEOVER_DATE_STR)
    df['is_post_takeover'] = df['event_date'].ge(_takeover_ts).astype(int)

    print("Engineering spatial features (grid_cell_id)...")
    def add_grid_cell_id(df_input, lat_col, lon_col, cell_size_deg):
        """Label every row with the id of the square lat/lon grid cell it falls in.

        The grid is anchored at the minimum latitude/longitude found among rows
        that have both coordinates. Cell ids look like ``"c_<lat_idx>_<lon_idx>"``
        with zero-based indices; rows missing either coordinate get ``"c_NaN_NaN"``.

        Parameters
        ----------
        df_input : pandas.DataFrame
            Source frame; it is not modified.
        lat_col, lon_col : str
            Names of the latitude / longitude columns.
        cell_size_deg : float
            Edge length of a grid cell, in the same units as the coordinates.
            Must be positive.

        Returns
        -------
        pandas.DataFrame
            Copy of ``df_input`` with a ``grid_cell_id`` column (overwritten if it
            already existed, so the function can be re-run safely).
        """
        missing_label = "c_NaN_NaN"
        df_out = df_input.copy()
        # Start from "unknown" everywhere; rows with usable coordinates are overwritten below.
        df_out['grid_cell_id'] = missing_label

        has_coords = (df_out[lat_col].notna() & df_out[lon_col].notna()).to_numpy()
        if not has_coords.any():
            return df_out

        def _cell_index(values):
            """Zero-based cell index along one axis, as a float array (NaN = outside the grid)."""
            lo, hi = values.min(), values.max()
            # One more cell than strictly fits, so the half-open bins [edge_k, edge_k+1)
            # always contain the maximum value, even when the extent is an exact
            # multiple of the cell size.
            n_cells = int(np.floor((hi - lo) / cell_size_deg)) + 1
            edges = lo + cell_size_deg * np.arange(n_cells + 1)
            # Guard against float rounding leaving the last edge at (or below) the maximum.
            edges[-1] = max(edges[-1], np.nextafter(hi, np.inf))
            return np.asarray(pd.cut(values, bins=edges, labels=False, right=False), dtype=float)

        lat_idx = _cell_index(df_out.loc[has_coords, lat_col])
        lon_idx = _cell_index(df_out.loc[has_coords, lon_col])

        located = ~(np.isnan(lat_idx) | np.isnan(lon_idx))
        cell_ids = np.full(lat_idx.shape, missing_label, dtype=object)
        cell_ids[located] = [
            f"c_{int(i)}_{int(j)}" for i, j in zip(lat_idx[located], lon_idx[located])
        ]

        # Positional boolean mask: works regardless of what the frame's index looks like.
        df_out.loc[has_coords, 'grid_cell_id'] = cell_ids
        return df_out
    # Replace df with a copy that carries the new 'grid_cell_id' column.
    df = add_grid_cell_id(df, 'latitude', 'longitude', GRID_CELL_SIZE_DEG)

    print("Engineering actor features (actor1_grouped for rare actors)...")
    def encode_actors_grouped(df_in, actor_col, min_f):
        """Return a copy of ``df_in`` with rare actors pooled into one category.

        Missing values in ``actor_col`` are first replaced by ``"ACTOR_UNKNOWN"``.
        Any actor that then appears fewer than ``min_f`` times is written as
        ``'Other_Actor'`` in the new ``<actor_col>_grouped`` column; all other
        actors keep their own name. ``df_in`` itself is left untouched.
        """
        result = df_in.copy()
        result[actor_col] = result[actor_col].fillna("ACTOR_UNKNOWN")
        actors = result[actor_col]

        frequencies = actors.value_counts()
        infrequent = frequencies[frequencies < min_f].index

        # Vectorised equivalent of "replace x by 'Other_Actor' when x is infrequent".
        result[f'{actor_col}_grouped'] = actors.mask(actors.isin(infrequent), 'Other_Actor')
        return result
    # Adds 'actor1_grouped'; actors seen fewer than MIN_ACTOR_FREQ times become 'Other_Actor'.
    df = encode_actors_grouped(df, 'actor1', MIN_ACTOR_FREQ)

    print("Engineering lagged/trend features (event counts & fatality sums)...")
    def engineer_lagged_features(df_input, unit_col, date_col='event_date', lag_windows=None):
        """Add trailing-window event counts and fatality sums per unit.

        For every value of ``unit_col`` the events are aggregated per calendar day,
        and for each ``lag`` in ``lag_windows`` two columns are produced:

        * ``event_count_<unit_col>_lag<lag>d``  - events in the ``lag`` days before the event's day
        * ``sum_fatalities_<unit_col>_lag<lag>d`` - fatalities over the same days

        The window is ``[day - lag, day)``: the event's own day is excluded so the
        features only use information from strictly earlier days.

        Parameters
        ----------
        df_input : pandas.DataFrame
            Must contain ``unit_col``, a datetime ``date_col`` and ``fatalities``.
            It is not modified.
        unit_col : str
            Column identifying the spatial/administrative unit.
        date_col : str, default 'event_date'
        lag_windows : list[int] | None
            Window lengths in days; defaults to ``[7, 30, 90]``.

        Returns
        -------
        pandas.DataFrame
            Copy of ``df_input`` (rows in the same order, fresh RangeIndex) with the
            lag columns added. Rows without a matching unit/day get 0.
        """
        if lag_windows is None:
            lag_windows = [7, 30, 90]

        df_out = df_input.copy()
        # 'size' ignores the column's values; any existing column can carry the count.
        id_col_for_count = 'event_id_cnty' if 'event_id_cnty' in df_out.columns else df_out.columns[0]

        # One row per (unit, day) that actually has events; days without events are absent.
        daily_agg = (
            df_out.groupby([unit_col, pd.Grouper(key=date_col, freq='D')], observed=False)
            .agg(
                _daily_event_count=(id_col_for_count, 'size'),
                _daily_fatalities_sum=('fatalities', 'sum'),
            )
            .sort_index()
        )

        day_key = '_day_only_date'
        per_unit_frames = []
        for unit_name, unit_daily in daily_agg.groupby(level=0):
            unit_daily = unit_daily.reset_index(level=0, drop=True)
            lag_features = {}
            for lag in lag_windows:
                # closed='left' gives the window [day - lag, day). Because the daily
                # series is sparse, shifting by one *row* instead would pull in the
                # previous event day no matter how long ago it was.
                trailing = unit_daily.rolling(f'{lag}D', closed='left', min_periods=1).sum().fillna(0)
                lag_features[f'event_count_{unit_col}_lag{lag}d'] = trailing['_daily_event_count'].astype(int)
                lag_features[f'sum_fatalities_{unit_col}_lag{lag}d'] = trailing['_daily_fatalities_sum']
            unit_lags = pd.DataFrame(lag_features, index=unit_daily.index)
            unit_lags.index.name = day_key
            per_unit_frames.append(unit_lags.reset_index().assign(**{unit_col: unit_name}))

        if not per_unit_frames:
            return df_out

        daily_lags = pd.concat(per_unit_frames, ignore_index=True)
        daily_lags[day_key] = pd.to_datetime(daily_lags[day_key]).dt.normalize()
        lag_cols = [col for col in daily_lags.columns if col not in (unit_col, day_key)]

        # Drop stale copies first so re-running the cell does not create suffixed duplicates.
        df_out = df_out.drop(columns=[col for col in lag_cols if col in df_out.columns])
        df_out[day_key] = df_out[date_col].dt.normalize()
        df_out = df_out.merge(daily_lags, on=[unit_col, day_key], how='left').drop(columns=[day_key])

        # Plain assignment instead of chained `fillna(inplace=True)`, which does not
        # write back to the frame under pandas Copy-on-Write.
        if lag_cols:
            df_out[lag_cols] = df_out[lag_cols].fillna(0)
        return df_out

    # Lag features are built once per spatial unit that is available in the data.
    if 'admin1' in df.columns: df = engineer_lagged_features(df, 'admin1', lag_windows=HISTORY_LAGS_DAYS)
    if 'grid_cell_id' in df.columns: df = engineer_lagged_features(df, 'grid_cell_id', lag_windows=HISTORY_LAGS_DAYS)

    print("Engineering time since last event features...")
    df.sort_values('event_date', inplace=True) 
    
    if 'admin1' in df.columns:
        # Within each admin1, diff() gives the gap to the previous row in date order.
        # The first row of a group has no predecessor (NaN) and receives the cap value;
        # larger gaps are clipped to the same cap.
        df.sort_values(['admin1', 'event_date'], inplace=True)
        df['days_since_last_event_in_admin1'] = df.groupby('admin1')['event_date'].diff().dt.days.fillna(MAX_DAYS_SINCE_LAST_EVENT).clip(upper=MAX_DAYS_SINCE_LAST_EVENT)
    
    if 'grid_cell_id' in df.columns:
        # Rows carrying the "c_NaN_NaN" placeholder are left out of the per-cell gaps.
        df_temp_grid_ts = df[df['grid_cell_id'] != "c_NaN_NaN"].copy() 
        if not df_temp_grid_ts.empty:
            df_temp_grid_ts.sort_values(['grid_cell_id', 'event_date'], inplace=True)
            df_temp_grid_ts['tsle_grid'] = df_temp_grid_ts.groupby('grid_cell_id')['event_date'].diff().dt.days
            # Joined back on index labels; rows that were filtered out end up NaN here
            # and are filled with the cap value below.
            # NOTE(review): set_index(df.index) re-sets the existing index and looks like a no-op.
            df = df.set_index(df.index).join(df_temp_grid_ts[['tsle_grid']]) 
            df.rename(columns={'tsle_grid': 'days_since_last_event_in_grid_cell'}, inplace=True)
        if 'days_since_last_event_in_grid_cell' not in df.columns: 
            # No row had a usable grid cell: fall back to the cap value everywhere.
            df['days_since_last_event_in_grid_cell'] = MAX_DAYS_SINCE_LAST_EVENT
        df['days_since_last_event_in_grid_cell'] = df['days_since_last_event_in_grid_cell'].fillna(MAX_DAYS_SINCE_LAST_EVENT).clip(upper=MAX_DAYS_SINCE_LAST_EVENT)

    # Restore chronological order and a clean positional index for the later cells.
    df.sort_values('event_date', inplace=True)
    df.reset_index(drop=True, inplace=True)
    print(f"Feature engineering complete. Shape: {df.shape}")
else:
    print("DataFrame is empty. Skipping feature engineering.")

In [None]:
# --- Part 4: Feature Selection & Defining X, y ---
print("\n--- Part 4: Feature Selection & Defining X, y ---")

# Target names and the feature lists are defined unconditionally: later cells read
# `feature_columns` / `TARGET_COL_*` even on the empty-data path, which would
# otherwise fail with a NameError instead of being skipped gracefully.
TARGET_COL_CLASSIFICATION = 'fatality_level'
TARGET_COL_REGRESSION = 'log_fatalities' 

# Candidate columns; only those actually present in df are used.
categorical_features_for_ohe = [
    'event_type', 'sub_event_type', 'admin1', 'grid_cell_id', 'actor1_grouped',
    'year', 'month', 'dayofweek', 'disorder_type', 'interaction', 'source_scale'
]
numerical_features_base = [
    'latitude', 'longitude', 'geo_precision', 'time_precision', 'dayofyear', 'is_post_takeover'
]

categorical_features = []
numerical_features = []
feature_columns = []

if not df.empty:
    # Engineered history columns are picked up by name pattern.
    lag_trend_time_since_features = [col for col in df.columns if ('lag' in col or 'trend' in col or 'days_since_last_event' in col)]

    categorical_features = [col for col in categorical_features_for_ohe if col in df.columns]
    # dict.fromkeys de-duplicates while keeping first-seen order.
    numerical_features = list(dict.fromkeys([col for col in numerical_features_base if col in df.columns] + 
                                             [col for col in lag_trend_time_since_features if col in df.columns]))
    
    feature_columns = list(dict.fromkeys([col for col in (categorical_features + numerical_features) if col in df.columns]))

    # Rows lacking any of these original fields are removed rather than imputed.
    critical_originals_for_nan_check = ['latitude', 'longitude', 'geo_precision', 'time_precision', 'event_type', 'sub_event_type', 'admin1']
    existing_critical_for_nan_check = [col for col in critical_originals_for_nan_check if col in df.columns]
    if existing_critical_for_nan_check: 
        df.dropna(subset=existing_critical_for_nan_check, inplace=True)
        print(f"Dropped rows with NaNs in critical features. Shape: {df.shape}")
    else:
        print("No critical original features found for NaN drop check.")
else:
    print("DataFrame is empty. Skipping feature selection.")

In [None]:
# --- Part 5: Data Splitting ---
print("\n--- Part 5: Data Splitting ---")

if df.empty:
    print("DataFrame is empty. Skipping data splitting.")
    df_train_cv, df_val_es, df_test = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
else:
    # The split is positional, so the frame must be in chronological order first.
    df.sort_values('event_date', inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Sizes: the test set is carved off the end, then validation off the end of the rest.
    n_total = len(df)
    n_test = int(n_total * TEST_SET_FRAC)
    n_dev = n_total - n_test
    n_val = int(n_dev * VALIDATION_SET_FRAC_FROM_DEV)
    n_train_cv = n_dev - n_val

    # Three consecutive, non-overlapping slices: [0, train) [train, dev) [dev, total).
    df_train_cv, df_val_es, df_test = (
        df.iloc[start:stop].copy()
        for start, stop in ((0, n_train_cv), (n_train_cv, n_dev), (n_dev, n_total))
    )

    print(f"Dataset split completed:")
    print(f"  Train_CV set size: {len(df_train_cv)} events")
    print(f"  Validation_ES set size: {len(df_val_es)} events")
    print(f"  Test set size: {len(df_test)} events")

In [None]:
# --- Part 6: Target Variable Encoding ---
print("\n--- Part 6: Target Variable Encoding ---")

le = LabelEncoder()

# Collect the class labels of every non-empty split so the encoder knows all of them.
all_y_labels_list = [
    label
    for split_df in (df_train_cv, df_val_es, df_test)
    if not split_df.empty
    for label in split_df[TARGET_COL_CLASSIFICATION].tolist()
]

if not all_y_labels_list:
    print("Warning: No labels found. Using default classes.")
    le.fit(['none','low','serious']) 
else:
    le.fit(list(set(all_y_labels_list)))

print(f"LabelEncoder classes: {list(le.classes_)} -> {np.arange(len(le.classes_))}")


def _encode_class_target(split_df):
    """Integer-encoded class target for one split, indexed like the split (empty int Series if the split is empty)."""
    if split_df.empty:
        return pd.Series(dtype=int)
    return pd.Series(le.transform(split_df[TARGET_COL_CLASSIFICATION]), index=split_df.index)


def _regression_target(split_df):
    """Regression target column of one split (empty float Series if the split is empty)."""
    if split_df.empty:
        return pd.Series(dtype=float)
    return split_df[TARGET_COL_REGRESSION]


y_train_cv_class_encoded = _encode_class_target(df_train_cv)
y_val_es_class_encoded = _encode_class_target(df_val_es)
y_test_class_encoded = _encode_class_target(df_test)

y_train_cv_reg = _regression_target(df_train_cv)
y_val_es_reg = _regression_target(df_val_es)
y_test_reg = _regression_target(df_test)

print("Target variables encoded/assigned.")

In [None]:
# --- Part 7: Feature Preprocessing ---
print("\n--- Part 7: Feature Preprocessing ---")


def _feature_frame(split_df):
    """Feature columns of one split, or an empty frame with those columns if the split is empty."""
    if split_df.empty:
        return pd.DataFrame(columns=feature_columns)
    return split_df[feature_columns]


def _keep_numeric(col_name):
    """True if the column is in the training features and, when numeric, is not constant."""
    if col_name not in X_train_cv.columns:
        return False
    column = X_train_cv[col_name]
    if pd.api.types.is_numeric_dtype(column):
        return column.nunique(dropna=False) > 1
    return True


X_train_cv = _feature_frame(df_train_cv)
X_val_es = _feature_frame(df_val_es)
X_test = _feature_frame(df_test)

if X_train_cv.empty:
    categorical_features = []
    numerical_features = []
    print("X_train_cv is empty, preprocessor will not be fitted meaningfully.")
else:
    # Keep only columns that exist in the training features (and, for numerics, vary).
    categorical_features = [
        col for col in categorical_features
        if col in X_train_cv.columns and X_train_cv[col].nunique(dropna=False) > 0
    ]
    numerical_features = [col for col in numerical_features if _keep_numeric(col)]

    feature_columns = list(dict.fromkeys(categorical_features + numerical_features))
    X_train_cv = X_train_cv[feature_columns]
    X_val_es = X_val_es[feature_columns]
    X_test = X_test[feature_columns]

# One-hot encode categoricals (unknown categories ignored at transform time,
# infrequent ones pooled via min_frequency) and standardise numerics.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False, min_frequency=0.005), categorical_features),
        ('num', StandardScaler(), numerical_features),
    ],
    remainder='drop',
)

# Checked again on purpose: the column filtering above may have left no features.
if X_train_cv.empty:
    X_train_cv_processed = np.array([])
    X_val_es_processed = np.array([])
    X_test_processed = np.array([])
    processed_feature_names = []
    print("Skipped fitting/transforming due to empty X_train_cv.")
else:
    print("Fitting preprocessor and transforming data...")
    # Fit on the training split only; validation and test are transformed with it.
    X_train_cv_processed = preprocessor.fit_transform(X_train_cv)
    n_processed_features = X_train_cv_processed.shape[1]

    if X_val_es.empty:
        X_val_es_processed = np.array([]).reshape(0, n_processed_features)
    else:
        X_val_es_processed = preprocessor.transform(X_val_es)

    if X_test.empty:
        X_test_processed = np.array([]).reshape(0, n_processed_features)
    else:
        X_test_processed = preprocessor.transform(X_test)

    processed_feature_names = preprocessor.get_feature_names_out()

    print(f"Processed shapes: Train {X_train_cv_processed.shape}, Val {X_val_es_processed.shape}, Test {X_test_processed.shape}")
    print(f"Number of features after OHE/Scaling: {X_train_cv_processed.shape[1]}")

print("Feature preprocessing complete.")

In [None]:
# --- Part 8: Model Training & Evaluation Pipeline Function ---
print("\n--- Part 8: Model Training & Evaluation Pipeline Function Definition ---")

def run_modeling_pipeline(
    X_train_proc, y_train, 
    X_val_proc, y_val, 
    X_test_proc, y_test, 
    model_instance, model_name_prefix, task_type, 
    param_grid=None, cv_splitter=None, primary_scoring_metric=None, 
    feature_names_processed=None, random_state_seed=42
):
    """Optionally tune, then fit, evaluate and summarise a single estimator.

    Flow:
      1. Clone ``model_instance`` and seed it if its ``random_state``/``seed`` is None.
      2. If ``param_grid``, ``cv_splitter`` and ``primary_scoring_metric`` are all
         truthy, run ``GridSearchCV`` and keep its ``best_estimator_``.
      3. If no grid search ran, or it raised, fit a fresh clone directly.
      4. If the test set is a non-empty ndarray with non-empty targets, compute
         test metrics for ``task_type`` ('classification' or 'regression').
      5. Build a feature-importance table from ``feature_importances_`` or ``coef_``
         when its length matches ``feature_names_processed``.

    The validation set is only used as ``eval_set`` when the model passed in has a
    non-None ``early_stopping_rounds`` parameter.

    Returns
    -------
    dict
        Always has 'Model', 'Task', 'CV_Score', 'Best_Params', 'ModelObject' and
        'Importances'. On training failure 'ModelObject' and 'Importances' are
        None. On success 'Importances' is the top-5 rows of the importance table
        or the string "Not Available", and the task's test metrics are added
        when a test set was evaluated.
    """
    
    # Reported name is "<EstimatorClass><model_name_prefix>".
    model_full_name = f"{type(model_instance).__name__}{model_name_prefix}"
    print(f"\n--- Running Pipeline for: {model_full_name} ({task_type}) ---")
    results = {'Model': model_full_name, 'Task': task_type, 'CV_Score': np.nan, 'Best_Params': '{}'}
    best_model = None
    
    # Work on a clone so the caller's estimator instance is never modified.
    current_model_instance = clone(model_instance)
    if hasattr(current_model_instance, 'random_state') and current_model_instance.random_state is None:
        current_model_instance.random_state = random_state_seed
    if hasattr(current_model_instance, 'seed') and current_model_instance.seed is None:
        current_model_instance.seed = random_state_seed

    # A validation set counts as available only if it is a non-empty ndarray with targets.
    val_set_avail = isinstance(X_val_proc, np.ndarray) and X_val_proc.size > 0 and \
                    isinstance(y_val, (pd.Series, np.ndarray)) and len(y_val) > 0
    
    # Early stopping is considered "in use" when the parameter exists and is not None.
    orig_model_uses_es = 'early_stopping_rounds' in model_instance.get_params() and \
                         model_instance.get_params()['early_stopping_rounds'] is not None

    # Extra keyword arguments forwarded to fit() through GridSearchCV.
    fit_params_gscv = {}
    if orig_model_uses_es and val_set_avail:
        fit_params_gscv['eval_set'] = [(X_val_proc, y_val)]
        if type(current_model_instance).__name__.startswith('XGB'): 
            fit_params_gscv['verbose'] = False 
        print("    (Validation set for GridSearchCV early stopping)")
    elif orig_model_uses_es and not val_set_avail:
        # Early stopping without an eval set cannot work, so it is switched off on the clone.
        if hasattr(current_model_instance, 'early_stopping_rounds'): 
            current_model_instance.early_stopping_rounds = None
        print("    (No validation set, ES disabled on clone)")

    if param_grid and cv_splitter and primary_scoring_metric:
        print(f"  Running GridSearchCV for {model_full_name}...")
        gscv = GridSearchCV(
            current_model_instance, param_grid, cv=cv_splitter, 
            scoring=primary_scoring_metric, n_jobs=-1, verbose=0, error_score='raise'
        )
        try: 
            gscv.fit(X_train_proc, y_train, **fit_params_gscv)
            best_model = gscv.best_estimator_
            results['Best_Params'] = str(gscv.best_params_)
            results['CV_Score'] = gscv.best_score_
            print(f"    Best parameters: {results['Best_Params']}")
            print(f"    Best CV ({primary_scoring_metric}): {results['CV_Score']:.4f}")
        except Exception as e: 
            # Any grid-search failure falls through to the direct fit below.
            print(f"    ERROR during GridSearchCV for {model_full_name}: {e}\n    Fallback to direct fit.")
            best_model = None 
    
    if best_model is None: 
        # Reached when no grid search was requested or when it failed.
        print(f"  Training {model_full_name} (direct fit)...")
        fallback_model = clone(model_instance) 
        if hasattr(fallback_model, 'random_state') and fallback_model.random_state is None:
            fallback_model.random_state = random_state_seed
        if hasattr(fallback_model, 'seed') and fallback_model.seed is None:
            fallback_model.seed = random_state_seed
        try:
            if orig_model_uses_es and val_set_avail: 
                fallback_model.fit(X_train_proc, y_train, eval_set=[(X_val_proc, y_val)], verbose=False)
            elif orig_model_uses_es and not val_set_avail: 
                if hasattr(fallback_model, 'early_stopping_rounds'): fallback_model.early_stopping_rounds = None
                fallback_model.fit(X_train_proc, y_train, verbose=False) 
            else: 
                fallback_model.fit(X_train_proc, y_train)
            best_model = fallback_model
        except Exception as e: 
            print(f"    ERROR during direct fit for {model_full_name}: {e}")
            return {**results, 'ModelObject': None, 'Importances': None} 
    
    # NOTE(review): this relies on the estimator's truthiness rather than `is None`.
    if not best_model: 
        print(f"MODEL TRAINING FAILED FOR {model_full_name}. Skipping evaluation.")
        return {**results, 'ModelObject': None, 'Importances': None}

    results['ModelObject'] = best_model

    # Test evaluation only runs for a non-empty ndarray with non-empty targets.
    if isinstance(X_test_proc, np.ndarray) and X_test_proc.size > 0 and \
       isinstance(y_test, (pd.Series, np.ndarray)) and len(y_test) > 0:
        y_pred = best_model.predict(X_test_proc)
        
        if task_type == 'classification':
            results.update({
                'Accuracy': accuracy_score(y_test, y_pred), 
                'F1-Weighted': f1_score(y_test, y_pred, average='weighted', zero_division=0),
                'Precision-W': precision_score(y_test, y_pred, average='weighted', zero_division=0), 
                'Recall-W': recall_score(y_test, y_pred, average='weighted', zero_division=0)
            })
            if hasattr(best_model, "predict_proba"):
                y_prob = best_model.predict_proba(X_test_proc)
                roc_auc = np.nan
                try:
                    unique_labels_test = np.unique(y_test)
                    # Multiclass AUC is computed only when the probability columns match
                    # the number of labels present in y_test; otherwise it stays NaN
                    # unless the problem is binary.
                    if len(unique_labels_test) > 2 and y_prob.shape[1] == len(unique_labels_test): 
                        roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr', average='weighted', labels=unique_labels_test)
                    elif y_prob.shape[1] == 2: 
                        roc_auc = roc_auc_score(y_test, y_prob[:, 1])
                except ValueError as e_roc: 
                    print(f"  ROC AUC error for {model_full_name}: {e_roc}")
                results['ROC_AUC'] = roc_auc
            print(f"    Test Metrics - F1-W: {results.get('F1-Weighted',np.nan):.4f}, Accuracy: {results.get('Accuracy',np.nan):.4f}, ROC_AUC: {results.get('ROC_AUC',np.nan):.4f}")
        
        elif task_type == 'regression':
            results.update({
                'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)), 
                'MAE': mean_absolute_error(y_test, y_pred), 
                'R2': r2_score(y_test, y_pred)
            })
            print(f"    Test Metrics - RMSE: {results.get('RMSE',np.nan):.4f}, R2: {results.get('R2',np.nan):.4f}")
    else: 
        print(f"  Test set empty or invalid for {model_full_name}, skipping test evaluation.")

    fi_df = None
    importances_data = None
    if hasattr(best_model, 'feature_importances_'): 
        importances_data = best_model.feature_importances_
    elif hasattr(best_model, 'coef_'): 
        # 2-D coefficients are reduced to one value per feature via the mean absolute value over rows.
        importances_data = np.mean(np.abs(best_model.coef_), axis=0) if best_model.coef_.ndim > 1 else np.abs(best_model.coef_)
    
    # The table is only built when the importances line up one-to-one with the feature names.
    if importances_data is not None and feature_names_processed is not None and \
       len(importances_data) == len(feature_names_processed):
        fi_df = pd.DataFrame({
            'feature': feature_names_processed,
            'importance': importances_data
        }).sort_values('importance', ascending=False).reset_index(drop=True)
    
    # Only the five most important features are kept in the result.
    results['Importances'] = fi_df.head(5) if fi_df is not None else "Not Available"
    
    print(f"--- Pipeline for {model_full_name} complete. ---")
    return results

In [None]:
# --- Part 9: Model Initialization ---
print("\n--- Part 9: Initializing Models & Grids ---")

# Each entry describes one model run:
#   'name'  - suffix used in the reported model name ("<EstimatorClass>_<name>")
#   'task'  - 'classification' or 'regression'
#   'model' - unfitted estimator
#   'grid'  - GridSearchCV parameter grid, or None for a plain fit
all_models_to_run = []
# Time-ordered CV folds: each validation fold comes after its training fold.
cv_splitter = TimeSeriesSplit(n_splits=N_CV_SPLITS)

# CLASSIFICATION MODELS
# Baselines that ignore the features.
all_models_to_run.append({'name': "Dummy_Frequent", 'task': 'classification', 'model': DummyClassifier(strategy='most_frequent', random_state=seed), 'grid': None})
all_models_to_run.append({'name': "Dummy_Stratified", 'task': 'classification', 'model': DummyClassifier(strategy='stratified', random_state=seed), 'grid': None})

lr_params = {
    'C': [0.1, 1, 10], 
    'penalty': ['l2'], 
    'solver': ['lbfgs'], 
    'class_weight': ['balanced', None], 
    'max_iter': [1000]
}
all_models_to_run.append({'name': "LogisticRegression", 'task': 'classification', 'model': LogisticRegression(random_state=seed), 'grid': lr_params})

dt_params = {
    'max_depth': [5, 10, None], 
    'min_samples_split': [2, 10], 
    'class_weight': ['balanced', None]
}
all_models_to_run.append({'name': "DecisionTree", 'task': 'classification', 'model': DecisionTreeClassifier(random_state=seed), 'grid': dt_params})

rf_gs_params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_leaf': [1, 5],
    'class_weight': ['balanced', None]
}
all_models_to_run.append({'name': "RandomForest_GS", 'task': 'classification', 'model': RandomForestClassifier(random_state=seed, n_jobs=-1), 'grid': rf_gs_params})

# Fixed, regularised forest that is fitted directly (no grid search).
all_models_to_run.append({'name': "RandomForest_Pruned", 'task': 'classification', 'model': RandomForestClassifier(
    n_estimators=100, max_depth=8, min_samples_split=10, min_samples_leaf=4, 
    random_state=seed, n_jobs=-1, class_weight='balanced', max_features='sqrt'
), 'grid': None})

xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8]
}
all_models_to_run.append({'name': "XGBoost", 'task': 'classification', 'model': XGBClassifier(random_state=seed, objective='multi:softprob', eval_metric='mlogloss'), 'grid': xgb_params})

# REGRESSION MODELS
# Baselines that ignore the features.
all_models_to_run.append({'name': "Dummy_Mean_Reg", 'task': 'regression', 'model': DummyRegressor(strategy='mean'), 'grid': None})
all_models_to_run.append({'name': "Dummy_Median_Reg", 'task': 'regression', 'model': DummyRegressor(strategy='median'), 'grid': None})

ridge_params = {'alpha': [0.1, 1.0, 10.0, 100.0]}
all_models_to_run.append({'name': "Ridge_Reg", 'task': 'regression', 'model': Ridge(random_state=seed), 'grid': ridge_params})

lasso_params = {'alpha': [0.001, 0.01, 0.1, 1.0]} 
# The name literal previously opened with " and closed with ', leaving the string
# unterminated and the whole cell a SyntaxError.
all_models_to_run.append({'name': "Lasso_Reg", 'task': 'regression', 'model': Lasso(random_state=seed, max_iter=5000), 'grid': lasso_params}) 

rf_reg_params = {
    'n_estimators': [100, 200], 
    'max_depth': [5, 10, None], 
    'min_samples_leaf': [1, 5]
}
all_models_to_run.append({'name': "RandomForest_Reg", 'task': 'regression', 'model': RandomForestRegressor(random_state=seed, n_jobs=-1), 'grid': rf_reg_params})

xgb_reg_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1]
}
# Same quote fix as for "Lasso_Reg" above.
all_models_to_run.append({'name': "XGBoost_Reg", 'task': 'regression', 'model': XGBRegressor(random_state=seed, objective='reg:squarederror', eval_metric='rmse'), 'grid': xgb_reg_params})

print("All models initialized with their respective parameter grids.")

In [None]:
# --- Part 10: Running Modeling Loop ---
print("\n--- Running Modeling Loop ---")

final_results_list = []

for model_spec in all_models_to_run:
    # Pick the targets and the tuning metric that belong to this model's task.
    if model_spec['task'] == 'classification':
        task_y_train, task_y_val, task_y_test = (
            y_train_cv_class_encoded, y_val_es_class_encoded, y_test_class_encoded)
        primary_metric = PRIMARY_METRIC_CLASSIFICATION
    else:
        task_y_train, task_y_val, task_y_test = y_train_cv_reg, y_val_es_reg, y_test_reg
        primary_metric = PRIMARY_METRIC_REGRESSION

    # Without training rows there is nothing to fit: record a placeholder and move on.
    train_matrix_is_empty = isinstance(X_train_cv_processed, np.ndarray) and X_train_cv_processed.size == 0
    if task_y_train.empty or train_matrix_is_empty:
        print(f"Skipping {model_spec['name']} ({model_spec['task']}) due to empty or invalid training data.")
        final_results_list.append({
            'Model': f"{type(model_spec['model']).__name__}_{model_spec['name']}",
            'Task': model_spec['task'],
            'CV_Score': np.nan,
            'Best_Params': 'Skipped - No Train Data'
        })
        continue

    # The CV splitter and scoring metric are only handed over when there is a grid to search.
    wants_grid_search = bool(model_spec['grid'])
    result = run_modeling_pipeline(
        X_train_cv_processed, task_y_train,
        X_val_es_processed, task_y_val,
        X_test_processed, task_y_test,
        model_spec['model'],
        model_name_prefix=f"_{model_spec['name']}",
        task_type=model_spec['task'],
        param_grid=model_spec['grid'],
        cv_splitter=cv_splitter if wants_grid_search else None,
        primary_scoring_metric=primary_metric if wants_grid_search else None,
        feature_names_processed=processed_feature_names,
        random_state_seed=seed
    )
    final_results_list.append(result)

print("\nAll modeling pipelines completed.")

In [None]:
# --- Part 11: Display Final Results Table ---
print("\n--- Part 11: Display Final Results Table ---")


def _pick_best_model(results_table, task, metric, higher_is_better):
    """Return the fitted model with the best `metric` among rows of `task`.

    Rows without a value for `metric` or without a model object are ignored.
    Returns None when no row qualifies (e.g. every model of that task was
    skipped or failed), so the caller can skip saving instead of crashing.
    Ties are resolved in favour of the first row in table order.
    """
    if metric not in results_table.columns or 'ModelObject' not in results_table.columns:
        return None
    candidates = results_table[
        (results_table['Task'] == task)
        & results_table[metric].notna()
        & results_table['ModelObject'].notna()
    ]
    if candidates.empty:
        return None
    scores = candidates[metric].astype(float)
    best_label = scores.idxmax() if higher_is_better else scores.idxmin()
    return candidates.at[best_label, 'ModelObject']


if final_results_list:
    valid_results = [res for res in final_results_list if isinstance(res, dict) and 'Model' in res]
    
    if valid_results:
        results_df = pd.DataFrame(valid_results)
        
        # Preferred column order first, then whatever else the results contain.
        cols_order = ['Model', 'Task', 'CV_Score', 'Accuracy', 'F1-Weighted', 'Precision-W', 'Recall-W', 'ROC_AUC', 
                      'RMSE', 'MAE', 'R2', 'Best_Params', 'Importances', 'ModelObject']
        leading_cols = [col for col in cols_order if col in results_df.columns]
        trailing_cols = [col for col in results_df.columns if col not in leading_cols]
        results_df = results_df[leading_cols + trailing_cols]

        # Sort by task, then best-first on each task's headline test metric
        # (only on metrics that were actually computed).
        sort_by_cols = ['Task'] if 'Task' in results_df.columns else []
        sort_ascending = [True] * len(sort_by_cols)
        for metric_col, ascending in (('F1-Weighted', False), ('RMSE', True)):
            if metric_col in results_df.columns and results_df[metric_col].notna().any():
                sort_by_cols.append(metric_col)
                sort_ascending.append(ascending)

        if sort_by_cols: 
            results_df = results_df.sort_values(by=sort_by_cols, ascending=sort_ascending).reset_index(drop=True)
        
        print("\nModel Performance Summary:")
        with pd.option_context('display.max_colwidth', 100): 
            display(results_df.drop(columns=['ModelObject'], errors='ignore')) 

        results_csv_path = os.path.join(RESULTS_DIR, 'all_models_full_dataset_summary.csv')
        results_df.to_csv(results_csv_path, index=False)
        print(f"\nResults summary saved to: {results_csv_path}")

        # Best model per task by test metric; None when the task has no evaluated model.
        best_clf_model = _pick_best_model(results_df, 'classification', 'F1-Weighted', higher_is_better=True)
        best_reg_model = _pick_best_model(results_df, 'regression', 'RMSE', higher_is_better=False)

        for _task_label, _best_model, _file_name in (
            ('classification', best_clf_model, 'best_classification_model.pkl'),
            ('regression', best_reg_model, 'best_regression_model.pkl'),
        ):
            if _best_model is None:
                print(f"No evaluated {_task_label} model available; nothing saved.")
                continue
            _model_path = os.path.join(RESULTS_DIR, _file_name)
            joblib.dump(_best_model, _model_path)
            print(f"Best {_task_label} model saved to: {_model_path}")

    else:
        print("No valid model results to display or save after filtering.")
else:
    print("No model results generated.")

print("\n--- Model Training & Evaluation Complete ---")

In [None]:
# --- Part 12: Feature Importance Analysis Helpers ---
print("\n--- Part 12: Feature Importance Analysis Helpers ---")

def plot_feature_importances(importance_df, model_name_title, top_n=15, plot_dir=None):
    """Draw a horizontal bar chart of the most important features.

    Parameters
    ----------
    importance_df : pandas.DataFrame
        Needs 'feature' and 'importance' columns, already sorted best-first.
        Anything that is not a non-empty DataFrame (None, the "Not Available"
        marker string, an empty frame) prints a notice and returns.
    model_name_title : str
        Used in the plot title and, sanitised, in the output file name.
    top_n : int, default 15
        Maximum number of features to show.
    plot_dir : str | None
        If given, the figure is also saved there as
        ``feature_importances_<model>.png``; save errors are reported, not raised.
    """
    # The type check must come first: a string has no `.empty` attribute.
    if not isinstance(importance_df, pd.DataFrame) or importance_df.empty:
        print(f"No valid feature importances DataFrame available for {model_name_title}.")
        return
    
    top_features = importance_df.head(top_n)
    # The frame may hold fewer than top_n rows; size and title follow what is actually drawn.
    n_shown = len(top_features)
    plt.figure(figsize=(10, max(6, n_shown * 0.45))) 
    sns.barplot(x='importance', y='feature', data=top_features, palette='viridis')
    plt.title(f'Top {n_shown} Feature Importances for {model_name_title}', fontsize=15)
    plt.xlabel('Importance Score', fontsize=12)
    plt.ylabel('Feature', fontsize=12)
    plt.yticks(fontsize=10)
    plt.xticks(fontsize=10)
    plt.tight_layout()
    if plot_dir: 
        try:
            # Keep only alphanumerics so the model name is safe as a file name.
            safe_model_name = "".join(c if c.isalnum() else "_" for c in model_name_title) 
            plot_path = os.path.join(plot_dir, f'feature_importances_{safe_model_name}.png')
            plt.savefig(plot_path)
            print(f"Plot saved to: {plot_path}")
        except Exception as e:
            print(f"Error saving plot for {model_name_title}: {e}")
    plt.show()

def analyze_model_importances(model_name_to_find, task_type_filter, results_list, top_n_display=15, plots_directory=None):
    """Show and plot the stored feature importances of one model run.

    The first dict in ``results_list`` whose 'Task' equals ``task_type_filter``
    and whose 'Model' equals ``model_name_to_find`` is used. If its
    'Importances' entry is a non-empty DataFrame, up to ``top_n_display`` rows
    are displayed and plotted (saved under ``plots_directory`` when given);
    otherwise a short explanatory message is printed. Always returns None.
    """
    if not results_list:
        print(f"Warning: Results list is empty for {model_name_to_find}.")
        return

    # First matching entry, or None when nothing matches.
    matched_entry = next(
        (
            entry for entry in results_list
            if isinstance(entry, dict)
            and entry.get('Task') == task_type_filter
            and model_name_to_find == entry.get('Model')
        ),
        None,
    )

    if not matched_entry:
        print(f"\nCould not find results for '{model_name_to_find}'.")
        return

    model_full_name = matched_entry.get('Model', model_name_to_find)
    importances_data = matched_entry.get('Importances')

    has_table = isinstance(importances_data, pd.DataFrame) and not importances_data.empty
    if has_table:
        print(f"\n--- Feature Importances for: {model_full_name} ---")
        display(importances_data.head(top_n_display))
        plot_feature_importances(importances_data, model_full_name, top_n=top_n_display, plot_dir=plots_directory)
        return

    # No usable table: distinguish the explicit marker from any other unexpected value.
    if isinstance(importances_data, str) and importances_data == "Not Available":
        print(f"\nFeature importances 'Not Available' for {model_full_name}.")
    else:
        print(f"\nNo valid feature importances DataFrame for {model_full_name}.")

print("Feature importance helper functions defined.")

In [None]:
# --- Part 13: Feature Importance Analysis Execution ---
print("\n--- Part 13: Executing Feature Importance Analysis ---")

# (reported model name, task) pairs to inspect, in display order.
for _model_name, _task_type in (
    ("XGBClassifier_XGBoost", 'classification'),
    ("XGBRegressor_XGBoost_Reg", 'regression'),
):
    analyze_model_importances(
        model_name_to_find=_model_name,
        task_type_filter=_task_type,
        results_list=final_results_list,
        plots_directory=PLOTS_DIR
    )

print("\n--- Feature Importance Analysis Complete ---")