# 02 - Training Pipeline (End-to-End)


In [None]:
# Environment bootstrap: install the project's requirements into the interpreter
# that is running this notebook. This is best-effort -- a failure is printed
# rather than raised, so the notebook can continue where the packages are
# already available (e.g. on Kaggle).
import subprocess
import sys

req = '../requirements.txt'
pip_command = [sys.executable, '-m', 'pip', 'install', '-q', '-r', req]
try:
    subprocess.check_call(pip_command)
except Exception as err:
    # Deliberately broad: any install problem is reported, never fatal.
    print('Install skipped or failed:', err)


In [None]:
import os
import json
from pathlib import Path
import pandas as pd
import yaml

# All project locations are expressed relative to the parent of the current
# working directory, so the notebook must be run from a direct subfolder of
# the project root.
PROJECT_ROOT = Path('..')
CONFIG_PATH = PROJECT_ROOT / 'configs' / 'baseline_lgbm.yaml'
TRAIN_PATH = PROJECT_ROOT / 'data' / 'train.csv'
TEST_PATH = PROJECT_ROOT / 'data' / 'test.csv'

# Read the config with an explicit encoding: without it, open() decodes with
# the platform locale, which can corrupt non-ASCII values on some systems.
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# An empty YAML document loads as None; fail here with a clear message instead
# of an opaque TypeError when a later cell indexes into the config.
if not isinstance(config, dict):
    raise ValueError(
        f"Expected a mapping at the top level of {CONFIG_PATH}, "
        f"got {type(config).__name__}"
    )
config


In [None]:
# Data Loading & Preprocessing
import sys

# Make the project's `src` modules importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
src_dir = str(PROJECT_ROOT / 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

# NOTE(review): save_label_mapping is imported but the mapping is written
# inline below; its signature is not visible here -- confirm whether it should
# be used instead.
from preprocess import load_and_clean_data, save_label_mapping

# Both files go through the same loader with the same config. Only the label
# mapping returned for the training file is kept; the one returned for the
# test file is discarded.
train_df, label_map = load_and_clean_data(str(TRAIN_PATH), config)
test_df, _ = load_and_clean_data(str(TEST_PATH), config)

# Save label mapping for inference
label_map_path = (PROJECT_ROOT / config['paths']['label_mapping_path'])
label_map_path.parent.mkdir(parents=True, exist_ok=True)
# Explicit encoding so the written file does not depend on the platform locale.
with open(label_map_path, 'w', encoding='utf-8') as f:
    json.dump(label_map, f, indent=2)
train_df.head()


In [None]:
# Feature Engineering + Training (CV)
# The whole step is delegated to train_model (defined in train.py, not shown
# here). Its result is unpacked into a score and a frame; the printed label
# below describes the score as out-of-fold macro F1.
from train import train_model

cv_outputs = train_model(train_df, config)
oof_score, oof_df = cv_outputs
print('OOF Macro F1:', oof_score)

# Preview the first rows of the returned frame.
oof_df.head()


In [None]:
# Test-set inference: generate_predictions receives the cleaned test frame,
# the configured model directory and the full config, and returns the
# submission table previewed below.
from predict import generate_predictions

# NOTE(review): this path is passed exactly as configured, whereas other cells
# join config paths onto PROJECT_ROOT -- confirm generate_predictions resolves
# it against the project root itself.
model_output_dir = config['paths']['model_output_dir']
submission = generate_predictions(test_df, model_output_dir, config)
submission.head()


## Explainability (SHAP)


In [None]:
# Demonstration with one fold model: load a trained fold, rebuild the feature
# matrix for a small test sample, and draw a SHAP beeswarm plot.
import joblib
import shap
from scipy import sparse
from scipy import sparse as sp  # second alias kept: the original cell exported both names
from features import create_features, transform_tfidf

model_dir = PROJECT_ROOT / config['paths']['model_output_dir']
# Sort the matches so the same fold is picked on every run (glob order is
# filesystem-dependent), and fail with a descriptive error rather than a bare
# StopIteration when no model has been trained yet.
model_paths = sorted(model_dir.glob('model_fold_*.pkl'))
if not model_paths:
    raise FileNotFoundError(
        f"No 'model_fold_*.pkl' files found in {model_dir}; train the models first."
    )
model_path = model_paths[0]
clf = joblib.load(model_path)
# The fold index is the trailing '_<n>' component of the file stem.
fold = int(model_path.stem.split('_')[-1])

# Build a small sample feature matrix for SHAP (at most 100 rows, fixed seed).
sample = test_df.sample(100, random_state=42) if len(test_df) > 100 else test_df.copy()
sample_feat, art, feat_cols = create_features(sample, config=config, is_train=False, fold=fold)

# `or` fallbacks also cover keys that are present in the YAML but set to null.
tfidf_cfg = config['features'].get('tfidf') or {}
tfidf_cols = tfidf_cfg.get('use_text_columns') or []
tfidf_prefix = config['paths'].get('vectorizer_prefix')
if tfidf_cols and tfidf_prefix:
    # Only the first configured text column is vectorized here, using the
    # vectorizer file that matches the selected fold.
    vectorizer_path = PROJECT_ROOT / f"{tfidf_prefix}_{fold}.pkl"
    tfidf_matrix = transform_tfidf(sample_feat[tfidf_cols[0]], str(vectorizer_path))
else:
    tfidf_matrix = None

# Tabular features as a sparse block (NaNs replaced by 0.0), computed once and
# optionally extended with the TF-IDF columns on the right.
dense_block = sparse.csr_matrix(sample_feat[feat_cols].astype(float).fillna(0.0).values)
if tfidf_matrix is not None:
    X_sample = sparse.hstack([dense_block, tfidf_matrix], format='csr')
else:
    X_sample = dense_block

# Use the tree explainer on the underlying booster when the estimator exposes
# one; otherwise let SHAP choose an explainer for the estimator itself.
explainer = shap.TreeExplainer(clf.booster_) if hasattr(clf, 'booster_') else shap.Explainer(clf)
# Explain only the first 50 rows of the sample.
shap_values = explainer(X_sample[:50])
shap.plots.beeswarm(shap_values, max_display=15)
