# 02 - Training Pipeline (End-to-End)


In [1]:
# Setup: ensure dependencies are installed (skip on Kaggle if already present)
# Best effort: a failed install is reported but never aborts the notebook.
import sys, subprocess
req = '../requirements.txt'
try:
    # Capture pip's output instead of using check_call: the exception raised by
    # check_call only carries the exit status, so the actual reason for the
    # failure (missing file, resolver conflict, no network, ...) was never shown.
    pip_run = subprocess.run(
        [sys.executable, '-m', 'pip', 'install', '-q', '-r', req],
        capture_output=True,
        text=True,
    )
except (OSError, subprocess.SubprocessError) as e:
    # The interpreter could not be launched at all.
    print('Install skipped or failed:', e)
else:
    if pip_run.returncode != 0:
        print('Install skipped or failed: pip exited with status', pip_run.returncode)
        # pip writes its diagnostics to stderr; fall back to stdout if stderr is empty.
        print((pip_run.stderr or pip_run.stdout or '').strip())


Install skipped or failed: Command '['g:\\anaconda3\\envs\\DS_DA\\python.exe', '-m', 'pip', 'install', '-q', '-r', '../requirements.txt']' returned non-zero exit status 1.


In [2]:
import os
import json
from pathlib import Path
import pandas as pd
import yaml

# Project layout, expressed relative to the notebook's working directory
# (assumes the notebook runs from a folder one level below the project root).
PROJECT_ROOT = Path('..')
CONFIG_PATH = PROJECT_ROOT / 'configs' / 'baseline_lgbm.yaml'
TRAIN_PATH = PROJECT_ROOT / 'data' / 'train.csv'
TEST_PATH = PROJECT_ROOT / 'data' / 'test.csv'

# Decode the YAML explicitly as UTF-8: without `encoding`, open() uses the
# platform's locale encoding, which can mis-decode non-ASCII characters in the
# config on Windows.
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)
config


{'paths': {'train_csv': 'data/train.csv',
  'test_csv': 'data/test.csv',
  'model_output_dir': 'models/baseline_lgbm',
  'encoders_dir': 'models/baseline_lgbm/encoders',
  'vectorizer_prefix': 'models/baseline_lgbm/tfidf_vectorizer_fold',
  'feature_columns_path': 'models/baseline_lgbm/feature_columns.json',
  'label_mapping_path': 'models/baseline_lgbm/label_mapping.json',
  'oof_predictions_path': 'models/baseline_lgbm/oof_predictions.csv',
  'submission_path': 'submissions/submission.csv'},
 'columns': {'id': 'ID',
  'target': 'Target',
  'group': 'user_id',
  'datetime_columns': ['time_observed', 'prediction_time'],
  'text_columns': ['indicator_description', 'indicator'],
  'categorical_columns': ['user_id', 'community', 'district', 'indicator'],
  'numerical_columns': ['confidence']},
 'preprocessing': {'datetime_format': 'infer',
  'timezone': None,
  'text_clean': {'lowercase': True,
   'strip_punctuation': True,
   'normalize_whitespace': True,
   'fillna_token': '<missing>'}}

In [3]:
# Data Loading & Preprocessing
import sys

# Make the project's `src` modules importable. The membership check keeps the
# cell idempotent: re-running it no longer appends the same entry again.
src_dir = str(PROJECT_ROOT / 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

from preprocess import load_and_clean_data, save_label_mapping

# load_and_clean_data returns a (frame, label mapping) pair. Only the mapping
# from the training data is kept; the second value for the test set is ignored.
train_df, label_map = load_and_clean_data(str(TRAIN_PATH), config)
# Use a named throwaway rather than `_`, which IPython uses for the last output.
test_df, _test_label_map = load_and_clean_data(str(TEST_PATH), config)

# Save label mapping for inference
label_map_path = (PROJECT_ROOT / config['paths']['label_mapping_path'])
label_map_path.parent.mkdir(parents=True, exist_ok=True)
with open(label_map_path, 'w', encoding='utf-8') as f:
    json.dump(label_map, f, indent=2)
train_df.head()


  df[col] = pd.to_datetime(df[col], errors="coerce", utc=True)
  df[col] = pd.to_datetime(df[col], errors="coerce", utc=True)


Unnamed: 0,ID,user_id,confidence,predicted_intensity,community,district,prediction_time,indicator,indicator_description,time_observed,Target,forecast_length,Target_label
0,ID_KwcTp_12,11,0.3,0.0,Tumfa,atiwa_west,2025-05-30 11:09:33+00:00,<missing>,<missing>,NaT,MEDIUMRAIN,12,1
1,ID_K9vWT_12,17,0.3,0.0,Kwabeng,atiwa_west,2025-05-30 11:09:35+00:00,<missing>,<missing>,NaT,HEAVYRAIN,12,0
2,ID_AIQg3_12,19,0.3,0.0,Akropong,atiwa_west,2025-05-30 11:09:47+00:00,<missing>,<missing>,NaT,MEDIUMRAIN,12,1
3,ID_px4yf_12,23,0.3,0.0,Asamama,atiwa_west,2025-05-30 11:16:33+00:00,<missing>,<missing>,NaT,HEAVYRAIN,12,0
4,ID_QYYmK_12,23,0.3,0.0,Asamama,atiwa_west,2025-05-30 11:16:55+00:00,<missing>,<missing>,NaT,HEAVYRAIN,12,0


In [6]:
# Feature Engineering + Training (CV)
from train import train_model

# train_model hands back a pair: the overall out-of-fold score and a frame of
# out-of-fold predictions (ID, Target_label, pred_label in the recorded output).
cv_output = train_model(train_df, config)
oof_score, oof_df = cv_output

print('OOF Macro F1:', oof_score)

# Preview the first rows of the out-of-fold predictions.
oof_df.head(5)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001457 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 177
[LightGBM] [Info] Number of data points in the train set: 9457, number of used features: 25
[LightGBM] [Info] Start training from score -3.401938
[LightGBM] [Info] Start training from score -2.521192
[LightGBM] [Info] Start training from score -0.148369
[LightGBM] [Info] Start training from score -3.720788
[100]	valid_0's multi_logloss: 0.178076
Fold 0 Macro F1: 0.33197
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003810 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 149
[LightGBM] [Info] Number of data points in the train set: 10176, number of used features: 17
[LightGBM] [Info

Unnamed: 0,ID,Target_label,pred_label
0,ID_KwcTp_12,1,2
1,ID_K9vWT_12,0,2
2,ID_AIQg3_12,1,2
3,ID_px4yf_12,0,2
4,ID_QYYmK_12,0,2


In [7]:
# Inference on test and submission generation
from predict import generate_predictions

# NOTE(review): this directory is taken verbatim from the config and is not
# joined with PROJECT_ROOT, unlike other path lookups in this notebook.
# Confirm that generate_predictions resolves it as intended.
model_output_dir = config['paths']['model_output_dir']

submission = generate_predictions(test_df, model_output_dir, config)

# Preview the first rows of the submission frame.
submission.head(5)


Unnamed: 0,ID,Target
0,ID_SbTdy_24,2
1,ID_SBKYz_24,2
2,ID_fAimg_24,2
3,ID_2wBqC_24,2
4,ID_NItox_24,2


## Explainability (SHAP)


In [8]:
# Demonstration with one fold model
#
# NOTE: the recorded "No module named 'matplotlib.colorbar'" error indicates a
# broken or incomplete matplotlib install in the kernel environment. It cannot
# be fixed from this cell; reinstall matplotlib and restart the kernel.
import joblib
import shap
from scipy import sparse
from scipy import sparse as sp  # alias kept: the original cell exposed both names
from features import create_features, transform_tfidf

SHAP_SAMPLE_ROWS = 100   # rows drawn from the test set to build features for
SHAP_EXPLAIN_ROWS = 50   # rows actually passed to the explainer
SHAP_MAX_DISPLAY = 15    # features shown in the beeswarm plot
SHAP_CLASS_INDEX = 0     # encoded class to plot when the model is multiclass
                         # (see label_mapping.json for the encoding)

model_dir = PROJECT_ROOT / config['paths']['model_output_dir']
# Sort the matches so the same fold is picked on every run (glob order is
# arbitrary), and fail with a readable message instead of a bare StopIteration.
model_paths = sorted(model_dir.glob('model_fold_*.pkl'))
if not model_paths:
    raise FileNotFoundError(
        f"No 'model_fold_*.pkl' files found in {model_dir}; run the training cell first."
    )
model_path = model_paths[0]
# joblib.load unpickles the file: only load model artifacts you produced yourself.
clf = joblib.load(model_path)
# The fold index is the trailing number of the file name (model_fold_<k>.pkl).
fold = int(model_path.stem.split('_')[-1])

# Build a small sample feature matrix for SHAP
sample = (
    test_df.sample(SHAP_SAMPLE_ROWS, random_state=42)
    if len(test_df) > SHAP_SAMPLE_ROWS
    else test_df.copy()
)
sample_feat, art, feat_cols = create_features(sample, config=config, is_train=False, fold=fold)
tfidf_cfg = config['features'].get('tfidf', {})
tfidf_cols = tfidf_cfg.get('use_text_columns', [])
tfidf_prefix = config['paths'].get('vectorizer_prefix')
if len(tfidf_cols) > 0 and tfidf_prefix:
    # Only the first configured text column is vectorised here.
    vectorizer_path = PROJECT_ROOT / f"{tfidf_prefix}_{fold}.pkl"
    tfidf_matrix = transform_tfidf(sample_feat[tfidf_cols[0]], str(vectorizer_path))
else:
    tfidf_matrix = None

# Tabular features first, then (optionally) the TF-IDF columns.
tabular_block = sparse.csr_matrix(sample_feat[feat_cols].astype(float).fillna(0.0).values)
if tfidf_matrix is not None:
    X_sample = sparse.hstack([tabular_block, tfidf_matrix], format='csr')
else:
    X_sample = tabular_block

# Hand SHAP a small dense, labelled frame: the slice is only a few dozen rows,
# and column names make the plot readable. TF-IDF columns get positional names
# because the vectoriser's vocabulary is not available in this cell.
n_text_features = X_sample.shape[1] - len(feat_cols)
feature_names = list(feat_cols) + [f"tfidf_{i}" for i in range(n_text_features)]
X_explain = pd.DataFrame(X_sample[:SHAP_EXPLAIN_ROWS].toarray(), columns=feature_names)

explainer = shap.TreeExplainer(clf.booster_) if hasattr(clf, 'booster_') else shap.Explainer(clf)
shap_values = explainer(X_explain)

# A multiclass model yields a 3-D explanation (rows x features x classes), but
# the beeswarm plot accepts only 2-D input, so select a single class to plot.
if len(shap_values.shape) == 3:
    shap_values_to_plot = shap_values[:, :, SHAP_CLASS_INDEX]
else:
    shap_values_to_plot = shap_values
shap.plots.beeswarm(shap_values_to_plot, max_display=SHAP_MAX_DISPLAY)


  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'matplotlib.colorbar'