In [None]:
import os, gc, random, inspect
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import xgboost as xgb

In [None]:

# =========================================================
# Submission-only pipeline for Kaggle:
# - Clean labels
# - Robust Feature Engineering (fit on train, apply to test)
# - Version-agnostic sparse OneHot
# - XGBoost (DMatrix API) with early stopping on a small validation slice
# - Predict ONLY test.csv and save 'submission.csv' as id,exam_score
# =========================================================



# ---------------------------
# Reproducibility
# ---------------------------
def seed_everything(seed: int = 42):
    """Seed the Python, NumPy and (when importable) PyTorch RNGs.

    Args:
        seed: Integer seed handed to every generator.

    The PyTorch part is best-effort: any exception raised while importing
    or configuring torch is swallowed so the pipeline runs without it.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    try:
        import torch

        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # Set the cuDNN determinism flags last, after both torch seeds.
        cudnn = torch.backends.cudnn
        cudnn.deterministic = True
        cudnn.benchmark = False
    except Exception:
        pass

# Seed once at module load so every step below runs from a fixed RNG state.
seed_everything(42)

# ---------------------------
# Utilities
# ---------------------------
def stratify_bins(y, n_bins=10):
    """Bucket a continuous target into integer bin codes for stratification.

    Args:
        y: Array-like of target values. Converted to float; negative values
            are clipped to 0 before binning.
        n_bins: Requested number of bins.

    Returns:
        pandas Series of integer bin codes. Quantile bins (duplicate edges
        dropped) are tried first; if that raises, equal-width bins are used.
        Values that end up without a bin are assigned code 0.
    """
    target = pd.Series(y).astype(float).clip(lower=0)

    def _bin_codes():
        # Quantile binning first; equal-width binning only as a fallback.
        try:
            return pd.qcut(target, q=n_bins, labels=False, duplicates='drop')
        except Exception:
            return pd.cut(target, bins=n_bins, labels=False)

    codes = _bin_codes().fillna(0)
    return codes.astype(int)

def gpu_available():
    """Report whether the NVIDIA driver's procfs version file exists.

    Returns:
        bool: True when ``/proc/driver/nvidia/version`` is present on this
        machine, False otherwise.
    """
    nvidia_version_file = "/proc/driver/nvidia/version"
    driver_present = os.path.exists(nvidia_version_file)
    return driver_present

def save_submission_strict(df_test, id_col, preds, path="submission.csv", clip_min=0.0, clip_max=None):
    """Write a submission CSV with the columns ``id`` and ``exam_score``.

    Args:
        df_test: Test DataFrame; its row order defines the submission order.
        id_col: Name of the identifier column in ``df_test``.
        preds: One prediction per row of ``df_test``, in the same order.
        path: Destination CSV path.
        clip_min: Lower bound applied to the predictions, or None to skip.
        clip_max: Upper bound applied to the predictions, or None to skip.

    Raises:
        AssertionError: If the number of predictions differs from the number
            of test rows, or the written ids do not match ``df_test``.
    """
    preds = np.asarray(preds, dtype=np.float64)
    # ensure length & order exactly match df_test
    assert len(preds) == len(df_test), f"Preds ({len(preds)}) != test rows ({len(df_test)})"

    # Repair every non-finite prediction (NaN and +/-inf), not only NaN:
    # an infinite value would otherwise reach the CSV and would also poison
    # the mean used as the replacement value.
    finite = np.isfinite(preds)
    if not finite.all():
        if finite.any():
            fill = np.nanmean(np.where(finite, preds, np.nan))
        else:
            # Nothing usable to average: fall back to the lower bound (or 0).
            fill = 0.0 if clip_min is None else float(clip_min)
        preds = np.where(finite, preds, fill)

    if clip_min is not None:
        preds = np.maximum(preds, clip_min)
    if clip_max is not None:
        preds = np.minimum(preds, clip_max)

    sub = pd.DataFrame({"id": df_test[id_col].values, "exam_score": preds})
    # hard checks
    assert sub.shape[0] == df_test.shape[0], "Row count mismatch!"
    assert np.array_equal(sub["id"].values, df_test[id_col].values), "ID order mismatch!"

    sub.to_csv(path, index=False)
    print(f"✅ Saved {path} with {len(sub)} rows.")
    print(sub.head())

# ---------------------------
# Robust Feature Engineering (fit/transform)
# ---------------------------
class RobustAcademicFE:
    """Fit/transform feature engineering for the exam-score tables.

    ``transform`` adds ordinal encodings, two interaction terms, an internet
    flag and missingness flags, then drops numeric columns that were entirely
    NaN at fit time and clips the key numeric columns to the quantile bounds
    learned at fit time.

    Args:
        clip_quantiles: ``(low, high)`` quantiles used to learn clip bounds.
    """

    # Normalised ``internet_access`` values that mean "no internet".
    _NO_INTERNET = frozenset({"no", "none", "null", "nan", ""})
    # Raw numeric columns that get missingness flags and quantile clipping.
    _KEY_NUMERICS = ("study_hours", "class_attendance", "sleep_hours")

    def __init__(self, clip_quantiles=(0.01, 0.99)):
        self.clip_quantiles = clip_quantiles
        self._num_clip_bounds = {}
        self._drop_all_nan_cols = []
        self.sleep_map = {"very poor":1,"poor":2,"fair":3,"average":3,"okay":3,"good":4,"very good":5,"excellent":6}
        self.exam_diff_map = {"very easy":1,"easy":2,"medium":3,"hard":4,"very hard":5}
        self.facility_map  = {"very bad":1,"bad":2,"average":3,"good":4,"very good":5,"excellent":6}
        self._synonyms = {"ok":"okay","vg":"very good","v good":"very good","v bad":"very bad"}

    def _n(self, x):
        """Return ``x`` as a stripped, lower-cased string, or NaN if missing."""
        if pd.isna(x):
            return np.nan
        return str(x).strip().lower()

    def _map_ordinal(self, s, base_map, numeric_ok=True):
        """Map a Series of rating labels to numbers.

        Args:
            s: Series of raw labels.
            base_map: Mapping from normalised label to ordinal value.
            numeric_ok: When True, labels that parse as a finite number are
                used directly.

        Returns:
            Series with the mapped values; unknown or missing labels are NaN.
        """
        def conv(v):
            vn = self._n(v)
            if not isinstance(vn, str):  # _n returns NaN for missing input
                return np.nan
            vn = self._synonyms.get(vn, vn)
            if numeric_ok:
                try:
                    parsed = float(vn)
                except ValueError:
                    parsed = None
                # Reject "inf"/"nan" spellings: they are not usable ratings.
                if parsed is not None and np.isfinite(parsed):
                    return parsed
            return base_map.get(vn, np.nan)
        return s.map(conv)

    def _internet_flag(self, v):
        """Return 0 for a missing or negative ``internet_access`` value, else 1."""
        vn = self._n(v)
        # A truly missing cell is normalised to the float NaN, which can never
        # equal the "nan"/"null" string tokens, so it is handled explicitly.
        if not isinstance(vn, str):
            return 0
        return 0 if vn in self._NO_INTERNET else 1

    def _engineer(self, df):
        """Return a copy of ``df`` with the stateless derived columns added."""
        df = df.copy()
        # Ordinals
        if "sleep_quality" in df:
            df["sleep_quality_ord"] = self._map_ordinal(df["sleep_quality"], self.sleep_map)
        if "exam_difficulty" in df:
            df["exam_difficulty_ord"] = self._map_ordinal(df["exam_difficulty"], self.exam_diff_map)
        if "facility_rating" in df:
            df["facility_rating_ord"] = self._map_ordinal(df["facility_rating"], self.facility_map)
        # Interactions
        if "study_hours" in df and "class_attendance" in df:
            df["study_x_attendance"] = df["study_hours"] * df["class_attendance"]
        if "sleep_hours" in df and "sleep_quality_ord" in df:
            df["sleep_effective"] = df["sleep_hours"] * df["sleep_quality_ord"]
        # Internet flag
        if "internet_access" in df:
            df["has_internet"] = df["internet_access"].map(self._internet_flag)
        # Missingness flags
        for col in self._KEY_NUMERICS:
            if col in df:
                df[f"isna_{col}"] = df[col].isna().astype(int)
        return df

    def fit(self, X, y=None):
        """Learn the all-NaN numeric columns and the clip bounds from ``X``.

        Args:
            X: Training DataFrame of raw features.
            y: Ignored; accepted for a scikit-learn-like signature.

        Returns:
            self
        """
        Xb = self._engineer(X)
        num_cols = Xb.select_dtypes(include=["number"]).columns
        self._drop_all_nan_cols = [c for c in num_cols if Xb[c].notna().sum() == 0]
        # Start from a clean slate so a refit on different columns does not
        # keep bounds learned from an earlier dataset.
        self._num_clip_bounds = {}
        # learn clipping on key numerics
        ql, qh = self.clip_quantiles
        for col in self._KEY_NUMERICS:
            if col in Xb:
                low, high = Xb[col].quantile([ql, qh])
                self._num_clip_bounds[col] = (float(low), float(high))
        return self

    def transform(self, X):
        """Engineer features on ``X`` and apply the fitted drops and clips.

        Args:
            X: DataFrame of raw features.

        Returns:
            New DataFrame; ``X`` itself is not modified.
        """
        Xb = self._engineer(X)
        Xb = Xb.drop(columns=self._drop_all_nan_cols, errors="ignore")
        for col, (low, high) in self._num_clip_bounds.items():
            if col in Xb:
                Xb[col] = Xb[col].clip(lower=low, upper=high)
        return Xb

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the transformed ``X``."""
        return self.fit(X, y).transform(X)

# ---------------------------
# Load data
# ---------------------------
# Input files are expected in the current working directory.
train_path = "train.csv"
test_path  = "test.csv"
assert os.path.exists(train_path) and os.path.exists(test_path), "Put train.csv and test.csv in the working directory."

df_train = pd.read_csv(train_path)
df_test  = pd.read_csv(test_path)

TARGET = "exam_score"
ID_COL = "id"
EXCLUDE = {TARGET, ID_COL, "Transported"}  # exclude id, target, unrelated columns

assert TARGET in df_train.columns, f"{TARGET} missing in train.csv"
assert ID_COL in df_test.columns,  f"{ID_COL} missing in test.csv"


# Label cleaning: coerce the target to numeric and drop rows whose label is
# missing, non-finite or absurdly large.
df_train[TARGET] = pd.to_numeric(df_train[TARGET], errors="coerce")
mask = df_train[TARGET].notna() & np.isfinite(df_train[TARGET].values) & (df_train[TARGET].abs() < 1e9)
dropped = (~mask).sum()
if dropped > 0:
    print(f"[Label clean] Dropped {dropped} train rows with NaN/Inf/unreasonable {TARGET}.")
df_train = df_train.loc[mask].reset_index(drop=True)


# Use only features present in BOTH files. Selecting a train-only column from
# df_test would raise KeyError before the column intersection further down
# ever gets a chance to reconcile the two frames.
candidate_cols = [c for c in df_train.columns if c not in EXCLUDE]
train_only_cols = [c for c in candidate_cols if c not in df_test.columns]
if train_only_cols:
    print(f"[Features] Ignoring {len(train_only_cols)} train-only column(s): {train_only_cols}")
feature_cols = [c for c in candidate_cols if c in df_test.columns]
X_full_raw = df_train[feature_cols].copy()
y_full     = df_train[TARGET].astype(float).values
X_test_raw = df_test[feature_cols].copy()


# Fit the feature engineering on train only, then apply it to test.
fe = RobustAcademicFE()
X_full_fe = fe.fit_transform(X_full_raw)
X_test_fe = fe.transform(X_test_raw)


# Keep the engineered columns both frames share, in a stable sorted order.
common_cols = sorted(set(X_full_fe.columns) & set(X_test_fe.columns))
X_full_fe = X_full_fe[common_cols].copy()
X_test_fe = X_test_fe[common_cols].copy()

print(f"Train shape (after FE): {X_full_fe.shape}, Test shape (after FE): {X_test_fe.shape}")


def make_ohe():
    """Build a sparse ``OneHotEncoder`` that ignores unknown categories.

    The keyword that requests sparse output is picked by inspecting the
    encoder's constructor signature: ``sparse_output`` when it exists
    (sklearn >= 1.2), otherwise ``sparse`` (sklearn < 1.2).

    Returns:
        A configured, unfitted ``OneHotEncoder``.
    """
    ctor_params = inspect.signature(OneHotEncoder).parameters
    sparse_kw = "sparse_output" if "sparse_output" in ctor_params else "sparse"
    return OneHotEncoder(handle_unknown="ignore", **{sparse_kw: True})

# Split the engineered columns by dtype for the preprocessing pipeline.
num_cols = X_full_fe.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X_full_fe.select_dtypes(include=["object","category","bool"]).columns.tolist()

# Numerics: median imputation. Categoricals: mode imputation + one-hot.
pre = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("ohe", make_ohe())
        ]), cat_cols),
    ],
    remainder="drop",
    sparse_threshold=1.0
)


# Hold out 10% of train, stratified on binned targets, for early stopping.
bins = stratify_bins(y_full, n_bins=10)
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
tr_idx, va_idx = next(sss.split(X_full_fe, bins))

X_tr, X_va = X_full_fe.iloc[tr_idx], X_full_fe.iloc[va_idx]
y_tr, y_va = y_full[tr_idx], y_full[va_idx]


# Fit the preprocessor on the training slice only; reuse it for valid/test.
X_tr_enc = pre.fit_transform(X_tr)
X_va_enc = pre.transform(X_va)
X_te_enc = pre.transform(X_test_fe)

use_gpu = gpu_available()
try:
    xgb_major = int(str(xgb.__version__).split(".")[0])
except ValueError:
    xgb_major = 0  # unparseable version string: fall back to the legacy names

params = {
    "objective": "reg:squarederror",
    "eval_metric": "mae",
    "tree_method": "hist",
    "learning_rate": 0.03,
    "max_depth": 6,
    "min_child_weight": 2,
    "subsample": 0.85,
    "colsample_bytree": 0.75,
    "lambda": 1.0,
    "random_state": 42,
    "verbosity": 0
}
if use_gpu:
    if xgb_major >= 2:
        # XGBoost 2.0 selects the GPU through `device`; `gpu_hist` and
        # `predictor` are deprecated/removed there.
        params["device"] = "cuda"
    else:
        params["tree_method"] = "gpu_hist"
        params["predictor"] = "gpu_predictor"
NUM_ROUNDS = 3500
ES_ROUNDS  = 200

dtr = xgb.DMatrix(X_tr_enc, label=y_tr)
dva = xgb.DMatrix(X_va_enc, label=y_va)
dte = xgb.DMatrix(X_te_enc)

evals = [(dtr, "train"), (dva, "valid")]
bst = xgb.train(
    params=params,
    dtrain=dtr,
    num_boost_round=NUM_ROUNDS,
    evals=evals,
    early_stopping_rounds=ES_ROUNDS,
    verbose_eval=False
)

# Predict test ONLY (for submission)
# Try the `iteration_range` API first; if that call fails for any reason,
# retry with the `ntree_limit` API.
try:
    test_pred = bst.predict(dte, iteration_range=(0, bst.best_iteration + 1))
except Exception:
    test_pred = bst.predict(dte, ntree_limit=bst.best_ntree_limit)


CLIP_MIN, CLIP_MAX = 0.0, None  # set CLIP_MAX=100.0 if scores are bounded above
save_submission_strict(df_test, ID_COL, test_pred, path="submission.csv", clip_min=CLIP_MIN, clip_max=CLIP_MAX)

gc.collect()
