In [1]:
import pyarrow.parquet as pq
import pandas as pd

# Read parquet file
table = pq.read_table("train_data (1).parquet")

# Convert to pandas DataFrame
df_train = table.to_pandas()

print(df_train.head())


                                               id1      id2        id3  \
0  1366776_189706075_16-23_2023-11-02 22:22:00.042  1366776  189706075   
1      1366776_89227_16-23_2023-11-01 23:51:24.999  1366776      89227   
2      1366776_35046_16-23_2023-11-01 00:30:59.797  1366776      35046   
3    1366776_6275451_16-23_2023-11-02 22:21:32.261  1366776    6275451   
4      1366776_78053_16-23_2023-11-02 22:21:34.799  1366776      78053   

                       id4         id5  y   f1    f2    f3    f4  ...  f357  \
0  2023-11-02 22:22:00.042  2023-11-02  0  1.0  None  None  None  ...  None   
1  2023-11-01 23:51:24.999  2023-11-01  0  1.0  None  None  None  ...  None   
2  2023-11-01 00:30:59.797  2023-11-01  0  1.0  None  None  None  ...  None   
3  2023-11-02 22:21:32.261  2023-11-02  0  1.0  None  None  None  ...  None   
4  2023-11-02 22:21:34.799  2023-11-02  0  1.0  None  None  None  ...  None   

      f358 f359  f360   f361 f362               f363    f364 f365  \
0  -9999.0 

In [2]:
df_test = pd.read_parquet(r"C:\Users\gourh\OneDrive\Desktop\esc\AMEX\test_data.parquet")

print(df_test.head)

<bound method NDFrame.head of                                                     id1      id2     id3  \
46756    1362907_91950_16-23_2023-11-04 18:56:26.000794  1362907   91950   
57819       1082599_88356_16-23_2023-11-04 06:08:53.373  1082599   88356   
15390   1888466_958700_16-23_2023-11-05 10:07:28.000725  1888466  958700   
145730     1888971_795739_16-23_2023-11-04 12:25:28.244  1888971  795739   
146085      1256369_82296_16-23_2023-11-05 06:45:26.657  1256369   82296   
...                                                 ...      ...     ...   
1806        1874443_95537_16-23_2023-11-05 09:21:24.182  1874443   95537   
127494       1541978_5718_16-23_2023-11-05 00:56:43.946  1541978    5718   
106947      1887841_85905_16-23_2023-11-05 20:40:43.312  1887841   85905   
158372     1569367_944713_16-23_2023-11-05 00:43:04.335  1569367  944713   
74378       1086547_60142_16-23_2023-11-05 10:37:36.747  1086547   60142   

                               id4         id5    f1    f

In [3]:
import pandas as pd
import numpy as np

# ── PARAMETERS ──────────────────────────────────────────────
TARGET          = "y"        # dependent variable
NULL_LIMIT_COL  = 0.90       # drop column if ≥ 90 % nulls
MIN_VARIETY_COL = 1          # drop column if ≤ 1 unique value
NULL_LIMIT_ROW  = 0.50       # drop row if ≥ 50 % nulls (only for TRAIN)
# ────────────────────────────────────────────────────────────

def clean_dataframes(df_train: pd.DataFrame,
                     df_test: pd.DataFrame,
                     target: str = TARGET,
                     null_limit_col: float = NULL_LIMIT_COL,
                     min_variety_col: int = MIN_VARIETY_COL,
                     null_limit_row: float = NULL_LIMIT_ROW):
    """
    Cleans train/test dataframes by:
    1. Dropping constant or mostly-null columns (based on train only).
    2. Dropping train rows with many nulls.
    3. Aligning test columns (no row drop).
    Returns: (df_train_final, df_test_final, dropped_cols, dropped_rows_count)
    """
    # 1️⃣ Drop constant or mostly-null columns (from TRAIN only)
    const_cols = [c for c in df_train.columns if df_train[c].nunique(dropna=False) <= min_variety_col]
    null_cols  = [c for c in df_train.columns if df_train[c].isna().mean() >= null_limit_col]
    to_drop = list({*const_cols, *null_cols} - {target})

    # 2️⃣ Drop from both datasets
    df_train_clean = df_train.drop(columns=to_drop, errors="ignore")
    df_test_clean  = df_test.drop(columns=to_drop, errors="ignore")

    # 3️⃣ Drop TRAIN rows with many nulls (excluding target)
    feature_cols = [c for c in df_train_clean.columns if c != target]
    null_fraction_per_row = df_train_clean[feature_cols].isna().mean(axis=1)
    bad_rows = null_fraction_per_row >= null_limit_row
    df_train_final = df_train_clean.loc[~bad_rows].copy()
    dropped_rows = bad_rows.sum()

    # 4️⃣ Align TEST columns exactly with TRAIN features
    df_test_final = df_test_clean.reindex(columns=feature_cols)

    return df_train_final, df_test_final, to_drop, dropped_rows

# ── RUN CLEANING ────────────────────────────────────────────
df_train_final, df_test_final, dropped_cols, dropped_rows = clean_dataframes(df_train, df_test)

# ── REPORT ──────────────────────────────────────────────────
print("🔹 Columns dropped:", len(dropped_cols))
print("🔹 Rows dropped from train:", dropped_rows)
print("📐 Shape overview:")
print(" • Original train:", df_train.shape)
print(" • Cleaned  train:", df_train_final.shape)
print(" • Cleaned  test :", df_test_final.shape)


🔹 Columns dropped: 38
🔹 Rows dropped from train: 4505
📐 Shape overview:
 • Original train: (770164, 372)
 • Cleaned  train: (765659, 334)
 • Cleaned  test : (369301, 333)


In [5]:
# ╭────────────────────────────────────────────────────────╮
# │  FASTEST TABULAR PIPELINE (CPU “hist” or GPU “gpu_hist”)│
# ╰────────────────────────────────────────────────────────╯

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import scipy.sparse as sp

# ── PARAMETERS ────────────────────────────────────────────
TARGET          = "y"
TEST_SIZE       = 0.20
RANDOM_STATE    = 42
TREE_METHOD     = "hist"    # ← change to "gpu_hist" if you have a CUDA GPU
EARLY_STOP_ROUNDS = 25      # stop if no val‑improve for 25 rounds
N_ESTIMATORS    = 1000      # large cap; early‑stopping will finish sooner
# ──────────────────────────────────────────────────────────

# 1️⃣  Split         ───────────────────────────────────────
X_df = df_train_final.drop(columns=TARGET)
y_raw = df_train_final[TARGET]

# label → int
if y_raw.dtype.kind in {"i", "u"}:
    y_all = y_raw.values
else:
    try:
        y_all = y_raw.astype(int).values
    except ValueError:
        le = LabelEncoder()
        y_all = le.fit_transform(y_raw)

X_train_df, X_val_df, y_train, y_val = train_test_split(
    X_df, y_all,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y_all
)

# 2️⃣  Preprocess once ─────────────────────────────────────
cat_cols = [c for c in X_df.columns if X_df[c].dtype == "object"]
num_cols = [c for c in X_df.columns if c not in cat_cols]

preprocess = ColumnTransformer([
    ("num", Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("sc",  StandardScaler())
    ]), num_cols),
    ("cat", Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("oh",  OneHotEncoder(handle_unknown="ignore"))
    ]), cat_cols)
], sparse_threshold=0.3)           # ensure sparse output for speed

X_train = preprocess.fit_transform(X_train_df)
X_val   = preprocess.transform(X_val_df)
X_test  = preprocess.transform(df_test_final[X_df.columns])

# 3️⃣  XGBoost training ────────────────────────────────────
xgb_clf = XGBClassifier(
    n_estimators=N_ESTIMATORS,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method=TREE_METHOD,
    n_jobs=-1,
    random_state=RANDOM_STATE,
    reg_lambda=1.0,
    eval_metric="logloss"              # ✅ moved here
)

xgb_clf.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=EARLY_STOP_ROUNDS,
    verbose=50         # print progress every 50 rounds
)

print(f"\nBest iteration: {xgb_clf.best_iteration}")

# 4️⃣  Validation metrics ──────────────────────────────────
y_pred_val = xgb_clf.predict(X_val)
print("\nValidation accuracy:", f"{accuracy_score(y_val, y_pred_val):.4%}")
print(classification_report(y_val, y_pred_val))

# 5️⃣  Retrain on full data at best_iteration ──────────────
best_n = xgb_clf.best_iteration or N_ESTIMATORS
xgb_final = XGBClassifier(
    n_estimators=best_n,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method=TREE_METHOD,
    n_jobs=-1,
    random_state=RANDOM_STATE,
    reg_lambda=1.0
)
# Reuse already‑fitted preprocess → avoid refitting OneHot
X_all = sp.vstack([X_train, X_val])
y_all = np.concatenate([y_train, y_val])
xgb_final.fit(X_all, y_all)

# 6️⃣  Predict test ────────────────────────────────────────
test_pred = xgb_final.predict(X_test)
if 'le' in locals():
    test_pred = le.inverse_transform(test_pred)

pred_df = pd.DataFrame({
    "row_id": df_test_final.index,   # replace with actual ID if present
    "y_pred": test_pred
})
pred_df.to_csv("test_predictions.csv", index=False)
print("\n✅  Saved test_predictions.csv with", len(pred_df), "rows.")


TypeError: XGBClassifier.fit() got an unexpected keyword argument 'early_stopping_rounds'