In [None]:
# Upgrade lightgbm for CUDA support (4.0+ has it built-in)
!pip install -q --upgrade lightgbm
!pip install -q catboost optuna

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from scipy.stats import rankdata
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation / fallback warnings
# raised by the ML libraries - consider narrowing it if results look odd.
warnings.filterwarnings('ignore')
# Only let Optuna log messages of WARNING level or higher through.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
# Check whether an NVIDIA GPU is visible and record the answer in DEVICE
# ('gpu' or 'cpu'), which the tuning / training cells read.
# If this shows 'No GPU' go to Runtime -> Change runtime type -> GPU
import subprocess

try:
    result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.total',
                             '--format=csv,noheader'], capture_output=True, text=True)
except OSError:
    # nvidia-smi could not be launched at all (typically: not installed on a
    # CPU-only runtime). Without this guard the cell raised instead of
    # falling through to the CPU branch below.
    result = None

if result is not None and result.returncode == 0:
    print(f'GPU detected: {result.stdout.strip()}')
    DEVICE = 'gpu'
else:
    print('No GPU found — go to Runtime -> Change runtime type -> T4 GPU')
    DEVICE = 'cpu'

In [None]:
# --- Option A: Upload files manually ---
# from google.colab import files
# files.upload()  # upload train.csv and test.csv

# --- Option B: Kaggle API (recommended) ---
# 1. Go to kaggle.com -> Settings -> API -> Create New Token -> downloads kaggle.json
# 2. Run this cell
from google.colab import files
files.upload()  # upload your kaggle.json here

!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c valentine-hackathon-2026
!unzip -q valentine-hackathon-2026.zip

In [None]:
# Read the two competition CSVs from the working directory and report their shapes.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
print(f'Train: {train.shape} | Test: {test.shape}')

In [None]:
def engineer_features(df):
    """Return a copy of ``df`` with engineered feature columns appended.

    The input frame is not modified. Columns are added in a fixed order:
    calendar parts of ``Survey_Date`` (which is itself replaced by its parsed
    datetime form), BMI / income ratios, pairwise score interactions, a social
    composite, per-column missing flags, and finally ``n_missing``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the raw columns referenced below (``Survey_Date``,
        ``Weight_kg``, ``Height_cm``, ``Income``, ``Age``, the score columns,
        ``Dating_App_User``, ``Previous_Relationships`` and the flagged
        categorical columns).

    Returns
    -------
    pandas.DataFrame
        The augmented copy.
    """
    out = df.copy()

    # Calendar features; values that cannot be parsed become NaT.
    survey_ts = pd.to_datetime(out['Survey_Date'], errors='coerce')
    out['Survey_Date'] = survey_ts
    out['Survey_Month'] = survey_ts.dt.month
    out['Survey_Hour'] = survey_ts.dt.hour
    out['Survey_DayOfWeek'] = survey_ts.dt.dayofweek

    # Body-mass index (height converted from cm to m) and income transforms.
    height_m = out['Height_cm'] / 100
    out['BMI'] = out['Weight_kg'] / (height_m ** 2)
    out['Log_Income'] = np.log1p(out['Income'])
    out['Income_per_Age'] = out['Income'] / (out['Age'] + 1)

    # Pairwise products: (new column, left factor, right factor).
    interactions = (
        ('Appear_x_Social', 'Appearance_Score', 'Social_Skills_Score'),
        ('Dating_x_Extra', 'Dating_App_User', 'Extraversion_Score'),
        ('Extra_x_Social', 'Extraversion_Score', 'Social_Skills_Score'),
        ('Dating_x_Prev_Rel', 'Dating_App_User', 'Previous_Relationships'),
    )
    for new_col, left, right in interactions:
        out[new_col] = out[left] * out[right]

    # Average of the two social scores, with a missing score counted as 0.
    social_total = (
        out['Social_Skills_Score'].fillna(0) + out['Extraversion_Score'].fillna(0)
    )
    out['Social_Composite'] = social_total / 2

    # 0/1 indicator per column that may be missing.
    flagged_cols = ('Pets', 'Favorite_Color', 'Social_Media_Presence',
                    'Zodiac_Sign', 'Social_Skills_Score', 'Appearance_Score')
    for col in flagged_cols:
        out[f'{col}_missing'] = out[col].isna().astype(int)

    # Row-wise null count over every column present at this point, i.e. the
    # raw columns AND the derived ones created above.
    out['n_missing'] = out.isna().sum(axis=1)
    return out

# Apply the same feature engineering to both splits (each call returns a new frame).
train, test = (engineer_features(frame) for frame in (train, test))

In [None]:
# Mean target encoding. Train rows get out-of-fold means (each row is encoded
# from the target means of the other folds); test rows get means computed on
# the full training set. Categories with no mean available fall back to the
# overall training mean.
target_encode_cols = [
    'Zodiac_Sign', 'Pets', 'Favorite_Color', 'Social_Media_Presence',
    'Gender', 'Education', 'Job_Type', 'Location_Type',
]

skf_te      = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
global_mean = train['Has_Valentine'].mean()

# The splitter has a fixed integer seed, so the folds are the same for every
# column; materialise them once and reuse them.
te_folds = list(skf_te.split(train, train['Has_Valentine']))

for col in target_encode_cols:
    te_col = f'{col}_te'
    train[te_col] = np.nan

    for fit_pos, apply_pos in te_folds:
        fold_means  = train.iloc[fit_pos].groupby(col)['Has_Valentine'].mean()
        fold_values = train.iloc[apply_pos][col].map(fold_means).fillna(global_mean)
        train.loc[train.index[apply_pos], te_col] = fold_values

    full_means   = train.groupby(col)['Has_Valentine'].mean()
    test[te_col] = test[col].map(full_means).fillna(global_mean)

In [None]:
# Replace each categorical column with integer codes. One encoder per column
# is fitted on the train+test values together, so both frames share the same
# code for a given label. Values are stringified first, which means missing
# entries are encoded as their own label (e.g. NaN -> 'nan').
cat_cols = ['Gender', 'Education', 'Job_Type', 'Social_Media_Presence',
            'Location_Type', 'Zodiac_Sign', 'Pets', 'Favorite_Color']

for col in cat_cols:
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]], axis=0).astype(str))
    for frame in (train, test):
        frame[col] = le.transform(frame[col].astype(str))

In [None]:
# Assemble the model inputs shared by every later cell.
drop_cols    = ['Id', 'Has_Valentine', 'Survey_Date']
feature_cols = list(filter(lambda name: name not in drop_cols, train.columns))

X, X_test = train[feature_cols], test[feature_cols]
y         = train['Has_Valentine']
test_ids  = test['Id']
cv        = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The first CV split is used as the fixed train/validation pair for
# hyper-parameter tuning (read by the Optuna objective functions).
tune_train_idx, tune_val_idx = next(iter(cv.split(X, y)))
X_tr,  y_tr  = X.iloc[tune_train_idx], y.iloc[tune_train_idx]
X_val, y_val = X.iloc[tune_val_idx],   y.iloc[tune_val_idx]

print(f'Features: {len(feature_cols)}')

In [None]:
def lgbm_objective(trial):
    """Optuna objective: train one LightGBM candidate and return its validation AUC.

    Hyper-parameters are drawn from ``trial``; the model is fitted on the
    module-level ``X_tr`` / ``y_tr`` with early stopping monitored on
    ``X_val`` / ``y_val`` and scored on that same validation set.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Source of the suggested hyper-parameter values.

    Returns
    -------
    float
        ROC AUC of the positive-class probabilities on ``X_val``.
    """
    # Tunable part of the configuration (suggestion order kept stable).
    searched = {
        'num_leaves':        trial.suggest_int('num_leaves', 31, 256),
        'learning_rate':     trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 200),
        'feature_fraction':  trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction':  trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq':      trial.suggest_int('bagging_freq', 1, 7),
        'reg_lambda':        trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'reg_alpha':         trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
    }
    # Settings that are the same for every trial.
    fixed = {
        'n_estimators': 1000,
        'random_state': 42,
        'verbose': -1,
        'device': DEVICE,
    }

    clf = lgb.LGBMClassifier(**searched, **fixed)
    stopper = lgb.early_stopping(50, verbose=False)
    clf.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[stopper])

    val_proba = clf.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search LightGBM hyper-parameters for 50 trials, maximising validation AUC.
# The sampler is seeded so that the sequence of suggested trials is repeatable
# across kernel restarts, in line with the fixed random_state=42 used by the
# models and CV splitter. (TPESampler is Optuna's default single-objective
# sampler, so only the seeding changes.)
lgbm_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
lgbm_study.optimize(lgbm_objective, n_trials=50, show_progress_bar=True)
print(f'LGBM Best AUC: {lgbm_study.best_value:.4f}')

In [None]:
def xgb_objective(trial):
    """Optuna objective: train one XGBoost candidate and return its validation AUC.

    Hyper-parameters are drawn from ``trial``; the model is fitted on the
    module-level ``X_tr`` / ``y_tr`` with ``X_val`` / ``y_val`` as the
    evaluation set (used for early stopping) and scored on that same set.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Source of the suggested hyper-parameter values.

    Returns
    -------
    float
        ROC AUC of the positive-class probabilities on ``X_val``.
    """
    # Tunable part of the configuration (suggestion order kept stable).
    searched = {
        'max_depth':        trial.suggest_int('max_depth', 3, 10),
        'learning_rate':    trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'subsample':        trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'reg_alpha':        trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda':       trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'gamma':            trial.suggest_float('gamma', 0.0, 1.0),
    }
    # XGBoost names the GPU device 'cuda'; translate the notebook-wide flag.
    xgb_device = 'cuda' if DEVICE == 'gpu' else 'cpu'
    # Settings that are the same for every trial.
    fixed = {
        'n_estimators': 1000,
        'random_state': 42,
        'verbosity': 0,
        'tree_method': 'hist',
        'device': xgb_device,
        'early_stopping_rounds': 50,
        'eval_metric': 'auc',
    }

    clf = xgb.XGBClassifier(**searched, **fixed)
    clf.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

    val_proba = clf.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search XGBoost hyper-parameters for 50 trials, maximising validation AUC.
# The sampler is seeded so that the sequence of suggested trials is repeatable
# across kernel restarts, in line with the fixed random_state=42 used by the
# models and CV splitter. (TPESampler is Optuna's default single-objective
# sampler, so only the seeding changes.)
xgb_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
xgb_study.optimize(xgb_objective, n_trials=50, show_progress_bar=True)
print(f'XGB  Best AUC: {xgb_study.best_value:.4f}')

In [None]:
def cat_objective(trial):
    """Optuna objective: train one CatBoost candidate and return its validation AUC.

    Hyper-parameters are drawn from ``trial``; the model is fitted on the
    module-level ``X_tr`` / ``y_tr`` with ``X_val`` / ``y_val`` as the
    evaluation set (used for early stopping) and scored on that same set.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Source of the suggested hyper-parameter values.

    Returns
    -------
    float
        ROC AUC of the positive-class probabilities on ``X_val``.
    """
    # Tunable part of the configuration (suggestion order kept stable).
    searched = {
        'depth':               trial.suggest_int('depth', 4, 10),
        'learning_rate':       trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'l2_leaf_reg':         trial.suggest_float('l2_leaf_reg', 1.0, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength':     trial.suggest_float('random_strength', 0.0, 1.0),
    }
    # CatBoost spells the device in upper case; translate the notebook-wide flag.
    cat_task_type = 'GPU' if DEVICE == 'gpu' else 'CPU'
    # Settings that are the same for every trial.
    fixed = {
        'iterations': 1000,
        'random_seed': 42,
        'verbose': 0,
        'early_stopping_rounds': 50,
        'eval_metric': 'AUC',
        'task_type': cat_task_type,
    }

    clf = CatBoostClassifier(**searched, **fixed)
    clf.fit(X_tr, y_tr, eval_set=(X_val, y_val))

    val_proba = clf.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

# Search CatBoost hyper-parameters for 50 trials, maximising validation AUC.
# The sampler is seeded so that the sequence of suggested trials is repeatable
# across kernel restarts, in line with the fixed random_seed=42 used by the
# models and CV splitter. (TPESampler is Optuna's default single-objective
# sampler, so only the seeding changes.)
cat_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
cat_study.optimize(cat_objective, n_trials=50, show_progress_bar=True)
print(f'CAT  Best AUC: {cat_study.best_value:.4f}')

In [None]:
# 5-fold cross-validated LightGBM: tuned hyper-parameters from the Optuna
# study plus the same fixed settings that were used during tuning.
lgbm_params = {**lgbm_study.best_params,
               'n_estimators': 1000, 'random_state': 42, 'verbose': -1,
               'device': DEVICE}

lgbm_oof  = np.zeros(len(X))       # out-of-fold prediction for every train row
lgbm_test = np.zeros(len(X_test))  # test prediction averaged over the folds

# The per-fold validation split is deliberately NOT named X_val / y_val:
# those module-level names are the fixed tuning hold-out read by the Optuna
# objective functions. Rebinding them here would make a later re-run of a
# tuning cell score on rows that overlap its training data.
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train, X_fold_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_fold_val = y.iloc[train_idx], y.iloc[val_idx]
    model = lgb.LGBMClassifier(**lgbm_params)
    model.fit(X_train, y_train, eval_set=[(X_fold_val, y_fold_val)],
              callbacks=[lgb.early_stopping(50, verbose=False)])
    lgbm_oof[val_idx] = model.predict_proba(X_fold_val)[:, 1]
    # Each fold contributes 1/n_splits of the final test prediction.
    lgbm_test += model.predict_proba(X_test)[:, 1] / cv.n_splits
    print(f'Fold {fold+1}: AUC = {roc_auc_score(y_fold_val, lgbm_oof[val_idx]):.4f}')

print(f'LightGBM OOF AUC: {roc_auc_score(y, lgbm_oof):.4f}')

In [None]:
# 5-fold cross-validated XGBoost: tuned hyper-parameters from the Optuna
# study plus the same fixed settings that were used during tuning.
xgb_params = {**xgb_study.best_params,
              'n_estimators': 1000, 'random_state': 42, 'verbosity': 0,
              'tree_method': 'hist', 'device': 'cuda' if DEVICE == 'gpu' else 'cpu',
              'early_stopping_rounds': 50, 'eval_metric': 'auc'}

xgb_oof  = np.zeros(len(X))       # out-of-fold prediction for every train row
xgb_test = np.zeros(len(X_test))  # test prediction averaged over the folds

# The per-fold validation split is deliberately NOT named X_val / y_val:
# those module-level names are the fixed tuning hold-out read by the Optuna
# objective functions. Rebinding them here would make a later re-run of a
# tuning cell score on rows that overlap its training data.
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train, X_fold_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_fold_val = y.iloc[train_idx], y.iloc[val_idx]
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(X_train, y_train, eval_set=[(X_fold_val, y_fold_val)], verbose=False)
    xgb_oof[val_idx] = model.predict_proba(X_fold_val)[:, 1]
    # Each fold contributes 1/n_splits of the final test prediction.
    xgb_test += model.predict_proba(X_test)[:, 1] / cv.n_splits
    print(f'Fold {fold+1}: AUC = {roc_auc_score(y_fold_val, xgb_oof[val_idx]):.4f}')

print(f'XGBoost  OOF AUC: {roc_auc_score(y, xgb_oof):.4f}')

In [None]:
# 5-fold cross-validated CatBoost: tuned hyper-parameters from the Optuna
# study plus the same fixed settings that were used during tuning.
cat_params = {**cat_study.best_params,
              'iterations': 1000, 'random_seed': 42,
              'verbose': 0, 'early_stopping_rounds': 50, 'eval_metric': 'AUC',
              'task_type': 'GPU' if DEVICE == 'gpu' else 'CPU'}

cat_oof  = np.zeros(len(X))       # out-of-fold prediction for every train row
cat_test = np.zeros(len(X_test))  # test prediction averaged over the folds

# The per-fold validation split is deliberately NOT named X_val / y_val:
# those module-level names are the fixed tuning hold-out read by the Optuna
# objective functions. Rebinding them here would make a later re-run of a
# tuning cell score on rows that overlap its training data.
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train, X_fold_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_fold_val = y.iloc[train_idx], y.iloc[val_idx]
    model = CatBoostClassifier(**cat_params)
    model.fit(X_train, y_train, eval_set=(X_fold_val, y_fold_val))
    cat_oof[val_idx] = model.predict_proba(X_fold_val)[:, 1]
    # Each fold contributes 1/n_splits of the final test prediction.
    cat_test += model.predict_proba(X_test)[:, 1] / cv.n_splits
    print(f'Fold {fold+1}: AUC = {roc_auc_score(y_fold_val, cat_oof[val_idx]):.4f}')

print(f'CatBoost OOF AUC: {roc_auc_score(y, cat_oof):.4f}')

In [None]:
def rank_avg(*arrays):
    """Average several score vectors after converting each to scaled ranks.

    Every input is replaced by its ranks (ties share their average rank, as
    produced by ``scipy.stats.rankdata``) divided by its own length, and the
    element-wise mean of those scaled ranks is returned.

    Parameters
    ----------
    *arrays : array-like
        Score vectors of equal length.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the scaled ranks.
    """
    scaled_ranks = []
    for scores in arrays:
        scaled_ranks.append(rankdata(scores) / len(scores))
    return np.asanyarray(scaled_ranks).mean(axis=0)

# Blend the three models by rank averaging, separately for the out-of-fold
# and the test predictions, then report the OOF AUC of the blend next to
# each individual model.
ensemble_oof  = rank_avg(lgbm_oof, xgb_oof, cat_oof)
ensemble_test = rank_avg(lgbm_test, xgb_test, cat_test)

auc_ensemble = roc_auc_score(y, ensemble_oof)
auc_lgbm     = roc_auc_score(y, lgbm_oof)
auc_xgb      = roc_auc_score(y, xgb_oof)
auc_cat      = roc_auc_score(y, cat_oof)

print(f'Ensemble OOF AUC: {auc_ensemble:.4f}')
print(f'  LGBM OOF AUC:   {auc_lgbm:.4f}')
print(f'  XGB  OOF AUC:   {auc_xgb:.4f}')
print(f'  CAT  OOF AUC:   {auc_cat:.4f}')

In [None]:
# Write the blended test scores to submission.csv (columns: Id, Has_Valentine)
# and hand the file to the browser for download.
# NOTE: the scores are scaled ranks rather than probabilities, so the printed
# mean says little about the predicted positive rate.
submission = pd.DataFrame({'Id': test_ids}).assign(Has_Valentine=ensemble_test)
submission.to_csv('submission.csv', index=False)

mean_prediction = ensemble_test.mean()
print(f'Saved. Mean prediction: {mean_prediction:.4f}')

# Imported here (after the CSV is written) so the file exists even when the
# Colab helper is unavailable.
from google.colab import files
files.download('submission.csv')