In [22]:
import cudf
import cupy as cp
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

print('cudf', cudf.__version__)
print('xgboost', xgb.__version__)
print('gpus', cp.cuda.runtime.getDeviceCount())


cudf 26.02.00a239
xgboost 3.1.2
gpus 1


In [23]:
from pathlib import Path

data_dir = Path.cwd()
if not (data_dir / 'train.csv').exists():
    data_dir = Path('Predicting Student Test Scores')

print('data_dir', data_dir.resolve())

def read_csv_safe(path):
    try:
        df = cudf.read_csv(path, header=0)
    except Exception as exc:
        print(f'cudf.read_csv failed for {path}: {exc}')
        df = None
    if df is None or len(df.columns) == 0:
        try:
            import pandas as pd
            pdf = pd.read_csv(path)
            df = cudf.from_pandas(pdf)
            print('fallback: pandas -> cudf')
        except Exception as exc:
            raise RuntimeError(f'CSV read failed for {path}: {exc}')
    return df

train = read_csv_safe(data_dir / 'train.csv')
test = read_csv_safe(data_dir / 'test.csv')

required_cols = {
    'exam_score', 'study_hours', 'class_attendance', 'sleep_hours',
    'sleep_quality', 'facility_rating', 'exam_difficulty',
    'study_method', 'age', 'gender', 'course', 'internet_access'
}
missing = sorted(required_cols - set(train.columns))
if missing:
    print('train columns:', list(train.columns))
    raise ValueError(f'Missing columns in train.csv: {missing}')


data_dir /home/james/projects/Kaggle-Competitions/Predicting Student Test Scores


In [24]:
sleep_map = {'poor': 1, 'average': 2, 'good': 3}
facility_map = {'low': 1, 'medium': 2, 'high': 3}
difficulty_map = {'easy': 1, 'moderate': 2, 'hard': 3}

def add_features(df):
    df = df.copy()
    if 'age' in df.columns:
        df['age'] = df['age'].astype('str')

    if 'sleep_quality' in df.columns:
        df['sleep_quality_num'] = df['sleep_quality'].map(sleep_map)
    if 'facility_rating' in df.columns:
        df['facility_rating_num'] = df['facility_rating'].map(facility_map)
    if 'exam_difficulty' in df.columns:
        df['exam_difficulty_num'] = df['exam_difficulty'].map(difficulty_map)

    if 'class_attendance' in df.columns:
        df['attend_pct'] = df['class_attendance'] / 100.0

    if 'study_hours' in df.columns and 'attend_pct' in df.columns:
        df['effort_index'] = df['study_hours'] * df['attend_pct']
    if 'sleep_hours' in df.columns and 'sleep_quality_num' in df.columns:
        df['sleep_score'] = df['sleep_hours'] * df['sleep_quality_num']
    if 'study_hours' in df.columns and 'facility_rating_num' in df.columns:
        df['preparedness'] = df['study_hours'] * df['facility_rating_num']
    if 'study_hours' in df.columns and 'exam_difficulty_num' in df.columns:
        df['effort_vs_difficulty'] = df['study_hours'] / (df['exam_difficulty_num'] + 1)
    if 'study_hours' in df.columns and 'sleep_hours' in df.columns:
        df['study_per_sleep'] = df['study_hours'] / (df['sleep_hours'] + 1)

    if 'sleep_score' in df.columns and 'attend_pct' in df.columns:
        df['sleep_score_x_attend'] = df['sleep_score'] * df['attend_pct']
    if 'sleep_score' in df.columns and 'study_hours' in df.columns:
        df['sleep_score_x_study'] = df['sleep_score'] * df['study_hours']
    if 'sleep_score' in df.columns and 'exam_difficulty_num' in df.columns:
        df['sleep_score_x_difficulty'] = df['sleep_score'] * df['exam_difficulty_num']

    if 'attend_pct' in df.columns and 'sleep_quality_num' in df.columns:
        df['attendance_x_sleepq'] = df['attend_pct'] * df['sleep_quality_num']
    if 'attend_pct' in df.columns and 'facility_rating_num' in df.columns:
        df['attendance_x_facility'] = df['attend_pct'] * df['facility_rating_num']

    if 'study_hours' in df.columns and 'exam_difficulty_num' in df.columns:
        df['study_x_difficulty'] = df['study_hours'] * df['exam_difficulty_num']
    if 'study_hours' in df.columns and 'facility_rating_num' in df.columns:
        df['study_x_facility'] = df['study_hours'] * df['facility_rating_num']
    if 'study_hours' in df.columns and 'sleep_quality_num' in df.columns:
        df['study_x_sleepq'] = df['study_hours'] * df['sleep_quality_num']

    if 'study_hours' in df.columns and 'attend_pct' in df.columns:
        df['method_effort'] = df['study_hours'] * df['attend_pct']
    if 'attend_pct' in df.columns:
        df['method_attendance'] = df['attend_pct']
    if 'sleep_score' in df.columns:
        df['method_sleep_score'] = df['sleep_score']

    return df

train_fe = add_features(train)
test_fe = add_features(test)


In [25]:
target = 'exam_score'

cat_cols = ['study_method', 'age', 'gender', 'course', 'internet_access']
num_cols = [
    'study_hours', 'class_attendance', 'sleep_hours',
    'sleep_quality_num', 'facility_rating_num', 'exam_difficulty_num',
    'attend_pct',
    'effort_index', 'sleep_score', 'preparedness',
    'effort_vs_difficulty', 'study_per_sleep',
    'sleep_score_x_attend', 'sleep_score_x_study', 'sleep_score_x_difficulty',
    'attendance_x_sleepq', 'attendance_x_facility',
    'study_x_difficulty', 'study_x_facility', 'study_x_sleepq',
    'method_effort', 'method_attendance', 'method_sleep_score'
]

cat_cols = [c for c in cat_cols if c in train_fe.columns]
num_cols = [c for c in num_cols if c in train_fe.columns]

X_train_full = train_fe[cat_cols + num_cols]
y = train_fe[target]
X_test_full = test_fe[cat_cols + num_cols]

X_train_enc = cudf.get_dummies(X_train_full, columns=cat_cols)
X_test_enc = cudf.get_dummies(X_test_full, columns=cat_cols)

X_test_enc = X_test_enc.reindex(columns=X_train_enc.columns, fill_value=0)


In [26]:
def run_cv(params, X, y, n_splits=5, num_boost_round=3000, early_stopping_rounds=100):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    rmse_scores = []

    for fold, (tr_idx, va_idx) in enumerate(kf.split(X), start=1):
        X_tr = X.iloc[tr_idx]
        y_tr = y.iloc[tr_idx]
        X_va = X.iloc[va_idx]
        y_va = y.iloc[va_idx]

        dtrain = xgb.DMatrix(X_tr, label=y_tr)
        dvalid = xgb.DMatrix(X_va, label=y_va)

        booster = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=num_boost_round,
            evals=[(dvalid, 'valid')],
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False,
        )

        preds = booster.predict(dvalid)
        mse = mean_squared_error(y_va.to_pandas(), preds)
        rmse = mse ** 0.5
        rmse_scores.append(rmse)
        print(f'fold {fold} rmse: {rmse:.4f} | best_iter: {booster.best_iteration}')

    avg_rmse = sum(rmse_scores) / len(rmse_scores)
    return rmse_scores, avg_rmse


In [27]:
base_params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'device': 'cuda',
    'eval_metric': 'rmse',
    'seed': 42,
}

search_space = {
    'learning_rate': [0.02, 0.03, 0.05],
    'max_depth': [8, 10, 12],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.7, 0.85, 1.0],
    'colsample_bytree': [0.7, 0.85, 1.0],
    'gamma': [0.0, 0.5, 1.0],
    'lambda': [0.5, 1.0, 2.0],
    'alpha': [0.0, 0.5, 1.0],
}

from itertools import product
keys = list(search_space.keys())
values = [search_space[k] for k in keys]

results = []
max_trials = 20  # bump this up if you want a longer run

for i, combo in enumerate(product(*values)):
    if i >= max_trials:
        break
    params = base_params.copy()
    params.update(dict(zip(keys, combo)))

    print(f'\nTrial {i+1}/{max_trials}:', params)
    scores, avg_rmse = run_cv(params, X_train_enc, y, n_splits=5, num_boost_round=3000, early_stopping_rounds=100)
    results.append((avg_rmse, params))
    print(f'avg rmse: {avg_rmse:.4f}')

results = sorted(results, key=lambda x: x[0])
best_rmse, best_params = results[0]
best_rmse, best_params



Trial 1/20: {'objective': 'reg:squarederror', 'tree_method': 'hist', 'device': 'cuda', 'eval_metric': 'rmse', 'seed': 42, 'learning_rate': 0.02, 'max_depth': 8, 'min_child_weight': 1, 'subsample': 0.7, 'colsample_bytree': 0.7, 'gamma': 0.0, 'lambda': 0.5, 'alpha': 0.0}
fold 1 rmse: 8.7438 | best_iter: 1747
fold 2 rmse: 8.7462 | best_iter: 2037
fold 3 rmse: 8.7329 | best_iter: 2065
fold 4 rmse: 8.7516 | best_iter: 2097
fold 5 rmse: 8.7727 | best_iter: 1905
avg rmse: 8.7494

Trial 2/20: {'objective': 'reg:squarederror', 'tree_method': 'hist', 'device': 'cuda', 'eval_metric': 'rmse', 'seed': 42, 'learning_rate': 0.02, 'max_depth': 8, 'min_child_weight': 1, 'subsample': 0.7, 'colsample_bytree': 0.7, 'gamma': 0.0, 'lambda': 0.5, 'alpha': 0.5}
fold 1 rmse: 8.7425 | best_iter: 1933
fold 2 rmse: 8.7447 | best_iter: 1975
fold 3 rmse: 8.7339 | best_iter: 2098
fold 4 rmse: 8.7515 | best_iter: 2220
fold 5 rmse: 8.7735 | best_iter: 1668
avg rmse: 8.7492

Trial 3/20: {'objective': 'reg:squarederror

(8.748390149562654,
 {'objective': 'reg:squarederror',
  'tree_method': 'hist',
  'device': 'cuda',
  'eval_metric': 'rmse',
  'seed': 42,
  'learning_rate': 0.02,
  'max_depth': 8,
  'min_child_weight': 1,
  'subsample': 0.7,
  'colsample_bytree': 0.7,
  'gamma': 0.5,
  'lambda': 2.0,
  'alpha': 0.5})

In [28]:
params = globals().get('best_params', None) or {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'device': 'cuda',
    'learning_rate': 0.03,
    'max_depth': 10,
    'min_child_weight': 5,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'gamma': 0.0,
    'lambda': 1.0,
    'alpha': 0.0,
    'eval_metric': 'rmse',
    'seed': 42,
}

dtrain_full = xgb.DMatrix(X_train_enc, label=y)
dtest = xgb.DMatrix(X_test_enc)

booster = xgb.train(
    params=params,
    dtrain=dtrain_full,
    num_boost_round=500,
    verbose_eval=False,
)

test_preds = booster.predict(dtest)

test_preds = test_preds.clip(0, 100)
test_preds = test_preds.round(1)

submission = cudf.DataFrame({
    'id': test['id'],
    'exam_score': test_preds
})

submission.to_csv('submission_wsl_xgb.csv', index=False)
submission.head()


Unnamed: 0,id,exam_score
0,630000,71.800003
1,630001,69.900002
2,630002,87.599998
3,630003,54.799999
4,630004,47.400002
