In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

# --- Load the raw competition data -------------------------------------------
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)

# --- Column bookkeeping -------------------------------------------------------
TARGET = "is_cheater"
ID_COL = "id"

# Every column except the label and the row identifier is used as a feature.
feature_cols = [name for name in train.columns if name not in (TARGET, ID_COL)]

# Partition the features by dtype: numeric columns vs. everything else.
train_features = train[feature_cols]
num_cols = train_features.select_dtypes(include=np.number).columns.tolist()
cat_cols = train_features.select_dtypes(exclude=np.number).columns.tolist()

print("Numeric columns:", len(num_cols))
print(num_cols)
print("\nCategorical columns:", len(cat_cols))
print(cat_cols)

# --- Missing values -----------------------------------------------------------
# Imputation statistics are learned on the training frame only and then applied
# to both frames.
num_imputer = SimpleImputer(strategy="mean")
num_imputer.fit(train[num_cols])
for frame in (train, test):
    frame[num_cols] = num_imputer.transform(frame[num_cols])

if cat_cols:
    cat_imputer = SimpleImputer(strategy="most_frequent")
    cat_imputer.fit(train[cat_cols])
    for frame in (train, test):
        frame[cat_cols] = cat_imputer.transform(frame[cat_cols])

# --- Integer-encode the non-numeric columns -----------------------------------
# The encoder sees the labels of train and test together, so transforming the
# test frame never meets an unseen label.
for col in cat_cols:
    all_labels = pd.concat([train[col], test[col]]).unique()
    le = LabelEncoder().fit(all_labels)
    train[col], test[col] = le.transform(train[col]), le.transform(test[col])

# --- Standardise the numeric columns ------------------------------------------
# Mean / variance come from the training frame; the test frame reuses them.
scaler = StandardScaler().fit(train[num_cols])
for frame in (train, test):
    frame[num_cols] = scaler.transform(frame[num_cols])

# --- Final modelling matrices -------------------------------------------------
X_train, y_train = train[feature_cols], train[TARGET]
X_test = test[feature_cols]

print("\nFinal shapes:")
for label, obj in (("X_train:", X_train), ("y_train:", y_train), ("X_test:", X_test)):
    print(label, obj.shape)


Train shape: (99872, 34)
Test shape: (25889, 33)
Numeric columns: 31
['kill_death_ratio', 'headshot_percentage', 'win_rate', 'accuracy_score', 'kill_consistency', 'reaction_time_ms', 'account_age_days', 'level', 'level_progression_speed', 'friend_network_size', 'reports_received', 'device_changes_count', 'input_consistency_score', 'avg_session_length_min', 'sessions_per_day', 'night_play_ratio', 'weapon_switch_speed', 'movement_pattern_score', 'aiming_smoothness', 'spray_control_score', 'game_sense_score', 'communication_rate', 'team_play_score', 'buy_decision_score', 'map_knowledge', 'clutch_success_rate', 'first_blood_rate', 'survival_time_avg', 'damage_per_round', 'utility_usage_rate', 'crosshair_placement']

Categorical columns: 1
['player_id']

Final shapes:
X_train: (99872, 32)
y_train: (99872,)
X_test: (25889, 32)


In [None]:
# ============================================================
# TASK 1 – Advanced LightGBM using all slide hints
#   - class imbalance (class_weight)
#   - feature engineering (interaction feature)
#   - cross-validation
#   - decision threshold tuning
#   - ensemble across folds
# ============================================================

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
import lightgbm as lgb
import pandas as pd # Import pandas for isnull()

# 1) (Optional) Feature Engineering: interaction feature
#    Example from slide: skill_consistency = game_sense / accuracy
#
#    The preprocessing cell has already standardised every numeric column, so
#    `accuracy_score` is centred on zero at this point. Dividing by the
#    standardised value makes the ratio explode and flip sign around the mean,
#    and `eps` cannot protect against that because the denominator may be
#    negative. The ratio is therefore computed on the original scale, recovered
#    with `scaler.inverse_transform` (uses `scaler` / `num_cols` from the
#    preprocessing cell).
if "game_sense_score" in train.columns and "accuracy_score" in train.columns:
    eps = 1e-6  # guards against a raw accuracy of exactly zero
    for frame in (train, test):
        raw_numeric = pd.DataFrame(
            scaler.inverse_transform(frame[num_cols]),
            columns=num_cols,
            index=frame.index,
        )
        frame["skill_consistency"] = (
            raw_numeric["game_sense_score"] / (raw_numeric["accuracy_score"] + eps)
        )

# Rebuild feature list to include the new feature
TARGET = "is_cheater"
ID_COL = "id"
feature_cols = [c for c in train.columns if c not in [TARGET, ID_COL]]

X_train = train[feature_cols]
y_train = train[TARGET]
X_test  = test[feature_cols]

print("Using", len(feature_cols), "features (including engineered ones if any).")

# Rows with a missing target cannot be used for supervised training or for
# stratification, so they are dropped before anything below touches the labels.
nan_mask = y_train.isnull()
X_train_cleaned = X_train[~nan_mask]
y_train_cleaned = y_train[~nan_mask]

# 2) Class imbalance handling: compute "balanced" class weights once on the
#    labelled rows and hand them to every LightGBM fold model.
classes = np.unique(y_train_cleaned)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train_cleaned)
class_weight_dict = {int(c): float(w) for c, w in zip(classes, class_weights)}
print("Class weights:", class_weight_dict)

# 3) Stratified K-Fold Cross-Validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

oof_proba = np.zeros(len(X_train_cleaned))   # out-of-fold P(class 1) per labelled row
test_proba = np.zeros(len(X_test))           # running average of the fold models on test
best_thresholds = []

# Candidate decision thresholds; identical for every fold, so built once.
thresholds = np.linspace(0.1, 0.9, 17)

# skf.split yields positional indices, hence .iloc on the cleaned frames.
for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train_cleaned, y_train_cleaned), 1):
    print(f"\n===== Fold {fold}/{n_splits} =====")
    X_tr, X_val = X_train_cleaned.iloc[tr_idx], X_train_cleaned.iloc[val_idx]
    y_tr, y_val = y_train_cleaned.iloc[tr_idx], y_train_cleaned.iloc[val_idx]

    model = lgb.LGBMClassifier(
        n_estimators=800,
        learning_rate=0.03,
        num_leaves=64,
        max_depth=-1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42 + fold,          # a different seed per fold model
        n_jobs=-1,
        class_weight=class_weight_dict   # <-- handle class imbalance
    )

    model.fit(X_tr, y_tr)

    # Probabilities on validation and test
    val_proba = model.predict_proba(X_val)[:, 1]
    test_fold_proba = model.predict_proba(X_test)[:, 1]

    # Store out-of-fold probabilities for global metrics
    oof_proba[val_idx] = val_proba
    test_proba += test_fold_proba / n_splits  # ensemble across folds

    # 4) Decision threshold tuning (maximize F1 on this fold).
    #    np.argmax returns the first maximum, i.e. the lowest threshold on ties.
    fold_f1 = [f1_score(y_val, (val_proba >= t).astype(int)) for t in thresholds]
    best_idx = int(np.argmax(fold_f1))
    best_t = thresholds[best_idx]
    best_f1 = fold_f1[best_idx]

    best_thresholds.append(best_t)

    # Report fold metrics with its best threshold
    val_preds_best = (val_proba >= best_t).astype(int)
    acc = accuracy_score(y_val, val_preds_best)
    auc = roc_auc_score(y_val, val_proba)

    print(f"  Fold {fold} best threshold: {best_t:.3f}")
    print(f"  Fold {fold} Accuracy: {acc:.4f}")
    print(f"  Fold {fold} F1:       {best_f1:.4f}")
    print(f"  Fold {fold} ROC AUC:  {auc:.4f}")

# 5) Global evaluation on all OOF predictions, using the mean of the per-fold
#    best thresholds as the single operating point.
global_threshold = float(np.mean(best_thresholds))
print("\nAverage best threshold across folds:", global_threshold)

oof_preds = (oof_proba >= global_threshold).astype(int)
global_acc = accuracy_score(y_train_cleaned, oof_preds)
global_f1  = f1_score(y_train_cleaned, oof_preds)
global_auc = roc_auc_score(y_train_cleaned, oof_proba)

print("\n===== Overall CV Metrics (OOF) =====")
print(f"  Accuracy: {global_acc:.4f}")
print(f"  F1 Score: {global_f1:.4f}")
print(f"  ROC AUC:  {global_auc:.4f}")

# 6) Final predictions for test set using ensemble probabilities + tuned threshold
test_preds = (test_proba >= global_threshold).astype(int)

submission = pd.DataFrame({
    "id": test["id"],
    "task1": test_preds.astype(int)
})

submission.to_csv("submission_task1_advanced.csv", index=False)
print("\nSaved advanced submission as submission_task1_advanced.csv")
submission.head()


Using 33 features (including engineered ones if any).
Class weights: {0: 0.768229616938336, 1: 1.4320372703565882}

===== Fold 1/5 ===telek_p
[LightGBM] [Info] Number of positive: 27303, number of negative: 50895
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023939 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7949
[LightGBM] [Info] Number of data points in the train set: 78198, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499999 -> initscore=-0.000003
[LightGBM] [Info] Start training from score -0.000003
  Fold 1 best threshold: 0.500
  Fold 1 Accuracy: 0.7873
  Fold 1 F1:       0.7257
  Fold 1 ROC AUC:  0.8954

===== Fold 2/5 ===telek_p
[LightGBM] [Info] Number of positive: 27303, number of negative: 50895
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023265 seconds.
You can set `force_col_wise=true` to remove the overh

Unnamed: 0,id,task1
0,ANS00001,1
1,ANS00002,0
2,ANS00003,1
3,ANS00004,0
4,ANS00005,1


In [None]:
# ============================================================
# TASK 1 — LightGBM Training, Validation, and Submission
# ============================================================

!pip install lightgbm --quiet

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import lightgbm as lgb
import pandas as pd

# Handle NaNs in y_train before splitting
nan_mask = y_train.isnull()
X_train_cleaned = X_train[~nan_mask]
y_train_cleaned = y_train[~nan_mask]

# -----------------------------
# 1. Train/Validation Split
# -----------------------------
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_cleaned, y_train_cleaned,
    test_size=0.2,
    random_state=42,
    stratify=y_train_cleaned
)

print("Training split:", X_tr.shape, y_tr.shape)
print("Validation split:", X_val.shape, y_val.shape)

# -----------------------------
# 2. LightGBM Model
# -----------------------------
model = lgb.LGBMClassifier(
    n_estimators=800,
    learning_rate=0.03,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

# -----------------------------
# 3. Train on Train Split
# -----------------------------
model.fit(X_tr, y_tr)

# -----------------------------
# 4. Validate
# -----------------------------
val_preds = model.predict(X_val)
val_proba = model.predict_proba(X_val)[:, 1]

acc = accuracy_score(y_val, val_preds)
f1  = f1_score(y_val, val_preds)
auc = roc_auc_score(y_val, val_proba)

print("\nValidation Metrics:")
print(f"  Accuracy: {acc:.4f}")
print(f"  F1 Score: {f1:.4f}")
print(f"  ROC AUC:  {auc:.4f}")

# -----------------------------
# 5. Retrain on FULL training set
# -----------------------------
final_model = lgb.LGBMClassifier(
    n_estimators=800,
    learning_rate=0.03,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

final_model.fit(X_train_cleaned, y_train_cleaned)

# -----------------------------
# 6. Predict on Test
# -----------------------------
test_preds = final_model.predict(X_test)

# -----------------------------
# 7. Create Submission File
# -----------------------------
submission = pd.DataFrame({
    "id": test["id"],
    "task1": test_preds.astype(int)
})

submission.to_csv("submission_task1.csv", index=False)
print("\nSaved submission as submission_task1.csv")
submission.head()

Training split: (78198, 32) (78198,)
Validation split: (19550, 32) (19550,)
[LightGBM] [Info] Number of positive: 27303, number of negative: 50895
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020346 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7694
[LightGBM] [Info] Number of data points in the train set: 78198, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.349152 -> initscore=-0.622768
[LightGBM] [Info] Start training from score -0.622768

Validation Metrics:
  Accuracy: 0.8080
  F1 Score: 0.6914
  ROC AUC:  0.8981
[LightGBM] [Info] Number of positive: 34129, number of negative: 63619
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026684 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7694
[LightGBM] [Info] Number of data points in the train set: 97748, number of used featu

Unnamed: 0,id,task1
0,ANS00001,1
1,ANS00002,0
2,ANS00003,0
3,ANS00004,0
4,ANS00005,0
