In [None]:

# Train SVM using train_preprocessed.csv + predict on test_preprocessed.csv

import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import joblib

# ------------------------------------------------------------------
# INPUT / OUTPUT LOCATIONS
# ------------------------------------------------------------------
TRAIN_FILE = "train_preprocessed.csv"
TEST_FILE = "test_preprocessed.csv"
OUT_FILE = "submission_svm_binary.csv"

# Column roles: the label to learn and the row identifier (never a feature).
TARGET = "RiskFlag"
IDCOL = "ProfileID"

# ------------------------------------------------------------------
# LOAD DATA
# ------------------------------------------------------------------
# Both CSVs are read in the same order as before: train first, then test.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_FILE, TEST_FILE))

# Training features are every column except the label and the identifier;
# the label itself is cast to integers.
X_train = train.drop([TARGET, IDCOL], axis=1)
y_train = train.loc[:, TARGET].astype(int)

# Test features: strip the identifier if present (a missing identifier
# column is tolerated here).
X_test = test.drop(IDCOL, axis=1, errors="ignore")

# Make sure train & test columns match.
# Record any mismatch first so it is visible instead of being silently
# patched over: features the test file lacks, and features it has that the
# model was never trained on.
missing_cols = [c for c in X_train.columns if c not in X_test.columns]
extra_cols = [c for c in X_test.columns if c not in X_train.columns]

if missing_cols:
    print("Test data is missing columns (filled with 0.0):", missing_cols)
if extra_cols:
    print("Test data has extra columns (dropped):", extra_cols)

# A single reindex does the whole alignment: it creates the missing columns
# filled with 0.0, discards the extra ones, and puts the result in the
# training column order.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0.0)

print("Shapes:")
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test :", X_test.shape)

# ------------------------------------------------------------------
# TRAIN SVM (CALIBRATED)
# ------------------------------------------------------------------
print("\nTraining Calibrated Linear SVM...")

# Linear SVM with a fixed regularisation strength (C=1).
base_svm = LinearSVC(
    dual=False,
    C=1.0,
    max_iter=5000,
)

# LinearSVC has no predict_proba of its own; the calibration wrapper
# (sigmoid method, 5 CV folds) supplies the probabilities used later.
# fit() returns the fitted estimator, so `model` is the trained wrapper.
model = CalibratedClassifierCV(base_svm, method="sigmoid", cv=5).fit(
    X_train, y_train
)

print("Training done.")

# --------------------
# PREDICT PROBABILITIES
# --------------------
# Keep the second probability column only.
# NOTE(review): assumes the labels are 0/1 so that column 1 is the
# positive (risk) class - confirm against model.classes_.
probs = model.predict_proba(X_test)[:, 1]

# --------------------
# CONVERT TO BINARY (KAGGLE NEEDS 0/1)
# --------------------
# Rows whose probability is at or above the threshold are flagged as 1.
threshold = 0.5
binary_preds = (probs >= threshold).astype(int)

# --------------------
# SAVE SUBMISSION
# --------------------
# The header is built from the IDCOL / TARGET constants rather than from
# repeated string literals, so the submission columns always match the
# names used to read the data.
submission = pd.DataFrame({
    IDCOL: test[IDCOL],
    TARGET: binary_preds,
})

submission.to_csv(OUT_FILE, index=False)
print("\nSaved Kaggle submission:", OUT_FILE)
print(submission.head(10))


Shapes:
X_train: (204277, 31)
y_train: (204277,)
X_test : (51070, 31)

Training Calibrated Linear SVM...
Training done.

Saved Kaggle submission: submission_svm_binary.csv
    ProfileID  RiskFlag
0  CKV34LU7V7         0
1  62KTYNH93J         0
2  JGFUSOIUH7         0
3  4538THBHOX         0
4  DXLNA06JHR         0
5  GPXN95JGCL         0
6  FIIMGYJEVG         0
7  22OCHURLWV         0
8  OKUSLEJ0SG         0
9  CL8B73H82O         0
