In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, classification_report
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
from lightgbm import early_stopping
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import warnings
warnings.filterwarnings('ignore')

In [16]:
# load data
train_df = pd.read_csv('/content/Training_TriGuard.csv')
test_df = pd.read_csv('/content/Testing_TriGuard.csv')
sample_submission = pd.read_csv('/content/sample_submission.csv')


In [17]:
# prepare features from target
target_col = 'subrogation'
id_col = 'claim_number'

# drop rows where target is missing
train_df = train_df.dropna(subset=[target_col])

y = train_df[target_col]
X = train_df.drop(columns=[target_col])
X_test = test_df.copy()

In [18]:
# train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [19]:
#encode categorical columns
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = X_train.select_dtypes(exclude=['object', 'category']).columns.tolist()

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit-transform on training
X_train_cat = pd.DataFrame(
    encoder.fit_transform(X_train[cat_cols]),
    columns=encoder.get_feature_names_out(cat_cols),
    index=X_train.index
)
X_val_cat = pd.DataFrame(
    encoder.transform(X_val[cat_cols]),
    columns=encoder.get_feature_names_out(cat_cols),
    index=X_val.index
)
X_test_cat = pd.DataFrame(
    encoder.transform(X_test[cat_cols]),
    columns=encoder.get_feature_names_out(cat_cols),
    index=X_test.index
)

# Combine categorical + numeric features
X_train_encoded = pd.concat([X_train[num_cols].reset_index(drop=True),
                             X_train_cat.reset_index(drop=True)], axis=1)
X_val_encoded = pd.concat([X_val[num_cols].reset_index(drop=True),
                           X_val_cat.reset_index(drop=True)], axis=1)
X_test_encoded = pd.concat([X_test[num_cols].reset_index(drop=True),
                            X_test_cat.reset_index(drop=True)], axis=1)



In [20]:
# SMOTE
sm = SMOTE(random_state=42, sampling_strategy=0.5)
X_train_res, y_train_res = sm.fit_resample(X_train_encoded, y_train)
print(f"After SMOTE → Positive class proportion: {y_train_res.mean():.3f}")



After SMOTE → Positive class proportion: 0.333


In [None]:
# Cross validate LIGHTGBM
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
val_preds_cv = np.zeros(len(X_val_encoded))
test_preds_cv = np.zeros(len(X_test_encoded))

# Track F1 scores for each fold
fold_f1_scores = []
fold_numbers = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_res, y_train_res)):
    print(f"\n===== Fold {fold + 1} =====")
    X_tr, X_vl = X_train_res.iloc[train_idx], X_train_res.iloc[val_idx]
    y_tr, y_vl = y_train_res.iloc[train_idx], y_train_res.iloc[val_idx]

    model = lgb.LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        num_leaves=31,
        colsample_bytree=0.8,
        subsample=0.8,
        random_state=42
    )

    model.fit(
        X_tr, y_tr,
        eval_set=[(X_vl, y_vl)],
        eval_metric='f1',
        callbacks=[early_stopping(stopping_rounds=50, verbose=True)]
    )

    # Get predictions for this fold on validation set
    val_pred_fold = model.predict_proba(X_val_encoded)[:, 1]
    test_pred_fold = model.predict_proba(X_test_encoded)[:, 1]
    
    # Find best threshold for this fold's validation predictions
    thresholds = np.linspace(0.1, 0.9, 50)
    f1_scores_fold = [f1_score(y_val, (val_pred_fold > t).astype(int)) for t in thresholds]
    best_thresh_fold = thresholds[np.argmax(f1_scores_fold)]
    best_f1_fold = max(f1_scores_fold)
    
    # Store F1 score for this fold
    fold_f1_scores.append(best_f1_fold)
    fold_numbers.append(fold + 1)
    
    print(f"Fold {fold + 1} - Best Threshold: {best_thresh_fold:.3f}, F1 Score: {best_f1_fold:.4f}")

    val_preds_cv += val_pred_fold / kf.n_splits
    test_preds_cv += test_pred_fold / kf.n_splits

# Calculate overall F1 score
thresholds_overall = np.linspace(0.1, 0.9, 50)
f1_scores_overall = [f1_score(y_val, (val_preds_cv > t).astype(int)) for t in thresholds_overall]
best_f1_overall = max(f1_scores_overall)
fold_f1_scores.append(best_f1_overall)
fold_numbers.append('Overall')

print(f"\n===== Summary =====")
print(f"Individual Fold F1 Scores: {[f'{f:.4f}' for f in fold_f1_scores[:-1]]}")
print(f"Overall F1 Score: {best_f1_overall:.4f}")




===== Fold 1 =====
[LightGBM] [Info] Number of positive: 4443, number of negative: 8885
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062321 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 13328, number of used features: 448
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333358 -> initscore=-0.693035
[LightGBM] [Info] Start training from score -0.693035
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[158]	valid_0's binary_logloss: 0.340745

===== Fold 2 =====
[LightGBM] [Info] Number of positive: 4443, number of negative: 8885
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.137387 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14625

In [None]:
#threshold F1
thresholds = np.linspace(0.1, 0.9, 50)
f1_scores = [f1_score(y_val, (val_preds_cv > t).astype(int)) for t in thresholds]
best_thresh = thresholds[np.argmax(f1_scores)]
print(f"\nBest Threshold = {best_thresh:.3f} | F1 = {max(f1_scores):.4f}")

# Create bar chart comparing F1 scores across folds
plt.figure(figsize=(10, 6))
bars = plt.bar(range(len(fold_f1_scores)), fold_f1_scores, 
               color=['steelblue'] * (len(fold_f1_scores) - 1) + ['darkorange'])

# Customize the chart
plt.xlabel('Fold Number', fontsize=12, fontweight='bold')
plt.ylabel('F1 Score', fontsize=12, fontweight='bold')
plt.title('F1 Score Comparison Across Cross-Validation Folds', fontsize=14, fontweight='bold')
plt.xticks(range(len(fold_f1_scores)), fold_numbers, fontsize=10)
plt.ylim([min(fold_f1_scores) * 0.95, max(fold_f1_scores) * 1.05])

# Add value labels on bars
for i, (bar, score) in enumerate(zip(bars, fold_f1_scores)):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
             f'{score:.4f}', ha='center', va='bottom', fontsize=9, fontweight='bold')

# Add grid for better readability
plt.grid(axis='y', alpha=0.3, linestyle='--')

# Add legend
legend_elements = [Patch(facecolor='steelblue', label='Individual Fold F1'),
                   Patch(facecolor='darkorange', label='Overall F1')]
plt.legend(handles=legend_elements, loc='upper right')

plt.tight_layout()
plt.show()

# Print statistics
print(f"\nF1 Score Statistics:")
print(f"  Mean (Folds 1-5): {np.mean(fold_f1_scores[:-1]):.4f}")
print(f"  Std Dev (Folds 1-5): {np.std(fold_f1_scores[:-1]):.4f}")
print(f"  Min: {np.min(fold_f1_scores[:-1]):.4f}")
print(f"  Max: {np.max(fold_f1_scores[:-1]):.4f}")




Best Threshold = 0.296 | F1 = 0.5739


In [23]:
# Final evaluation
val_final = (val_preds_cv > best_thresh).astype(int)
print("\nValidation Results:")
print(classification_report(y_val, val_final, digits=4))




Validation Results:
              precision    recall  f1-score   support

         0.0     0.8938    0.7944    0.8412      2777
         1.0     0.4956    0.6817    0.5739       823

    accuracy                         0.7686      3600
   macro avg     0.6947    0.7380    0.7075      3600
weighted avg     0.8028    0.7686    0.7801      3600



In [24]:
#submission
submission = pd.DataFrame({
    id_col: test_df[id_col],
    target_col: (test_preds_cv > best_thresh).astype(int)
})

# Ensure claim_number column exists
if id_col not in submission.columns:
    raise KeyError(f"'{id_col}' column not found in submission!")

submission.to_csv('submission.csv', index=False)
print("\n✅ Submission file created: 'submission.csv'")
print(submission.head())



✅ Submission file created: 'submission.csv'
   claim_number  subrogation
0       3126034            0
1       7380142            0
2       4655051            0
3       6728725            1
4       9848460            1


In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
