In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.impute import SimpleImputer
import os

# Directory holding the feature-engineered CSV files (relative path).
FEATURE_PATH = '../'

# Build the CSV location first, then read it into the frame that the
# later cells use as the single source of training rows.
train_features_csv = os.path.join(FEATURE_PATH, 'train_features.csv')
full_train_df = pd.read_csv(train_features_csv)

print("Feature-engineered training data loaded successfully")
print(f"Shape: {full_train_df.shape}")

Feature-engineered training data loaded successfully
Shape: (3043, 40)


In [2]:
# Build the model inputs: every column except identifiers / label / bookkeeping
# columns becomes a feature, and 'target' is the label.
excluded = ('object_id', 'target', 'split', 'English Translation')
features = [name for name in full_train_df.columns if name not in excluded]

y = full_train_df['target']

# Fill missing feature values with the per-column median, then restore the
# column names that the imputer's array output drops.
# NOTE(review): the medians are computed from every row, before the
# cross-validation split, so held-out folds contribute to the imputation
# statistics — consider imputing inside each fold instead.
imputer = SimpleImputer(strategy='median')
X = pd.DataFrame(imputer.fit_transform(full_train_df[features]), columns=features)

print("Data prepared for modeling:")
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

Data prepared for modeling:
Features shape: (3043, 36)
Target shape: (3043,)


In [None]:
# --- Cross-validation configuration ---
N_SPLITS = 5
SEED = 42
EARLY_STOPPING_ROUNDS = 100   # patience (boosting rounds) for the early-stopping callback
DECISION_THRESHOLD = 0.5      # probability cutoff used to turn scores into class labels

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Out-of-fold positive-class probabilities: each row is written exactly once,
# by the fold in which that row is held out. Kept so that later steps
# (e.g. threshold tuning) can work on predictions for the whole training set.
oof_preds = np.zeros(len(X))
f1_scores = []

# Hyper-parameters shared by every fold; only 'scale_pos_weight' depends on the
# fold's class balance and is added inside the loop.
# NOTE(review): LightGBM applies row subsampling only when subsample_freq
# (bagging_freq) is > 0; it is not set here, so 'subsample' is presumably
# inactive — confirm and set 'subsample_freq' if bagging is intended.
base_lgb_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'seed': SEED,
    'n_jobs': -1,
    'verbose': -1,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
}

print(f"\nStarting training with {N_SPLITS}-Fold Stratified Cross-Validation...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")

    # Split the data
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Class weighting: up-weight positives by the negative/positive ratio of
    # this fold's training part. Counting with boolean masks yields 0 for an
    # absent class (value_counts()[label] would raise KeyError instead), so
    # the pos_count guard below is actually reachable.
    neg_count = int((y_train == 0).sum())
    pos_count = int((y_train == 1).sum())
    scale_pos_weight_value = neg_count / pos_count if pos_count > 0 else 1

    lgb_params = {**base_lgb_params, 'scale_pos_weight': scale_pos_weight_value}
    model = lgb.LGBMClassifier(**lgb_params)

    # Train the model. Early stopping monitors the 'metric' configured in
    # lgb_params (binary_logloss) on the validation fold. The string 'f1' is
    # not a built-in LightGBM metric name, so it is not passed as eval_metric;
    # F1 is computed explicitly after prediction instead.
    # NOTE(review): the fold used for early stopping is also the fold being
    # scored, so the reported F1 values are somewhat optimistic.
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS, verbose=False)])

    # Predict positive-class probabilities for the held-out rows and store
    # them in their out-of-fold slots.
    val_preds_proba = model.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = val_preds_proba

    val_preds_binary = (val_preds_proba > DECISION_THRESHOLD).astype(int)

    f1 = f1_score(y_val, val_preds_binary)
    f1_scores.append(f1)
    print(f"F1 Score for Fold {fold+1}: {f1:.4f}")

# Final Results
mean_f1 = np.mean(f1_scores)
print("\n--- Cross-Validation Summary ---")
print(f"Mean F1 Score across all folds: {mean_f1:.4f}")


Starting training with 5-Fold Stratified Cross-Validation...
--- Fold 1/5 ---
F1 Score for Fold 1: 0.1852
--- Fold 2/5 ---
F1 Score for Fold 2: 0.3137
--- Fold 3/5 ---
F1 Score for Fold 3: 0.4082
--- Fold 4/5 ---
F1 Score for Fold 4: 0.3913
--- Fold 5/5 ---
F1 Score for Fold 5: 0.1600

--- Cross-Validation Summary ---
Mean F1 Score across all folds: 0.2917


The mean F1 score of 0.2917 is our baseline. The F1 score fluctuated considerably between folds (from a low of 0.16 to a high of 0.41). This happens because the dataset is imbalanced: each validation fold contains only a small number of positive samples, so a few predictions changing can shift a fold's F1 score substantially, depending on how the data is split.

The next step is to find, for each fold, the probability threshold that maximizes the F1 score, instead of relying on the fixed 0.5 cutoff.