In [1]:
!pip install lightgbm



In [2]:
# Kaggle-ready ML pipeline

import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from sklearn.impute import SimpleImputer
import warnings

warnings.filterwarnings('ignore')

In [3]:
# Set random seed for reproducibility
np.random.seed(42)

In [4]:
# Update file paths for Kaggle
train_df = pd.read_csv('/kaggle/input/ml-ai-hackathon-2025/train.csv')
train_labels_df = pd.read_csv('/kaggle/input/ml-ai-hackathon-2025/train_labels.csv')
test_df = pd.read_csv('/kaggle/input/ml-ai-hackathon-2025/test.csv')

In [6]:
# Merge labels with training data
train_labeled = pd.merge(train_labels_df, train_df, on='Id')

In [7]:
# Prepare feature and label sets
X = train_labeled.drop(columns=['Id', 'Class_x', 'Class_y'], errors='ignore')
y = train_labeled['Class_x']
X_test = test_df.drop(columns=['Id'], errors='ignore')

In [9]:
# Impute missing values
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
X_test_imputed = imputer.transform(X_test)

In [10]:
# Feature selection
k_best = SelectKBest(score_func=f_classif, k=100)  # Tune k as needed
X_selected = k_best.fit_transform(X_imputed, y)
X_test_selected = k_best.transform(X_test_imputed)

In [11]:
# Initialize Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

test_preds = np.zeros((X_test.shape[0], len(np.unique(y))))
val_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_selected, y)):
    X_train, X_val = X_selected[train_idx], X_selected[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = lgb.LGBMClassifier(
        objective='multiclass',
        num_class=len(np.unique(y)),
        metric='multi_logloss',
        random_state=42,
        n_estimators=500,
        learning_rate=0.05
    )

    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])

    val_pred = model.predict(X_val)
    acc = accuracy_score(y_val, val_pred)
    val_scores.append(acc)

    test_preds += model.predict_proba(X_test_selected) / skf.n_splits

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002655 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3922
[LightGBM] [Info] Number of data points in the train set: 120, number of used features: 100
[LightGBM] [Info] Start training from score -1.742969
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -0.980829
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -1.696449
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000240 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3901
[LightGBM] [Info] Number of data points in the train set: 120, number of used features: 100
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.742969
[LightGBM] [Info] Start training from score -0

In [12]:
# Final test predictions
final_predictions = np.argmax(test_preds, axis=1)
# If your classes are 1-indexed, add 1 to predictions:
# final_predictions = final_predictions + 1

In [13]:
# Submission file
submission_df = pd.DataFrame({'Id': test_df['Id'], 'Class': final_predictions})
submission_df.to_csv('submission.csv', index=False)

In [14]:
# Optional: print cross-validation score
print(f"Validation Accuracy Scores: {val_scores}")
print(f"Mean Validation Accuracy: {np.mean(val_scores):.4f}")

Validation Accuracy Scores: [1.0, 0.9666666666666667, 1.0, 1.0, 1.0]
Mean Validation Accuracy: 0.9933
