In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import StackingClassifier

In [32]:
# Load datasets
train_df = pd.read_csv("/kaggle/input/thapar-kaggle-hack-v-01/X_train.csv")
test_df = pd.read_csv("/kaggle/input/thapar-kaggle-hack-v-01/X_test.csv")
sample_submission = pd.read_csv("/kaggle/input/thapar-kaggle-hack-v-01/sample_submission.csv") 

In [33]:
# Separate features and target
X = train_df.drop(columns=["target"])
y = train_df["target"]

# Store test IDs and remove 'id' column
if "id" in test_df.columns:
    test_ids = test_df["id"]
    test_df = test_df.drop(columns=["id"])
else:
    test_ids = None

# Identify categorical features
cat_features = ["feature_10", "feature_11", "feature_12"]

In [34]:
# Handle missing values
imputer = SimpleImputer(strategy="most_frequent")
X[cat_features] = imputer.fit_transform(X[cat_features])
test_df[cat_features] = imputer.transform(test_df[cat_features])

# Encode categorical features
encoder = LabelEncoder()
for col in cat_features:
    X[col] = encoder.fit_transform(X[col])
    test_df[col] = encoder.transform(test_df[col])

# Handle missing values for numeric data
num_imputer = SimpleImputer(strategy="mean")
X.iloc[:, :] = num_imputer.fit_transform(X)
test_df.iloc[:, :] = num_imputer.transform(test_df)

In [35]:
# Normalize data
scaler = StandardScaler()
X = scaler.fit_transform(X)
test_df = scaler.transform(test_df)

In [36]:
# Split dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
# Train the LightGBM model with optimized hyperparameters
model = lgb.LGBMClassifier(
    n_estimators=500,         # Number of boosting rounds
    max_depth=10,             # Tree depth for complexity control
    learning_rate=0.05,       # Learning rate for better convergence
    subsample=0.8,            # Use 80% of data per boosting iteration
    colsample_bytree=0.8,     # Use 80% of features per tree
    reg_lambda=1,             # L2 regularization
    num_leaves=50,            # Number of leaves in one tree
    min_child_samples=20,     # Minimum data per leaf
    random_state=42,
    force_col_wise=True
    
)

model.fit(X_train, y_train)

[LightGBM] [Info] Total Bins 5622
[LightGBM] [Info] Number of data points in the train set: 12800, number of used features: 25
[LightGBM] [Info] Start training from score -1.619252
[LightGBM] [Info] Start training from score -2.562530
[LightGBM] [Info] Start training from score -2.197381
[LightGBM] [Info] Start training from score -2.379702
[LightGBM] [Info] Start training from score -2.243432
[LightGBM] [Info] Start training from score -2.967996
[LightGBM] [Info] Start training from score -2.503516
[LightGBM] [Info] Start training from score -2.107327
[LightGBM] [Info] Start training from score -2.795346
[LightGBM] [Info] Start training from score -2.310428


In [42]:
# Evaluate accuracy
y_val_pred = stacking_model.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

# Make predictions on test set
test_preds = stacking_model.predict(test_df) 

Validation Accuracy: 0.6500


In [43]:
# Create submission file
submission = pd.DataFrame({"id": test_ids, "target": test_preds})
submission.to_csv("submission.csv", index=False)

print("Submission file saved as 'submission.csv'")

Submission file saved as 'submission.csv'
