In [7]:
import pandas as pd
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder

# Load datasets
# The paths are declared once and reused below, so pointing the script at a
# different copy of the data only requires editing these two lines.
train_path = '/kaggle/input/credit-score-classification/train.csv'
test_path = '/kaggle/input/credit-score-classification/test.csv'

# low_memory=False asks pandas to parse each file in one pass rather than in
# internal chunks, which avoids per-chunk mixed dtype inference.
train_data = pd.read_csv(train_path, low_memory=False)
test_data = pd.read_csv(test_path, low_memory=False)

# Debug column names (printed exactly as read from the CSV headers)
print("Train Data Columns:", train_data.columns)
print("Test Data Columns:", test_data.columns)

# Clean column names: drop leading/trailing whitespace from every header in
# both frames so later lookups by name behave the same for train and test.
for frame in (train_data, test_data):
    frame.columns = frame.columns.str.strip()

# Verify if target column exists; fail immediately with a clear message
# rather than further down when the target is first accessed.
target_present = 'Credit_Score' in train_data.columns
if not target_present:
    raise KeyError("The target column 'Credit_Score' is missing in the training data.")

# Preprocessing
# Handle mixed types: every training column stored with object dtype is cast
# to str, and the same-named test column (when there is one) is cast too.
object_columns = [name for name in train_data.columns if train_data[name].dtype == 'object']
for col in object_columns:
    train_data[col] = train_data[col].astype(str)
    if col in test_data.columns:
        test_data[col] = test_data[col].astype(str)

# Fill missing values with the sentinel -999, in place, in both frames.
# NOTE(review): after the str cast above, missing entries in object columns
# are presumably already text (e.g. 'nan'), so this fill would only touch the
# remaining non-object columns — confirm.
for frame in (train_data, test_data):
    frame.fillna(-999, inplace=True)

# Encode categorical features
# Each object-typed training column except the target is replaced by integer
# codes from its own LabelEncoder. The fitted encoders are kept in
# `label_encoders`, keyed by column name.
categorical_columns = train_data.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_columns:
    if col == 'Credit_Score':  # Skip the target column; it is encoded separately
        continue

    in_test = col in test_data.columns
    le = LabelEncoder()

    # Fit on train plus test values (when the column exists in test) so that
    # transforming the test column never meets a label the encoder has not
    # seen. A column absent from test is fitted on the training values alone
    # instead of raising KeyError.
    if in_test:
        combined_data = pd.concat([train_data[col], test_data[col]], axis=0)
    else:
        combined_data = train_data[col]
    le.fit(combined_data.astype(str))

    train_data[col] = le.transform(train_data[col].astype(str))
    if in_test:
        test_data[col] = le.transform(test_data[col].astype(str))
    label_encoders[col] = le

# Encode target variable
# The fitted encoder is kept so integer predictions can be mapped back to the
# original labels via `target_encoder.inverse_transform`.
target_encoder = LabelEncoder()
train_data['Credit_Score'] = target_encoder.fit_transform(train_data['Credit_Score'])

# Feature-target split
X = train_data.drop(columns=['Credit_Score'])
y = train_data['Credit_Score']

# Split data: 80% train / 20% validation with a fixed seed.
# stratify=y keeps the class proportions of `y` in both partitions, so the
# validation metrics are computed on a class mix that matches the training
# data and every class is represented in the hold-out set.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# LightGBM Dataset
# The validation Dataset is created with the training Dataset as its
# reference.
train_dataset = lgb.Dataset(X_train, label=y_train)
val_dataset = lgb.Dataset(X_val, label=y_val, reference=train_dataset)

# LightGBM parameters
# num_class is taken from the fitted target encoder rather than hard-coded.
n_target_classes = len(target_encoder.classes_)
params = dict(
    objective='multiclass',
    num_class=n_target_classes,
    boosting_type='gbdt',
    metric='multi_logloss',
    learning_rate=0.05,
    num_leaves=31,
    feature_fraction=0.8,
)

# Train LightGBM model with proper callbacks
training_callbacks = [
    early_stopping(stopping_rounds=50),  # Early stopping with a 50-round patience
    log_evaluation(100),  # Log evaluation every 100 rounds
]
lgbm_model = lgb.train(
    params=params,
    train_set=train_dataset,
    num_boost_round=1000,  # upper bound on boosting rounds
    valid_sets=[train_dataset, val_dataset],
    callbacks=training_callbacks,
)

# Evaluate model
# `predict` returns one score column per class; the predicted class is the
# column index with the highest score.
y_val_pred = lgbm_model.predict(X_val)
y_val_pred_classes = y_val_pred.argmax(axis=1)

# Encoded class ids 0..n-1, in the same order as target_encoder.classes_.
# Passing them explicitly keeps `target_names` aligned with the report rows
# and the AUC columns aligned with `y_val_pred`, whichever classes happen to
# appear in the validation split.
class_labels = list(range(len(target_encoder.classes_)))

print("Classification Report:")
print(
    classification_report(
        y_val,
        y_val_pred_classes,
        labels=class_labels,
        target_names=target_encoder.classes_,
    )
)

# AUC Score
# Pass the label vector itself (not a one-hot matrix) so scikit-learn treats
# the target as multiclass and actually applies multi_class="ovr".
auc = roc_auc_score(y_val, y_val_pred, multi_class="ovr", labels=class_labels)
print(f"Validation AUC Score: {auc:.4f}")

# Predict test set
if 'Credit_Score' in test_data.columns:
    test_data = test_data.drop(columns=['Credit_Score'])  # Drop target if it exists in test

# Select exactly the training feature columns, in training order, so each
# test column is scored as the feature the model was trained on. A feature
# missing from the test data raises KeyError here rather than being silently
# misaligned.
test_features = test_data[X.columns]

test_predictions = lgbm_model.predict(test_features)
test_classes = test_predictions.argmax(axis=1)
# Map the integer class ids back to the original Credit_Score labels.
test_data['Predicted_Credit_Score'] = target_encoder.inverse_transform(test_classes)

# Save results to a writable directory
test_data[['Predicted_Credit_Score']].to_csv('/kaggle/working/predicted_credit_scores.csv', index=False)


Train Data Columns: Index(['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation',
       'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'Credit_Score'],
      dtype='object')
Test Data Columns: Index(['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation',
       'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
     