In [19]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import matplotlib.pyplot as plt


In [20]:
df = pd.read_csv('/Users/harshitagarwal/Desktop/Kaggle Competitions/Predicting Loan Payback/train.csv')

In [21]:
df.head(5)

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [22]:
df.describe()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,loan_paid_back
count,593994.0,593994.0,593994.0,593994.0,593994.0,593994.0,593994.0
mean,296996.5,48212.202976,0.120696,680.916009,15020.297629,12.356345,0.79882
std,171471.442236,26711.942078,0.068573,55.424956,6926.530568,2.008959,0.400883
min,0.0,6002.43,0.011,395.0,500.09,3.2,0.0
25%,148498.25,27934.4,0.072,646.0,10279.62,10.99,1.0
50%,296996.5,46557.68,0.096,682.0,15000.22,12.37,1.0
75%,445494.75,60981.32,0.156,719.0,18858.58,13.68,1.0
max,593993.0,393381.74,0.627,849.0,48959.95,20.99,1.0


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593994 entries, 0 to 593993
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    593994 non-null  int64  
 1   annual_income         593994 non-null  float64
 2   debt_to_income_ratio  593994 non-null  float64
 3   credit_score          593994 non-null  int64  
 4   loan_amount           593994 non-null  float64
 5   interest_rate         593994 non-null  float64
 6   gender                593994 non-null  object 
 7   marital_status        593994 non-null  object 
 8   education_level       593994 non-null  object 
 9   employment_status     593994 non-null  object 
 10  loan_purpose          593994 non-null  object 
 11  grade_subgrade        593994 non-null  object 
 12  loan_paid_back        593994 non-null  float64
dtypes: float64(5), int64(2), object(6)
memory usage: 58.9+ MB


In [24]:
train = pd.read_csv('/Users/harshitagarwal/Desktop/Kaggle Competitions/Predicting Loan Payback/train.csv')
test = pd.read_csv('/Users/harshitagarwal/Desktop/Kaggle Competitions/Predicting Loan Payback/test.csv')

In [25]:
target = 'loan_paid_back'
test_id = test['id'].copy()

In [26]:
X = train.drop(target, axis=1)
y = train[target]
test_X = test.copy()
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2 ,random_state=42)


In [27]:
lgb_params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "learning_rate": 0.03,
    "n_estimators": 8000,
    "num_leaves": 64,
    "max_depth": -1,
    "min_child_samples": 50,
    "subsample": 0.8,
    "subsample_freq": 1,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.1,
    "reg_lambda": 5.0,
    "scale_pos_weight": (len(y) - y.sum()) / y.sum(),
    "random_state": 42,
    "verbosity": -1,
    "n_jobs": -1
}


In [28]:
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(test_X))
fold_auc = []

In [29]:
N_SPLITS = 7
skf = StratifiedKFold(n_splits = N_SPLITS, shuffle=True, random_state = 42)

In [30]:
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
for col in cat_cols:
    X[col] = X[col].astype("category")
    test_X[col] = test_X[col].astype("category")
cat_features = [X.columns.get_loc(col) for col in cat_cols]

In [31]:
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold + 1}")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = LGBMClassifier(**lgb_params)

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="auc",
        categorical_feature=cat_features,
        callbacks=[
            early_stopping(stopping_rounds=300),
            log_evaluation(200)
        ]
    )

    val_pred = model.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = val_pred

    auc = roc_auc_score(y_val, val_pred)
    fold_auc.append(auc)
    print(f"AUC: {auc:.5f}")

    test_preds += model.predict_proba(test_X)[:, 1] / N_SPLITS



Fold 1
Training until validation scores don't improve for 300 rounds
[200]	valid_0's auc: 0.922033
[400]	valid_0's auc: 0.922991
[600]	valid_0's auc: 0.923269
[800]	valid_0's auc: 0.923293
Early stopping, best iteration is:
[688]	valid_0's auc: 0.923329
AUC: 0.92333

Fold 2
Training until validation scores don't improve for 300 rounds
[200]	valid_0's auc: 0.921083
[400]	valid_0's auc: 0.922091
[600]	valid_0's auc: 0.922356
[800]	valid_0's auc: 0.922499
[1000]	valid_0's auc: 0.922512
[1200]	valid_0's auc: 0.922521
[1400]	valid_0's auc: 0.922478
Early stopping, best iteration is:
[1149]	valid_0's auc: 0.922554
AUC: 0.92255

Fold 3
Training until validation scores don't improve for 300 rounds
[200]	valid_0's auc: 0.920652
[400]	valid_0's auc: 0.921668
[600]	valid_0's auc: 0.921966
[800]	valid_0's auc: 0.92209
[1000]	valid_0's auc: 0.922047
Early stopping, best iteration is:
[795]	valid_0's auc: 0.922102
AUC: 0.92210

Fold 4
Training until validation scores don't improve for 300 rounds
[2

In [32]:
pd.DataFrame({
    'id' : test_id,
    target : test_preds
}).to_csv('LGBM_submission.csv', index=False)