In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('Lending Club Loan.csv')

In [3]:
data.head()

Unnamed: 0,emp_title,emp_length,state,homeownership,annual_income,verified_income,debt_to_income,annual_income_joint,verification_income_joint,debt_to_income_joint,...,sub_grade,issue_month,loan_status,initial_listing_status,disbursement_method,balance,paid_total,paid_principal,paid_interest,paid_late_fees
0,global config engineer,3.0,NJ,MORTGAGE,90000.0,Verified,18.01,,,,...,C3,Mar-2018,Current,whole,Cash,27015.86,1999.33,984.14,1015.19,0.0
1,warehouse office clerk,10.0,HI,RENT,40000.0,Not Verified,5.04,,,,...,C1,Feb-2018,Current,whole,Cash,4651.37,499.12,348.63,150.49,0.0
2,assembly,3.0,WI,RENT,40000.0,Source Verified,21.15,,,,...,D1,Feb-2018,Current,fractional,Cash,1824.63,281.8,175.37,106.43,0.0
3,customer service,1.0,PA,RENT,30000.0,Not Verified,10.16,,,,...,A3,Jan-2018,Current,whole,Cash,18853.26,3312.89,2746.74,566.15,0.0
4,security supervisor,10.0,CA,RENT,35000.0,Verified,57.96,57000.0,Verified,37.66,...,C3,Mar-2018,Current,whole,Cash,21430.15,2324.65,1569.85,754.8,0.0


In [4]:
data.isnull().sum()

emp_title                            833
emp_length                           817
state                                  0
homeownership                          0
annual_income                          0
verified_income                        0
debt_to_income                        24
annual_income_joint                 8505
verification_income_joint           8545
debt_to_income_joint                8505
delinq_2y                              0
months_since_last_delinq            5658
earliest_credit_line                   0
inquiries_last_12m                     0
total_credit_lines                     0
open_credit_lines                      0
total_credit_limit                     0
total_credit_utilized                  0
num_collections_last_12m               0
num_historical_failed_to_pay           0
months_since_90d_late               7715
current_accounts_delinq                0
total_collection_amount_ever           0
current_installment_accounts           0
accounts_opened_

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

columns_to_drop = ['emp_title', 'state', 'loan_purpose', 'application_type']
data_cleaned = data.drop(columns=columns_to_drop)

# For numeric columns,fill missing values with the median.
numeric_cols = data_cleaned.select_dtypes(include=['float64', 'int64']).columns
imputer = SimpleImputer(strategy='median')
data_cleaned[numeric_cols] = imputer.fit_transform(data_cleaned[numeric_cols])

# For categorical columns, fill missing values with the mode (most frequent value).
categorical_cols = data_cleaned.select_dtypes(include=['object']).columns
imputer = SimpleImputer(strategy='most_frequent')
data_cleaned[categorical_cols] = imputer.fit_transform(data_cleaned[categorical_cols])


label_encoder = LabelEncoder()

for col in categorical_cols:
    data_cleaned[col] = label_encoder.fit_transform(data_cleaned[col])

# Step 4: Verify the cleaning process (check for remaining missing values and data types)
print(data_cleaned.isnull().sum())  # Should print 0 for all columns
print(data_cleaned.dtypes)          # Ensure the data types are appropriate


emp_length                          0
homeownership                       0
annual_income                       0
verified_income                     0
debt_to_income                      0
annual_income_joint                 0
verification_income_joint           0
debt_to_income_joint                0
delinq_2y                           0
months_since_last_delinq            0
earliest_credit_line                0
inquiries_last_12m                  0
total_credit_lines                  0
open_credit_lines                   0
total_credit_limit                  0
total_credit_utilized               0
num_collections_last_12m            0
num_historical_failed_to_pay        0
months_since_90d_late               0
current_accounts_delinq             0
total_collection_amount_ever        0
current_installment_accounts        0
accounts_opened_24m                 0
months_since_last_credit_inquiry    0
num_satisfactory_accounts           0
num_accounts_120d_past_due          0
num_accounts

In [13]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

X = data_cleaned.drop(columns=['loan_status'])  # All features except target
y = data_cleaned['loan_status']  # Target column

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Original class distribution in y_train:")
print(y_train.value_counts())

print("Resampled class distribution in y_train:")
print(y_train_resampled.value_counts())


Original class distribution in y_train:
loan_status
1    7517
2     343
5      55
3      49
4      29
0       7
Name: count, dtype: int64
Resampled class distribution in y_train:
loan_status
1    7517
2    7517
5    7517
3    7517
4    7517
0    7517
Name: count, dtype: int64


In [17]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   - -------------------------------------- 0.0/1.5 MB 279.3 kB/s eta 0:00:06
   - -------------------------------------- 0.1/1.5 MB 435.7 kB/s eta 0:00:04
   ---- ----------------------------------- 0.2/1.5 MB 702.7 kB/s eta 0:00:02
   ------ --------------------------------- 0.2/1.5 MB 901.1 kB/s eta 0:00:02
   -------- ------------------------------- 0.3/1.5 MB 999.9 kB/s eta 0:00:02
   ---------- ----------------------------- 0.4/1.5 MB 1.1 MB/s eta 0:00:01
   ------------------- -------------------- 0.7/1.5 MB 1.7 MB/s eta 0:00:01
   --------------------------- ------------ 1.0/1.5 MB 2.2 MB/s eta 0:00:01
   ------------------------

In [19]:
import lightgbm as lgb
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

lgbm_model = lgb.LGBMClassifier(random_state=42)
lgbm_model.fit(X_train_resampled, y_train_resampled)

y_pred_lgbm = lgbm_model.predict(X_test)

print("LightGBM Classifier - Performance Report")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lgbm)}")
print(classification_report(y_test, y_pred_lgbm))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008935 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10000
[LightGBM] [Info] Number of data points in the train set: 45102, number of used features: 47
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
LightGBM Classifier - Performance Report
Accuracy: 0.9825
              precision    recall  f1-score   support

           1       0.98      1.00      0.99      1858
           2       1.00      1.00      1.00       104
           3       0.00      0.00      0.00        18
           4       0.00      0.00      0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
