<a href="https://colab.research.google.com/github/Kellozr/Lending_Club_Dataset/blob/main/predict_the_loan_grade_using_the_Lending_Club_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Common setup for all models (run this cell once, before any of the model cells)

In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# Load dataset
df = pd.read_csv("/content/loan.csv")

# Safe pre-loan features
safe_features = [
    'loan_amnt', 'term',  'emp_length', 'home_ownership',
    'annual_inc', 'verification_status', 'purpose', 'dti', 'open_acc', 'revol_util'
]

# Keep only rows with a known target. The explicit copy makes `df` an
# independent frame, so the column assignments below cannot trigger pandas'
# chained-assignment warning (which warnings.filterwarnings would hide).
df = df.dropna(subset=['grade'])
df = df[safe_features + ['grade']].copy()

# Numerical and categorical *feature* columns. The target is excluded here so
# that it is encoded exactly once, in the dedicated step below.
num_cols = df[safe_features].select_dtypes(include=['float64', 'int64']).columns
cat_cols = df[safe_features].select_dtypes(include=['object']).columns

# Encode categorical features as integer codes. This only maps each distinct
# string to an integer and learns no statistic from the data, so it is done on
# all rows; that also guarantees the test split never contains an unseen label.
for col in cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col].fillna('Unknown'))

# Encode target
df['grade'] = LabelEncoder().fit_transform(df['grade'])

X = df.drop('grade', axis=1)
y = df['grade']

# Split row positions BEFORE fitting the imputer and scaler, so the medians,
# means and variances are learned from training rows only (no test leakage).
# Same seed and row count as before, so the same rows land in each split.
train_idx, test_idx = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)

# Median-impute numeric features using training-row medians.
imputer = SimpleImputer(strategy='median').fit(X.iloc[train_idx][num_cols])
X[num_cols] = imputer.transform(X[num_cols])

# Standardise every feature using training-row mean / standard deviation.
scaler = StandardScaler().fit(X.iloc[train_idx])
X_scaled = scaler.transform(X)

# Final arrays consumed by the model cells.
X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]


Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Train logistic regression with the iteration cap raised to 1000.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Time only the prediction call on the held-out split.
t_start = time.time()
y_pred_test = model.predict(X_test)
elapsed_s = time.time() - t_start

# Gather the metrics first, then print them in one block.
train_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred_test)
report = classification_report(y_test, y_pred_test)

print("📌 Logistic Regression")
print("Train Accuracy:", train_accuracy)
print("Test Accuracy :", test_accuracy)
print("Inference Time (s):", round(elapsed_s, 4))
print("\nClassification Report:\n", report)


📌 Logistic Regression
Train Accuracy: 0.41146131805157593
Test Accuracy : 0.41987914148780997
Inference Time (s): 0.0011

Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.77      0.59      1409
           1       0.40      0.31      0.35      1400
           2       0.35      0.41      0.38      1182
           3       0.26      0.03      0.05       594
           4       0.31      0.02      0.04       211
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         1

    accuracy                           0.42      4799
   macro avg       0.26      0.22      0.20      4799
weighted avg       0.39      0.42      0.37      4799



In [None]:
from sklearn.svm import SVC

# Train a support vector classifier with the library's default settings.
model = SVC().fit(X_train, y_train)

# Time only the prediction call on the held-out split.
t_start = time.time()
y_pred_test = model.predict(X_test)
elapsed_s = time.time() - t_start

# Gather the metrics first, then print them in one block.
train_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred_test)
report = classification_report(y_test, y_pred_test)

print("📌 SVM")
print("Train Accuracy:", train_accuracy)
print("Test Accuracy :", test_accuracy)
print("Inference Time (s):", round(elapsed_s, 4))
print("\nClassification Report:\n", report)


📌 SVM
Train Accuracy: 0.4381349309716072
Test Accuracy : 0.4200875182329652
Inference Time (s): 4.7388

Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.78      0.59      1409
           1       0.38      0.34      0.36      1400
           2       0.35      0.37      0.36      1182
           3       0.40      0.03      0.06       594
           4       0.50      0.00      0.01       211
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         1

    accuracy                           0.42      4799
   macro avg       0.30      0.22      0.20      4799
weighted avg       0.41      0.42      0.37      4799



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a 100-tree random forest. The forest is built with randomness
# (bootstrap row sampling and per-split feature sampling), so the seed is
# fixed to make the metrics printed below repeat from run to run; 42 matches
# the seed already used for the train/test split.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Time only the prediction call on the held-out split.
start = time.time()
y_pred_test = model.predict(X_test)
end = time.time()

# Report accuracy on both splits, the prediction time, and per-class metrics.
print("📌 Random Forest")
print("Train Accuracy:", accuracy_score(y_train, model.predict(X_train)))
print("Test Accuracy :", accuracy_score(y_test, y_pred_test))
print("Inference Time (s):", round(end - start, 4))
print("\nClassification Report:\n", classification_report(y_test, y_pred_test))


📌 Random Forest
Train Accuracy: 1.0
Test Accuracy : 0.40800166701396123
Inference Time (s): 0.1326

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.66      0.57      1409
           1       0.36      0.39      0.38      1400
           2       0.35      0.33      0.34      1182
           3       0.28      0.13      0.18       594
           4       0.38      0.09      0.15       211
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         1

    accuracy                           0.41      4799
   macro avg       0.27      0.23      0.23      4799
weighted avg       0.39      0.41      0.39      4799



In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Train a k-nearest-neighbours classifier with the library's default settings.
model = KNeighborsClassifier().fit(X_train, y_train)

# Time only the prediction call on the held-out split.
t_start = time.time()
y_pred_test = model.predict(X_test)
elapsed_s = time.time() - t_start

# Gather the metrics first, then print them in one block.
train_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred_test)
report = classification_report(y_test, y_pred_test)

print("📌 KNN")
print("Train Accuracy:", train_accuracy)
print("Test Accuracy :", test_accuracy)
print("Inference Time (s):", round(elapsed_s, 4))
print("\nClassification Report:\n", report)


📌 KNN
Train Accuracy: 0.551133107580099
Test Accuracy : 0.36757657845384456
Inference Time (s): 1.2665

Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.62      0.51      1409
           1       0.34      0.37      0.35      1400
           2       0.32      0.26      0.29      1182
           3       0.23      0.11      0.15       594
           4       0.11      0.04      0.06       211
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         1

    accuracy                           0.37      4799
   macro avg       0.21      0.20      0.19      4799
weighted avg       0.34      0.37      0.35      4799



In [None]:
from xgboost import XGBClassifier

# Train a gradient-boosted tree classifier, evaluated with multi-class log loss.
# `use_label_encoder` is no longer passed: it is deprecated in XGBoost and
# current releases only warn that the parameter is unused. It is also not
# needed here, because `y_train` already holds integer codes produced by
# LabelEncoder in the setup cell.
model = XGBClassifier(eval_metric='mlogloss')
model.fit(X_train, y_train)

# Time only the prediction call on the held-out split.
start = time.time()
y_pred_test = model.predict(X_test)
end = time.time()

# Report accuracy on both splits, the prediction time, and per-class metrics.
print("📌 XGBoost")
print("Train Accuracy:", accuracy_score(y_train, model.predict(X_train)))
print("Test Accuracy :", accuracy_score(y_test, y_pred_test))
print("Inference Time (s):", round(end - start, 4))
print("\nClassification Report:\n", classification_report(y_test, y_pred_test))


📌 XGBoost
Train Accuracy: 0.7400885647303985
Test Accuracy : 0.40862679724942697
Inference Time (s): 0.058

Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.65      0.58      1409
           1       0.37      0.37      0.37      1400
           2       0.34      0.35      0.34      1182
           3       0.26      0.14      0.19       594
           4       0.31      0.10      0.15       211
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         1

    accuracy                           0.41      4799
   macro avg       0.26      0.23      0.23      4799
weighted avg       0.39      0.41      0.39      4799



## Second attempt: same setup and models, with `int_rate` added to the feature list

In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# Load dataset
df = pd.read_csv("/content/loan.csv")

# Feature list for this attempt: the earlier pre-loan features plus 'int_rate'.
# NOTE(review): 'int_rate' does not look like a safe pre-loan feature. The
# notebook's own observations describe the interest rate as a reflection of
# the grade, which would make it a proxy for the target; scores obtained with
# this list should be read with that in mind. Confirm before relying on them.
safe_features = [
    'loan_amnt', 'term',  'emp_length','int_rate', 'home_ownership',
    'annual_inc', 'verification_status', 'purpose', 'dti', 'open_acc', 'revol_util'
]

# Keep only rows with a known target. The explicit copy makes `df` an
# independent frame, so the column assignments below cannot trigger pandas'
# chained-assignment warning (which warnings.filterwarnings would hide).
df = df.dropna(subset=['grade'])
df = df[safe_features + ['grade']].copy()

# Numerical and categorical *feature* columns. The target is excluded here so
# that it is encoded exactly once, in the dedicated step below.
num_cols = df[safe_features].select_dtypes(include=['float64', 'int64']).columns
cat_cols = df[safe_features].select_dtypes(include=['object']).columns

# Encode categorical features as integer codes. This only maps each distinct
# string to an integer and learns no statistic from the data, so it is done on
# all rows; that also guarantees the test split never contains an unseen label.
for col in cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col].fillna('Unknown'))

# Encode target
df['grade'] = LabelEncoder().fit_transform(df['grade'])

X = df.drop('grade', axis=1)
y = df['grade']

# Split row positions BEFORE fitting the imputer and scaler, so the medians,
# means and variances are learned from training rows only (no test leakage).
# Same seed and row count as before, so the same rows land in each split.
train_idx, test_idx = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)

# Median-impute numeric features using training-row medians.
imputer = SimpleImputer(strategy='median').fit(X.iloc[train_idx][num_cols])
X[num_cols] = imputer.transform(X[num_cols])

# Standardise every feature using training-row mean / standard deviation.
scaler = StandardScaler().fit(X.iloc[train_idx])
X_scaled = scaler.transform(X)

# Final arrays consumed by the model cells.
X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]


In [None]:
from sklearn.linear_model import LogisticRegression

# Train logistic regression with the iteration cap raised to 1000.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Time only the prediction call on the held-out split.
t_start = time.time()
y_pred_test = model.predict(X_test)
elapsed_s = time.time() - t_start

# Gather the metrics first, then print them in one block.
train_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred_test)
report = classification_report(y_test, y_pred_test)

print("📌 Logistic Regression")
print("Train Accuracy:", train_accuracy)
print("Test Accuracy :", test_accuracy)
print("Inference Time (s):", round(elapsed_s, 4))
print("\nClassification Report:\n", report)


📌 Logistic Regression
Train Accuracy: 0.9970476381104884
Test Accuracy : 0.9968978284799359
Inference Time (s): 0.0021

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2928
           1       1.00      1.00      1.00      2898
           2       1.00      1.00      1.00      2391
           3       0.99      0.99      0.99      1291
           4       0.96      0.97      0.97       478
           5       0.00      0.00      0.00         4
           6       0.00      0.00      0.00         3

    accuracy                           1.00      9993
   macro avg       0.71      0.71      0.71      9993
weighted avg       1.00      1.00      1.00      9993



In [None]:
from sklearn.svm import SVC

# Train a support vector classifier with the library's default settings.
model = SVC().fit(X_train, y_train)

# Time only the prediction call on the held-out split.
t_start = time.time()
y_pred_test = model.predict(X_test)
elapsed_s = time.time() - t_start

# Gather the metrics first, then print them in one block.
train_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred_test)
report = classification_report(y_test, y_pred_test)

print("📌 SVM")
print("Train Accuracy:", train_accuracy)
print("Test Accuracy :", test_accuracy)
print("Inference Time (s):", round(elapsed_s, 4))
print("\nClassification Report:\n", report)


📌 SVM
Train Accuracy: 0.9957966373098479
Test Accuracy : 0.9905934153907735
Inference Time (s): 7.9058

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2928
           1       0.99      0.99      0.99      2898
           2       0.99      0.99      0.99      2391
           3       0.99      0.98      0.98      1291
           4       0.95      0.96      0.96       478
           5       0.00      0.00      0.00         4
           6       0.00      0.00      0.00         3

    accuracy                           0.99      9993
   macro avg       0.70      0.70      0.70      9993
weighted avg       0.99      0.99      0.99      9993



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a 100-tree random forest. The forest is built with randomness
# (bootstrap row sampling and per-split feature sampling), so the seed is
# fixed to make the metrics printed below repeat from run to run; 42 matches
# the seed already used for the train/test split.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Time only the prediction call on the held-out split.
start = time.time()
y_pred_test = model.predict(X_test)
end = time.time()

# Report accuracy on both splits, the prediction time, and per-class metrics.
print("📌 Random Forest")
print("Train Accuracy:", accuracy_score(y_train, model.predict(X_train)))
print("Test Accuracy :", accuracy_score(y_test, y_pred_test))
print("Inference Time (s):", round(end - start, 4))
print("\nClassification Report:\n", classification_report(y_test, y_pred_test))


📌 Random Forest
Train Accuracy: 1.0
Test Accuracy : 0.9991994396077254
Inference Time (s): 0.109

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2928
           1       1.00      1.00      1.00      2898
           2       1.00      1.00      1.00      2391
           3       1.00      1.00      1.00      1291
           4       0.99      1.00      0.99       478
           5       0.00      0.00      0.00         4
           6       0.00      0.00      0.00         3

    accuracy                           1.00      9993
   macro avg       0.71      0.71      0.71      9993
weighted avg       1.00      1.00      1.00      9993



In [7]:
from sklearn.neighbors import KNeighborsClassifier

# Train a k-nearest-neighbours classifier with the library's default settings.
model = KNeighborsClassifier().fit(X_train, y_train)

# Time only the prediction call on the held-out split.
t_start = time.time()
y_pred_test = model.predict(X_test)
elapsed_s = time.time() - t_start

# Gather the metrics first, then print them in one block.
train_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred_test)
report = classification_report(y_test, y_pred_test)

print("📌 KNN")
print("Train Accuracy:", train_accuracy)
print("Test Accuracy :", test_accuracy)
print("Inference Time (s):", round(elapsed_s, 4))
print("\nClassification Report:\n", report)


📌 KNN
Train Accuracy: 0.893064451561249
Test Accuracy : 0.816871810267187
Inference Time (s): 10.0762

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.91      0.90      2928
           1       0.76      0.80      0.78      2898
           2       0.77      0.78      0.77      2391
           3       0.83      0.75      0.79      1291
           4       0.94      0.74      0.83       478
           5       0.00      0.00      0.00         4
           6       0.00      0.00      0.00         3

    accuracy                           0.82      9993
   macro avg       0.60      0.57      0.58      9993
weighted avg       0.82      0.82      0.82      9993



In [8]:
from xgboost import XGBClassifier

# Train a gradient-boosted tree classifier, evaluated with multi-class log loss.
# `use_label_encoder` is no longer passed: it is deprecated in XGBoost and
# current releases only warn that the parameter is unused. It is also not
# needed here, because `y_train` already holds integer codes produced by
# LabelEncoder in the setup cell.
model = XGBClassifier(eval_metric='mlogloss')
model.fit(X_train, y_train)

# Time only the prediction call on the held-out split.
start = time.time()
y_pred_test = model.predict(X_test)
end = time.time()

# Report accuracy on both splits, the prediction time, and per-class metrics.
print("📌 XGBoost")
print("Train Accuracy:", accuracy_score(y_train, model.predict(X_train)))
print("Test Accuracy :", accuracy_score(y_test, y_pred_test))
print("Inference Time (s):", round(end - start, 4))
print("\nClassification Report:\n", classification_report(y_test, y_pred_test))


📌 XGBoost
Train Accuracy: 1.0
Test Accuracy : 0.9998999299509657
Inference Time (s): 0.0285

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2928
           1       1.00      1.00      1.00      2898
           2       1.00      1.00      1.00      2391
           3       1.00      1.00      1.00      1291
           4       1.00      1.00      1.00       478
           5       1.00      1.00      1.00         4
           6       1.00      1.00      1.00         3

    accuracy                           1.00      9993
   macro avg       1.00      1.00      1.00      9993
weighted avg       1.00      1.00      1.00      9993



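Observation:
After removing int_rate, model performance dropped drastically: with int_rate (second attempt) test accuracy is roughly 0.82–1.00 across the five models, while without it (first attempt) it is only about 0.37–0.42. Note that the two recorded runs were evaluated on different numbers of test rows (9,993 vs. 4,799), so they were not run on identical data.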

✅ int_rate was highly predictive of loan grade (which is expected—loan grades are often assigned based on risk, and risk determines interest rate).

❌ Without it, models struggle, because other features like loan_amnt, purpose, or emp_length are not strong enough predictors alone.



Why This Happens:
The target column `grade` (the loan grade) is inherently tied to int_rate—in real-world lending:

Grade is not a raw label, it's assigned based on calculated risk (credit score, income, etc.), and interest rate is a reflection of that grade.

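So trying to predict grade without the int_rate is like trying to guess someone's rank without knowing their marks. The caveat is that, if the interest rate is set from the grade as described above, int_rate is only known once the grade already exists; the near-perfect scores in the second attempt would then reflect target leakage rather than real predictive skill, and the first attempt is the more realistic measure of how well grade can be predicted from pre-loan information.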