In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.under_sampling import RandomUnderSampler

In [2]:
raw_dataset = pd.read_excel("../data/DBF.xlsx")
raw_dataset.head()

Unnamed: 0,loan_amnt,term,int_rate,grade,emp_length,home_ownership,annual_inc,loan_status,purpose,dti,T
0,12800,36,0.1199,C,6,MORTGAGE,53000.0,1,credit_card,0.2099,1095
1,5625,36,0.1825,D,10,MORTGAGE,53328.0,1,home_improvement,0.1051,1095
2,12025,36,0.1446,C,2,MORTGAGE,32000.0,1,debt_consolidation,0.1988,1095
3,9000,36,0.1531,C,9,RENT,52000.0,1,debt_consolidation,0.1886,1095
4,8000,36,0.1075,B,5,RENT,73000.0,1,debt_consolidation,0.1085,1095


In [3]:
dataset = raw_dataset.copy(deep=True)

In [4]:
# Log transformation for 'annual_inc' to reduce skewness ('log1p' is more stable than 'log').
dataset['annual_inc_log'] = np.log1p(dataset['annual_inc'])
dataset = dataset.drop(['annual_inc'], axis=1)

In [5]:
# Replace negative 'dti' values with zero.
dataset['dti'].apply(lambda x: max(x, 0))

0         0.2099
1         0.1051
2         0.1988
3         0.1886
4         0.1085
           ...  
527423    0.0938
527424    0.1320
527425    0.1480
527426    0.1152
527427    0.2095
Name: dti, Length: 527428, dtype: float64

In [6]:
# Target encoding for 'purpose'
dataset['purpose_encoded'] = dataset.groupby('purpose')['loan_status'].transform('mean')
dataset = dataset.drop(['purpose'], axis=1)

In [7]:
# Features ('term' can be converted into a categorical variable)
numerical_features = ['loan_amnt', 'term', 'int_rate', 'emp_length', 'annual_inc_log', 'dti', 'T']
one_hot_features = ['grade', 'home_ownership'] # TODO: add 'term' to the 'one_hot_features'

In [8]:
X = dataset.drop(['loan_status'], axis=1)
y = dataset['loan_status']

In [9]:
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

### Preprocessing pipelines:

In [11]:
numerical_pipeline = Pipeline(steps=[('scaler', StandardScaler())])
categorical_pipeline = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [12]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, one_hot_features)
    ])

### Model Pipeline:
#### 1. Random Forest Classifier

In [13]:
rf_model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

### Train the model:

In [14]:
rf_model_pipeline.fit(X_train, y_train)

### Make predictions:

In [15]:
y_pred = rf_model_pipeline.predict(X_test)

### Evaluate the model:

In [16]:
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.61      0.65      0.63     26778
           1       0.63      0.59      0.61     27056

    accuracy                           0.62     53834
   macro avg       0.62      0.62      0.62     53834
weighted avg       0.62      0.62      0.62     53834

Accuracy: 0.6238436675706802


### Hyperparameter tuning:

In [17]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, 30]
}

In [18]:
grid_search = GridSearchCV(rf_model_pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

KeyboardInterrupt: 

#### 2. XGBoost Classifier:

In [None]:
import xgboost as xgb

xgb_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
])

In [None]:
# Hyperparameter tuning for XGBoost
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.1, 0.3]
}

grid_search = GridSearchCV(xgb_model_pipeline, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# Train the best model
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Make predictions
y_pred = best_model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

#### 3. Logistic Regression:

In [None]:
from sklearn.linear_model import LogisticRegression

# Define the model pipeline
lr_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

# Train and evaluate the model
lr_model_pipeline.fit(X_train, y_train)
y_pred_lr = lr_model_pipeline.predict(X_test)
print(classification_report(y_test, y_pred_lr))
print("Accuracy:", accuracy_score(y_test, y_pred_lr))


#### 4. LightGBM Classifier:

In [None]:
import lightgbm as lgb

# Define the model pipeline
lgb_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lgb.LGBMClassifier(random_state=42))
])

# Train and evaluate the model
lgb_model_pipeline.fit(X_train, y_train)
y_pred_lgb = lgb_model_pipeline.predict(X_test)
print(classification_report(y_test, y_pred_lgb))
print("Accuracy:", accuracy_score(y_test, y_pred_lgb))


#### 5. CatBoost Classifier:

In [None]:
from catboost import CatBoostClassifier

# Define the model pipeline
cb_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(random_state=42, verbose=0))
])

# Train and evaluate the model
cb_model_pipeline.fit(X_train, y_train)
y_pred_catboost = cb_model_pipeline.predict(X_test)
print(classification_report(y_test, y_pred_catboost))
print("Accuracy:", accuracy_score(y_test, y_pred_catboost))


**