In [1]:
import pandas as pd
import time
import numpy as np

# Read the processed train/test CSVs (paths are relative to the notebook's
# working directory).
df_train, df_test = (
    pd.read_csv('../data/processed/loan_e_p_cleaned_train.csv'),
    pd.read_csv('../data/processed/loan_e_p_cleaned_test.csv'),
)

In [2]:
# Sanity check: (rows, columns) of the training frame as loaded from disk.
df_train.shape

(491, 31)

In [3]:
# Sanity check: (rows, columns) of the test frame as loaded from disk.
df_test.shape

(123, 31)

In [4]:
def format_duration(seconds):
    """Render an elapsed time given in seconds as a short human-readable string.

    Parameters
    ----------
    seconds : int or float
        Elapsed time in seconds.

    Returns
    -------
    str
        ``'S.SSs'`` below one minute, ``'Mm : S.SSs'`` below one hour, and
        ``'Hh : Mm : Ss'`` (whole seconds, truncated) otherwise.
    """
    # Round to the displayed precision *before* choosing a branch; otherwise a
    # value such as 59.999 is formatted as '60.00s' and 3599.999 as
    # '59m : 60.00s' because the rounding happens after the unit was picked.
    seconds = round(seconds, 2)

    if seconds < 60:
        return f'{seconds:.2f}s'

    if seconds < 3600:
        minutes, sec = divmod(seconds, 60)
        return f'{int(minutes)}m : {sec:.2f}s'

    # From one hour upwards only whole seconds are shown.
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    return f'{hours}h : {minutes}m : {secs}s'

In [5]:
# Split the training frame into the target column and the predictor columns,
# timing the operation.
start = time.time()

y_train = df_train['Loan_Status']
x_train = df_train.drop(columns=['Loan_Status']).copy()

for part in (y_train, x_train):
    print(part.shape)

time_elapsed = time.time() - start
print(format_duration(time_elapsed))

(491,)
(491, 30)
0.02s


In [6]:
from sklearn.inspection import permutation_importance
import shap     
from sklearn.feature_selection import VarianceThreshold

# Fit a variance filter on the training predictors only; features whose
# variance does not exceed 0.01 will be marked as unsupported by the selector.
start = time.time()
selector = VarianceThreshold(threshold=0.01).fit(x_train)  # fit() returns the estimator itself
time_elapsed = time.time() - start
print(format_duration(time_elapsed))

  from .autonotebook import tqdm as notebook_tqdm


0.01s


In [7]:
# Split the test frame the same way as the training frame: target column on
# one side, every other column on the other.
y_test = df_test['Loan_Status']
x_test = df_test.drop(columns=['Loan_Status']).copy()

for part in (y_test, x_test):
    print(part.shape)

(123,)
(123, 30)


In [8]:
# Keep only the features that passed the variance filter.
#
# The surviving columns are looked up by *name* from the fitted selector and
# taken from the untouched source frames. Applying the positional boolean mask
# to the current x_train.columns (and then overwriting x_train) made this cell
# fail on a second run, because the mask no longer matched the reduced frame.
selected_mask = selector.get_support()  # kept for inspection / later cells
selected_cols = selector.get_feature_names_out().tolist()

x_train = df_train[selected_cols].copy()
x_test = df_test[selected_cols].copy()

print(x_train.shape)
print(x_test.shape)

(491, 27)
(123, 27)


In [9]:
import numpy as np
# Remove redundant features: for every pair whose Spearman correlation is
# stronger than the threshold, drop the later column of the pair.
CORR_THRESHOLD = 0.95

corr_matrix = x_train.corr(method='spearman')

# Redundancy is about the *strength* of the relationship, so compare absolute
# values: a pair with rho = -0.97 carries the same duplicated information as a
# pair with rho = +0.97 and was previously left in.
abs_corr = corr_matrix.abs()

# Keep only the strict upper triangle so each pair is examined once and the
# diagonal (always 1.0) is ignored.
upper_triangle = abs_corr.where(
    np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
)

to_drop = [
    col for col in upper_triangle.columns
    if (upper_triangle[col] > CORR_THRESHOLD).any()
]

# The column list is decided on the training data only and then applied to
# both splits. Reassign instead of mutating in place.
x_train = x_train.drop(columns=to_drop)
x_test = x_test.drop(columns=to_drop)
print(x_train.shape)
print(x_test.shape)

(491, 18)
(123, 18)


In [10]:
# train baseline models
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier


In [11]:
# Summary of the final feature set and of the class balance in each split.
feature_names = x_train.columns.tolist()
train_class_counts = np.bincount(y_train.astype(int).tolist())
test_class_counts = np.bincount(y_test.astype(int).tolist())

print('n_of_features : ', len(feature_names))
print('feauture_names : ', feature_names)
print('Train class dist : ', train_class_counts)
print('Test class dist : ', test_class_counts)

n_of_features :  18
feauture_names :  ['Dependents', 'Applicant_Income', 'Coapplicant_Income', 'Loan_Amount', 'Loan_Amount_Term', 'Credit_History', 'Total_Income', 'Income_Ratio', 'Income_per_Dependent', 'Applicant_Income Loan_Amount', 'EMI', 'Has_Coapplicant', 'Married_Yes', 'Education_Not Graduate', 'Gender_Male', 'Self_Employed_Yes', 'Property_Area_Semiurban', 'Property_Area_Urban']
Train class dist :  [154 337]
Test class dist :  [38 85]


In [None]:
# Baseline models, compared with stratified 10-fold cross-validation using the
# binary F1 score.

# XGBoost multiplies the weight of the positive class (label 1) by
# scale_pos_weight, so the balancing value is negatives / positives. The
# previous hard-coded 2.14 was positives / negatives, which up-weighted the
# class that is already the majority. Deriving it from y_train also keeps it
# correct if the data changes.
class_counts = np.bincount(y_train.astype(int))
neg_to_pos_ratio = float(class_counts[0] / class_counts[1])

model_map = {
    'LogisticRegression': LogisticRegression(random_state=42,
                                    max_iter=1000,
                                    solver='lbfgs',
                                    class_weight='balanced'),

    'RandomForest': RandomForestClassifier(random_state=42,
                                        class_weight='balanced',
                                        n_estimators=100,
                                        n_jobs=-1),

    'XGBoost': XGBClassifier(random_state=42,
                            scale_pos_weight=neg_to_pos_ratio,
                            n_estimators=100,
                            learning_rate=0.1,
                            n_jobs=-1,
                            eval_metric='logloss')
}

cv_results = {}
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

total_start = time.time()
for model_name, model in model_map.items():
    start_time = time.time()

    cv_scores = cross_val_score(
        estimator=model,
        X=x_train,
        y=y_train,
        scoring='f1',
        cv=cv,
        n_jobs=-1
    )

    cv_results[model_name] = {
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'cv_min': cv_scores.min(),
        'cv_max': cv_scores.max(),
        'cv_scores': cv_scores.tolist(),
        # Per-model wall time; previously a single variable was overwritten on
        # every iteration, so only the last model's time was ever reported.
        'elapsed': time.time() - start_time,
    }

# Total wall time for the whole comparison (name kept for later cells).
end_time = time.time() - total_start

for name, result in cv_results.items():
    # Double quotes outside / single quotes inside: reusing the same quote
    # character inside an f-string is a SyntaxError before Python 3.12.
    print(f'Model name: {name}')
    print('=' * 30)
    print(f"Mean score: {result['cv_mean']}")
    print(f"std score: {result['cv_std']}")
    print(f"Min score: {result['cv_min']}")
    print(f"Max score: {result['cv_max']}")
    print(f"cv scores: {result['cv_scores']}")
    print(f"Time: {format_duration(result['elapsed'])}")
    print('=' * 50)

print(format_duration(end_time))