# Imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read data

In [3]:
# Competition train/test files plus the external credit-risk dataset.
data = pd.read_csv('../data/train.csv')
data_to_predict = pd.read_csv('../data/test.csv')

# Clean the external dataset in one chain: rows with any missing value go first,
# then exact duplicate rows.
data_ccrisk = (
	pd.read_csv('../data/credit_risk_dataset.csv')
	.dropna()
	.drop_duplicates()
)

# Features/target of the untouched competition data ('id' is still part of X_old).
y_old = data['loan_status']
X_old = data.drop(columns=['loan_status'])

# Merge the dataframes: competition rows (without 'id') stacked on top of the
# external rows, with a fresh 0..n-1 index.
data_no_id = data.drop(columns=['id'])
merged_data = pd.concat([data_no_id, data_ccrisk], ignore_index=True)

def remove_col_prefix(dataframe, prefix):
	"""Delete ``prefix`` from the column names of ``dataframe``.

	Despite the name, every occurrence of ``prefix`` inside a column name is
	removed, not only a leading one (the column listing printed after the
	calls shows e.g. ``cb_default_on_file``, i.e. an inner ``person_`` was
	removed as well). ``prefix`` is matched as literal text (``regex=False``)
	so the result depends neither on the installed pandas version's default
	for ``regex`` nor on regex metacharacters that ``prefix`` may contain.

	Parameters
	----------
	dataframe : pandas.DataFrame
		Frame whose ``columns`` attribute is rewritten in place.
	prefix : str
		Literal text to delete from each column name.

	Returns
	-------
	pandas.DataFrame
		The same ``dataframe`` object that was passed in.
	"""
	dataframe.columns = dataframe.columns.str.replace(prefix, '', regex=False)
	return dataframe

# Strip 'person_' first, then 'loan_', so both sources end up with short column names.
merged_data = remove_col_prefix(remove_col_prefix(merged_data, 'person_'), 'loan_')


# Sanity checks: missing values per column, then the number of fully duplicated rows.
print(merged_data.isna().sum())
print(merged_data.duplicated().sum())

# Separate the target from the features ('status' is the prefix-stripped target column).
y = merged_data['status']
X = merged_data.drop(columns=['status'])

age                    0
income                 0
home_ownership         0
emp_length             0
intent                 0
grade                  0
amnt                   0
int_rate               0
percent_income         0
cb_default_on_file     0
cb_cred_hist_length    0
status                 0
dtype: int64
0


# Feature engineering

In [4]:
new_features = ['emp_length_to_age_ratio', 'income_to_age_ratio', 'total_income_to_age_ratio']


def _add_age_ratio_features(frame):
	"""Add the three age-normalised columns named in ``new_features`` to ``frame`` in place."""
	age = frame['age']
	frame['emp_length_to_age_ratio'] = frame['emp_length'] / age
	frame['income_to_age_ratio'] = frame['income'] / age
	frame['total_income_to_age_ratio'] = frame['emp_length'] * frame['income'] / age


# The same engineered columns go onto the feature matrix and onto the full
# frame that still carries the target.
_add_age_ratio_features(X)
_add_age_ratio_features(merged_data)

# Preprocessing

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [6]:
# Shuffled 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
	X,
	y,
	test_size=0.2,
	shuffle=True,
	stratify=y,
	random_state=2024,
)

In [7]:
# Object-dtype columns are treated as categorical: 'grade' is ordinal-encoded,
# every other one is one-hot encoded.
categorical_ordinal = ['grade']
categorical_columns = X.select_dtypes(include=['object']).columns
categorical_onehot = categorical_columns.drop(categorical_ordinal)

# Columns that receive log1p (raw magnitudes plus the engineered ratios).
log_columns = ['age', 'income', 'emp_length', 'amnt'] + new_features

# Every remaining int64/float64 column is standard-scaled.
numerical_columns = (
	X.select_dtypes(include=['int64', 'float64'])
	.columns
	.drop(log_columns)
)

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

def log_transform():
	"""Return a new ``FunctionTransformer`` that applies ``np.log1p`` with ``validate=True``."""
	log1p_step = FunctionTransformer(func=np.log1p, validate=True)
	return log1p_step

# Route each column group to its own encoder/scaler. The transformed columns come
# out in the order the transformers are listed: ordinal, one-hot, scaled, log.
preprocessor = ColumnTransformer(
	transformers=[
		('ordinal', OrdinalEncoder(), categorical_ordinal),
		('onehot', OneHotEncoder(), categorical_onehot),
		('scaler', StandardScaler(), numerical_columns),
		('log', log_transform(), log_columns)
	],
	remainder='passthrough',
	# Always return a dense ndarray. With the default threshold the output type
	# depends on how sparse the one-hot block happens to be, and the results are
	# wrapped in pd.DataFrame(..., columns=...) right after this cell.
	sparse_threshold=0,
)

# Fit on the training split only; every other frame is transformed with those statistics.
preprocessor.fit(X_train)

X_train_prep = preprocessor.transform(X_train)
X_test_prep = preprocessor.transform(X_test)
# Pass exactly the columns the preprocessor was fitted on: the target 'status'
# is not a feature and was never seen during fit.
data_prep = preprocessor.transform(merged_data.drop(columns=['status']))
X_all_prep = preprocessor.transform(X)

In [9]:
# Column labels in the order the ColumnTransformer emits its blocks:
# ordinal, one-hot, scaled, log.
_onehot_names = list(preprocessor.named_transformers_['onehot'].get_feature_names_out())
_prep_columns = categorical_ordinal + _onehot_names + list(numerical_columns) + log_columns

# NOTE(review): each frame gets a fresh 0..n-1 index, so rows line up with
# y_train / y_test by position only, not by index label.
X_train_prep = pd.DataFrame(X_train_prep, columns=_prep_columns)
X_test_prep = pd.DataFrame(X_test_prep, columns=_prep_columns)
data_prep = pd.DataFrame(data_prep, columns=_prep_columns)

In [10]:
# Preprocessed features with the target column appended on the right.
# NOTE(review): this rebinds `data`, which held the raw train.csv frame until here.
data = pd.concat(
	[data_prep, merged_data['status']],
	axis='columns',
)

# Models

In [11]:
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

In [28]:
# Keyword-argument sets for xgb.XGBClassifier, selected by list position.
xgb_params = [
	dict(
		objective='binary:logistic',
		learning_rate=0.03,
		n_estimators=5000,
		max_depth=7,
		colsample_bytree=0.75,
		colsample_bynode=0.85,
		colsample_bylevel=0.45,
		reg_alpha=0.001,
		reg_lambda=0.25,
		enable_categorical=True,
		random_state=42,
	),
	dict(
		objective='binary:logistic',
		eval_metric='auc',
		n_estimators=5000,
		max_depth=7,
		eta=0.08,
		reg_lambda=38.5,
		subsample=0.88,
		colsample_bytree=0.65,
		enable_categorical=True,
		min_child_weight=5,
	),
]

# Keyword-argument sets for lgb.LGBMClassifier, selected by list position.
lgb_params = [
	dict(
		objective='binary',
		metric='auc',
		learning_rate=0.0325,
		n_estimators=5_000,
		max_depth=7,
		num_leaves=25,
		min_data_in_leaf=20,
		feature_fraction=0.70,
		bagging_fraction=0.88,
		bagging_freq=6,
		lambda_l1=0.001,
		lambda_l2=0.1,
	),
	# NOTE(review): LightGBM documents `colsample_bytree` as an alias of
	# `feature_fraction`; this set gives both, with different values — confirm
	# which one is intended.
	dict(
		objective='binary',
		metric='auc',
		learning_rate=0.035,
		data_sample_strategy='goss',
		n_estimators=5_000,
		max_depth=7,
		num_leaves=30,
		min_data_in_leaf=30,
		feature_fraction=0.60,
		colsample_bytree=0.65,
		lambda_l1=0.001,
		lambda_l2=1.25,
	),
	dict(
		objective='binary',
		metric='auc',
		max_depth=-1,
		verbosity=-1,
		n_estimators=1500,
		max_bin=1024,
		boosting_type='gbdt',
		colsample_bytree=0.57,
		eta=0.055,
		reg_lambda=10.8,
		min_child_samples=70,
	),
]

# Keyword-argument sets for CatBoostClassifier, selected by list position.
cat_params = [
	dict(
		task_type="CPU",
		loss_function='Logloss',
		eval_metric="AUC",
		bagging_temperature=0.25,
		colsample_bylevel=0.40,
		iterations=5_000,
		learning_rate=0.045,
		max_depth=7,
		l2_leaf_reg=0.80,
		min_data_in_leaf=30,
		random_strength=0.25,
		random_state=42,
	),
	dict(
		iterations=1500,
		depth=6,
		eta=0.3,
		reg_lambda=41.0,
		loss_function='Logloss',
		eval_metric='AUC',
		random_state=42,
		min_data_in_leaf=51,
		early_stopping_rounds=150,
		verbose=200,
	),
	dict(
		iterations=3000,
		depth=7,
		eta=0.3,
		reg_lambda=40.0,
		loss_function='Logloss',
		eval_metric='AUC',
		random_state=42,
		min_data_in_leaf=51,
		early_stopping_rounds=300,
		verbose=200,
	),
	dict(
		task_type="CPU",
		loss_function='Logloss',
		eval_metric="AUC",
		bagging_temperature=0.25,
		colsample_bylevel=0.40,
		iterations=7_000,
		learning_rate=0.035,
		max_depth=8,
		l2_leaf_reg=0.80,
		min_data_in_leaf=30,
		random_strength=0.25,
		early_stopping_rounds=200,
		random_state=42,
	),
]

In [30]:
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

# Base learners of the stack: two LightGBM and four CatBoost configurations.
# Not included at the moment: ('xgb0', xgb.XGBClassifier(**xgb_params[0]))
estimators = [
	('lgb0', lgb.LGBMClassifier(**lgb_params[0])),
	('lgb2', lgb.LGBMClassifier(**lgb_params[2])),
	('cat0', CatBoostClassifier(**cat_params[0])),
	('cat1', CatBoostClassifier(**cat_params[1])),
	('cat2', CatBoostClassifier(**cat_params[2])),
	('cat3', CatBoostClassifier(**cat_params[3])),
]

# Logistic-regression meta-learner on top of the base learners, 5-fold CV, all cores.
stacking = StackingClassifier(
	estimators=estimators,
	final_estimator=LogisticRegression(),
	cv=5,
	n_jobs=-1,
)

In [31]:
# Train the whole stack on the preprocessed training split.
stacking.fit(X=X_train_prep, y=y_train)

In [32]:
from sklearn.metrics import roc_auc_score
# Second predict_proba column is used as the score — presumably P(status == 1);
# confirm against stacking.classes_.
y_probas_stacking = stacking.predict_proba(X_test_prep)[:, 1]

# ROC AUC of the stacked ensemble on the hold-out split.
print(roc_auc_score(y_true=y_test, y_score=y_probas_stacking))

0.9600789259269291
