# Importing Libraries

In [1]:
import os
import logging
import sys
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import early_stopping
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna
from optuna.samplers import TPESampler
from sklearn.metrics import roc_auc_score
import joblib
from scipy.optimize import minimize

random_state = 68

In [None]:
# Set this path to the directory of your data file.
path = "/kaggle/input/loan-approval-prediction-pre/"

`cat_enc`: Encodes categorical variables using either One Hot Encoding or Ordinal Encoding.

`lir_impute`: Uses Linear Regression to impute missing values.

`create_var`: Creates new features that are hand-made.

`feature_engineering`: Combines above 3 functions to perform feature engineering on dataset.


Creating a seperate function for feature engineering is important as this feature engineering includes imputation of data, and therefore, it should be done after cv is splitted to reduce contamination of cv data.

In [6]:
def cat_enc(df, ohe, ordi, cats):
    other_var = ["person_age", "person_income", "person_emp_length", "loan_amnt", "loan_int_rate", "loan_percent_income", 
                 "cb_person_cred_hist_length", "Type", "loan_status"]
    df_cat1 = pd.DataFrame(ohe.transform(df[cats[0]]), columns = [x for xs in ohe.categories_ for x in xs]).reset_index(drop=True)
    df_cat2 = pd.DataFrame(ordi.transform(df[cats[1]]), columns = cats[1]).reset_index(drop=True)
    df_cat = pd.concat([df_cat1, df_cat2], axis=1)
    df_enc = pd.concat([df[other_var].reset_index(drop=True),df_cat], axis=1)
    return df_enc

def lir_impute(lr, df_nmv, df_mv, features):
    df_mv['loan_int_rate'] = lr.predict(df_mv[features])
    return pd.concat([df_nmv, df_mv]).reset_index(drop=True)

def create_var(data, aita, aite):
    df = data.copy()
    df['loan_int_rate'] = df['loan_int_rate']/100
    df['loan_int_rate_month'] = (1 + df['loan_int_rate'])**(1/12) - 1
    k =  1 + df['loan_int_rate_month']
    P = df['person_income']*df['loan_percent_income']/12
    L = df['loan_amnt']
    tmp = (np.log(P/(P-L*(k-1)))/np.log(k)).replace([np.inf,-np.inf], np.nan)
    df['est_payback_time_in_month'] = tmp.fillna(-999)
    df['avg_income_by_age'] = [aita[x] if x in aita.index else aita[aita.index[abs(aita.index - x).argmin()]] for x in df["person_age"]]
    df['avg_income_by_emp'] = [aite[x] if x in aite.index else aite[aite.index[abs(aite.index - x).argmin()]] for x in df["person_age"]]
    df['income_diff_avg_age'] = df['person_income'] - df['avg_income_by_age']
    df['income_diff_avg_emp'] = df['person_income'] - df['avg_income_by_emp']
    df['risk_flag'] = ((df['Y'] == 1) & (df['loan_grade'].replace(["D","E","F","G"], "low") == "low")).astype("int")
    return df
    
def feature_engineering(data, data_cv, impute=False, min_max = False):
    df = data.copy()
    df_cv = data_cv.copy()
    
    # Step 1: Encoding Categorical Variables
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    ordi = OrdinalEncoder(handle_unknown='error')
    cats = [['person_home_ownership','loan_intent','cb_person_default_on_file'], ['loan_grade']]
    ohe.fit(df[cats[0]])
    ordi.fit(df[cats[1]])
    df_enc = cat_enc(df, ohe, ordi, cats)
    df_cv_enc = cat_enc(df_cv, ohe, ordi, cats)
    
    if impute:
        # Step 2.1: Imputing Missing Values - `loan_int_rate`
        df_mv = df_enc[(df_enc.isna()["loan_int_rate"])]
        df_nmv = df_enc[~(df_enc.isna()["loan_int_rate"])]
        features_infer = df_nmv.columns[~((df_nmv.columns == 'person_emp_length') | (df_nmv.columns == 'loan_int_rate') | (df_nmv.columns == 'loan_status') | (df_nmv.columns == 'Type'))]
        df_nmv_train, df_nmv_infer = df_nmv[features_infer], df_nmv['loan_int_rate']
        lr = LinearRegression().fit(df_nmv_train, df_nmv_infer)
        df_enc = lir_impute(lr, df_nmv, df_mv, features_infer)
        
        # Step 2.2: Imputing Missing Values - `person_emp_length`
        df_mv = df_enc[(df_enc.isna()["person_emp_length"])]
        df_nmv = df_enc[~(df_enc.isna()["person_emp_length"])]
        diff_scores = [2.205876158220027e-52, 2.8034878798264012e-36, 7.216755273392921e-12]
        pel_u = []
        for col in ['person_income', 'loan_amnt', 'loan_percent_income']:
            mean = np.mean(df_mv[col])
            std = np.std(df_mv[col])
            d = df_nmv["person_emp_length"][(df_nmv[col] > mean-3*std) & (df_nmv[col] < mean+3*std)]
            pel_u.append(np.mean(d))
        pel_weights = np.log(1/np.array(diff_scores)) # Stronger weights for variables with higher confidence + Softened
        pel_weights = pel_weights / np.sum(pel_weights)
        pel_mean = np.sum(pel_u*pel_weights)
        df_enc['person_emp_length'] = df_enc['person_emp_length'].fillna(pel_mean)
    
    # Step 3: Creating New Variables
    aita = df_enc.groupby("person_age")["person_income"].mean()
    aite = df_enc.groupby("person_emp_length")["person_income"].mean()
    df_fe = create_var(df_enc, aita, aite)
    df_cv_fe = create_var(df_cv_enc, aita, aite)
    
    # Step 4: Scale Normalizing with Scaler
    train_cols = df_fe.columns[[x not in ['loan_status'] for x in df_fe.columns]]
    Xdf_fe = df_fe[train_cols]
    ydf_fe = df_fe['loan_status']
    Xdf_cv_fe = df_cv_fe[train_cols]
    ydf_cv_fe = df_cv_fe['loan_status']
    if min_max:
        scaler = MinMaxScaler().fit(Xdf_fe)
    else:
        scaler = StandardScaler().fit(Xdf_fe)
    cols = Xdf_fe.columns
    Xdf_fe = pd.DataFrame(scaler.transform(Xdf_fe), columns = cols)
    Xdf_cv_fe = pd.DataFrame(scaler.transform(Xdf_cv_fe), columns = cols)
    
    return [Xdf_fe, ydf_fe, Xdf_cv_fe, ydf_cv_fe]

`model_pred`: Used to return y_pred given Xtest data and the Model.

In [7]:
def model_pred(models, Xtest):
    y_pred = []
    for model in models:
        y_pred.append(model.predict_proba(Xtest)[:,1])
    y_pred = np.array(y_pred).sum(axis=0)/len(models)
    return y_pred

# Preparing Xtest for Different Models

In [8]:
Xtrain_un = pd.read_csv(path + "/trains.csv")
Xtrain_mr = pd.read_csv(path + "/merge_rmvd.csv")
Xtrain_md = pd.read_csv(path + "/merged.csv")
Xtest = pd.read_csv(path + "/cv.csv")
_,_,Xtest_un, ytest_un = feature_engineering(Xtrain_un, Xtest.copy())
_,_,Xtest_mr, ytest_mr = feature_engineering(Xtrain_mr, Xtest.copy())
_,_,Xtest_md, ytest_md = feature_engineering(Xtrain_md, Xtest.copy(), impute=True)

Since there are 3 different ways of creating a dataset, which includes different imputation etc., 3 different cross validation set has to be created.

These created datasets will be stored as a list in `Xtest`.

Since target value of cross validation is shared, only one copy of `ytest` will exist.

In [9]:
Xtest = [Xtest_un, Xtest_mr, Xtest_md]
ytest = ytest_un

# Loading Models

The best models found in `lap_lgbm_models` and `lap_xgb_models` iPython Notebook will be loaded in this dataset to compare their individual performance against performance of aggregated models.

## XGB Models

In [10]:
xgb_model = []
path_xgb = path + "xgb_result/"
name = ['xgb11', 'xgb21', 'xgb31']
for i in range(3):
    m = []
    for j in range(5):
        n = path_xgb + name[i] + "_" + str(j) + ".json"
        model = XGBClassifier(random_state = random_state)
        model.load_model(n)
        m.append(model)
    xgb_model.append(m)

## LGBM Models

In [11]:
lgbm_model = []
path_lgbm = path + "lgbm_result/"
name = ['lgbm11', 'lgbm21', 'lgbm31']
for i in range(3):
    m = []
    for j in range(5):
        n = path_lgbm + name[i] + "_" + str(j) + ".pkl"
        model = joblib.load(n)
        m.append(model)
    lgbm_model.append(m)

## All Models

In [12]:
models = xgb_model + lgbm_model
match_test = [0,1,2,0,1,2] # Match the Xtest data to the model.
y_pred = []
for i, mod in enumerate(models):
    pred = model_pred(mod, Xtest[match_test[i]])
    y_pred.append(pred)

# Model Aggregation
Model Naming Convention:
- Given 6 models [`xgb11`,`xgb12`,`xgb31`,`lgbm11`,`lgbm21`,1lgbm31`], models will be named by 6 digits.
- 1 will indicate that model was used in aggregation, 0 will indicate it was not used.
- E.g., 000110 means only `lgbm11` and `lgbm21` is used to create the corresponding aggregation.

## Aggregation Technique 1: Stack

Simple stacking method - that is, simple averaging method - of 2 or more models will be explored.

In [13]:
sn, ss = [], []
def stack_rec(add, counter, kept, name, y_pred, ytest, stack_name, stack_score):
    if counter >= 6:
        if sum(kept) == 0:
            return
        y_hat = (np.array(y_pred)[kept]).sum(axis=0) / sum(kept)
        stack_score.append(roc_auc_score(ytest, y_hat))
        stack_name.append(name)
        return
    else:
        kept.append(add)
        name += str(int(add))
        counter += 1
        stack_rec(True, counter, kept[:], name, y_pred, ytest, stack_name, stack_score)
        stack_rec(False, counter, kept[:], name, y_pred, ytest, stack_name, stack_score)
def help_stack_rec(sn, ss):
    stack_rec(True, 0, [], "", y_pred, ytest, sn, ss)
    stack_rec(False, 0, [], "", y_pred, ytest, sn, ss)
    return sn, ss
sn, ss = help_stack_rec(sn,ss)

In [14]:
stack_out = pd.DataFrame([sn,ss]).T.drop_duplicates()
stack_out.columns = ["Model", "Score"]
display(stack_out.sort_values(by="Score", ascending=False).head())

Unnamed: 0,Model,Score
122,10,0.958273
114,110,0.958115
112,111,0.958035
82,10110,0.957969
80,10111,0.957946


The Best Model is given with using a single model `lgbm21`.

## Aggregation Technique 2: Optimize Through Model/Function

Logistic Regression, 

Structured dataset eases the process of training or fitting a model or function.

`Dpred` is created for such purposes.

In [15]:
Dpred = pd.DataFrame(y_pred).T
cols = ['x1','x2','x3','l1','l2','l3']
Dpred.columns = cols
Dpred['loan_status'] = ytest
display(Dpred.head())

Unnamed: 0,x1,x2,x3,l1,l2,l3,loan_status
0,0.943298,0.960639,0.950607,0.93112,0.933348,0.961762,1
1,0.017766,0.021783,0.031836,0.019813,0.017364,0.010938,0
2,0.003879,0.002322,0.005574,0.004784,0.002753,0.002365,0
3,0.07155,0.069069,0.060546,0.082312,0.075299,0.063109,0
4,0.039824,0.027553,0.044944,0.033932,0.023897,0.025209,0


### Logistic Regression

In [16]:
ln, ls = [], []
def log_rec(add, counter, kept, name, y_pred, ytest, log_name, log_score):
    if counter >= 6:
        if sum(kept) == 0:
            return
        Xpred = pd.DataFrame(np.array(y_pred)[kept]).T
        lgreg = LogisticRegression(random_state=random_state).fit(Xpred, ytest)
        y_hat = lgreg.predict_proba(Xpred)[:,1]
        log_score.append(roc_auc_score(ytest, y_hat))
        log_name.append(name)
        return
    else:
        kept.append(add)
        name += str(int(add))
        counter += 1
        log_rec(True, counter, kept[:], name, y_pred, ytest, log_name, log_score)
        log_rec(False, counter, kept[:], name, y_pred, ytest, log_name, log_score)
def help_log_rec(ln, ls):
    log_rec(True, 0, [], "", y_pred, ytest, ln, ls)
    log_rec(False, 0, [], "", y_pred, ytest, ln, ls)
    return ln, ls
ln, ls = help_log_rec(ln,ls)

In [17]:
log_out = pd.DataFrame([ln,ls]).T.drop_duplicates()
log_out.columns=["Model", "Score"]
display(log_out.sort_values(by="Score", ascending=False).head())

Unnamed: 0,Model,Score
114,110,0.958365
106,1010,0.958289
122,10,0.958273
112,111,0.958256
96,1111,0.958226


The Best Model is given with using aggregation of `lgbm11` and `lgbm21`, or `xgb31` and `lgbm21`.

### Weighted Sum Optimization

In [18]:
on, os = [], []
def opt_rec(add, counter, kept, name, y_pred, ytest, opt_name, opt_score):
    if counter >= 6:
        if sum(kept) == 0:
            return
        Xpred = pd.DataFrame(np.array(y_pred)[kept]).T
        def loss_func(w):
            res = np.dot(Xpred, w) / np.sum(w)
            return roc_auc_score(ytest, res)
        w = np.random.randn(Xpred.shape[1])
        w = minimize(loss_func, w, tol=1e-12)['x']
        y_hat = np.matmul(Xpred, w) / np.sum(w)
        opt_score.append(roc_auc_score(ytest, y_hat))
        opt_name.append(name)
        return
    else:
        kept.append(add)
        name += str(int(add))
        counter += 1
        opt_rec(True, counter, kept[:], name, y_pred, ytest, opt_name, opt_score)
        opt_rec(False, counter, kept[:], name, y_pred, ytest, opt_name, opt_score)
def help_opt_rec(on, os):
    opt_rec(True, 0, [], "", y_pred, ytest, on, os)
    opt_rec(False, 0, [], "", y_pred, ytest, on, os)
    return on, os
on, os = help_opt_rec(on,os)

In [19]:
opt_out = pd.DataFrame([on,os]).T.drop_duplicates()
opt_out.columns=["Model", "Score"]
display(opt_out.sort_values(by="Score", ascending=False).head())

Unnamed: 0,Model,Score
115,110,0.958462
122,10,0.958273
114,110,0.958258
81,10111,0.957981
120,11,0.957624


The Best Model is given with using the aggregation of  `xgb31` and `lgbm21`, or `xgb11` and `lgbm21`.

# Final Submission

The best 5 models are used to make predictions on the preserved test set. After submission, internal score is given by kaggle. These were the result.

1. Using `lgbm11` only - Public Score (Train Score): 0.96650, Private Score (Test Score): 0.96226.
2. Usng `lgbm11`+`lgbm21` and Logistic Regression - Public Score (Train Score): 0.96700, Private Score (Test Score): 0.96240.
3. Using `xgb31`+`lgbm21` and Logistic Regression - Public Score (Train Score): 0.96654, Private Score (Test Score): 0.96242.
4. Using `xgb31`+`lgbm21` and Weighted Sum Opt. - Public Score (Train Score): 0.96612, Private Score (Test Score): 0.96219.
5. Using `xgb11`+`lgbm21` and Weighted Sum Opt. - Public Score (Train Score): 0.96606, Private Score (Test Score): 0.96215.

In [20]:
fin_test = pd.read_csv(path + "/test.csv")
display(fin_test.head())

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.1,Y,4
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.9,0.14,N,7
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4


In [21]:
def cat_enc_y(df, ohe, ordi, cats):
    other_var = ["person_age", "person_income", "person_emp_length", "loan_amnt", "loan_int_rate", "loan_percent_income", 
                 "cb_person_cred_hist_length", "id"]
    df_cat1 = pd.DataFrame(ohe.transform(df[cats[0]]), columns = [x for xs in ohe.categories_ for x in xs]).reset_index(drop=True)
    df_cat2 = pd.DataFrame(ordi.transform(df[cats[1]]), columns = cats[1]).reset_index(drop=True)
    df_cat = pd.concat([df_cat1, df_cat2], axis=1)
    df_enc = pd.concat([df[other_var].reset_index(drop=True),df_cat], axis=1)
    return df_enc

def lir_impute(lr, df_nmv, df_mv, features):
    df_mv['loan_int_rate'] = lr.predict(df_mv[features])
    return pd.concat([df_nmv, df_mv]).reset_index(drop=True)

def create_var_y(data, aita, aite):
    df = data.copy()
    df['loan_int_rate'] = df['loan_int_rate']/100
    df['loan_int_rate_month'] = (1 + df['loan_int_rate'])**(1/12) - 1
    k =  1 + df['loan_int_rate_month']
    P = df['person_income']*df['loan_percent_income']/12
    L = df['loan_amnt']
    tmp = (np.log(P/(P-L*(k-1)))/np.log(k)).replace([np.inf,-np.inf], np.nan)
    df['est_payback_time_in_month'] = tmp.fillna(-999)
    df['avg_income_by_age'] = [aita[x] if x in aita.index else aita[aita.index[abs(aita.index - x).argmin()]] for x in df["person_age"]]
    df['avg_income_by_emp'] = [aite[x] if x in aite.index else aite[aite.index[abs(aite.index - x).argmin()]] for x in df["person_age"]]
    df['income_diff_avg_age'] = df['person_income'] - df['avg_income_by_age']
    df['income_diff_avg_emp'] = df['person_income'] - df['avg_income_by_emp']
    df['risk_flag'] = ((df['Y'] == 1) & (df['loan_grade'].replace(["D","E","F","G"], "low") == "low")).astype("int")
    return df
    
def feature_engineering_y(data, data_cv, impute=False, min_max = False):
    df = data.copy()
    df_cv = data_cv.copy()
    
    # Step 1: Encoding Categorical Variables
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    ordi = OrdinalEncoder(handle_unknown='error')
    cats = [['person_home_ownership','loan_intent','cb_person_default_on_file'], ['loan_grade']]
    ohe.fit(df[cats[0]])
    ordi.fit(df[cats[1]])
    df_enc = cat_enc(df, ohe, ordi, cats)
    df_cv_enc = cat_enc_y(df_cv, ohe, ordi, cats)
    
    if impute:
        # Step 2.1: Imputing Missing Values - `loan_int_rate`
        df_mv = df_enc[(df_enc.isna()["loan_int_rate"])]
        df_nmv = df_enc[~(df_enc.isna()["loan_int_rate"])]
        df_cv_mv = df_cv_enc[(df_cv_enc.isna()["loan_int_rate"])]
        df_cv_nmv = df_cv_enc[~(df_cv_enc.isna()["loan_int_rate"])]
        features_infer = df_nmv.columns[~((df_nmv.columns == 'person_emp_length') | (df_nmv.columns == 'loan_int_rate') | (df_nmv.columns == 'loan_status') | (df_nmv.columns == 'Type'))]
        df_nmv_train, df_nmv_infer = df_nmv[features_infer], df_nmv['loan_int_rate']
        lr = LinearRegression().fit(df_nmv_train, df_nmv_infer)
        df_enc = lir_impute(lr, df_nmv, df_mv, features_infer)
        
        # Step 2.2: Imputing Missing Values - `person_emp_length`
        df_mv = df_enc[(df_enc.isna()["person_emp_length"])]
        df_nmv = df_enc[~(df_enc.isna()["person_emp_length"])]
        diff_scores = [2.205876158220027e-52, 2.8034878798264012e-36, 7.216755273392921e-12]
        pel_u = []
        for col in ['person_income', 'loan_amnt', 'loan_percent_income']:
            mean = np.mean(df_mv[col])
            std = np.std(df_mv[col])
            d = df_nmv["person_emp_length"][(df_nmv[col] > mean-3*std) & (df_nmv[col] < mean+3*std)]
            pel_u.append(np.mean(d))
        pel_weights = np.log(1/np.array(diff_scores)) # Stronger weights for variables with higher confidence + Softened
        pel_weights = pel_weights / np.sum(pel_weights)
        pel_mean = np.sum(pel_u*pel_weights)
        df_enc['person_emp_length'] = df_enc['person_emp_length'].fillna(pel_mean)
        df_cv_enc['person_emp_length'] = df_cv_enc['person_emp_length'].fillna(pel_mean)
    
    # Step 3: Creating New Variables
    aita = df_enc.groupby("person_age")["person_income"].mean()
    aite = df_enc.groupby("person_emp_length")["person_income"].mean()
    df_fe = create_var_y(df_enc, aita, aite)
    df_cv_fe = create_var_y(df_cv_enc, aita, aite)
    
    # Step 4: Scale Normalizing with Scaler
    train_cols = df_fe.columns[[x not in ['loan_status'] for x in df_fe.columns]]
    Xdf_fe = df_fe[train_cols]
    ydf_fe = df_fe['loan_status']
    fin_cols = df_cv_fe.columns[[x not in ['id'] for x in df_cv_fe.columns]]
    Xdf_cv_fe = df_cv_fe[fin_cols]
    Xdf_cv_fe["Type"] = 1
    Xdf_cv_fe = Xdf_cv_fe[train_cols]
    id_cv_fe = df_cv_fe['id']
    if min_max:
        scaler = MinMaxScaler().fit(Xdf_fe)
    else:
        scaler = StandardScaler().fit(Xdf_fe)
    cols = Xdf_fe.columns
    Xdf_fe = pd.DataFrame(scaler.transform(Xdf_fe), columns = cols)
    Xdf_cv_fe = pd.DataFrame(scaler.transform(Xdf_cv_fe), columns = cols)
    
    return [Xdf_fe, ydf_fe, Xdf_cv_fe, id_cv_fe]

In [22]:
Xtrain_un = pd.read_csv("/kaggle/input/loan-approval-prediction-pre/train.csv")
Xtrain_mr = pd.read_csv("/kaggle/input/loan-approval-prediction-pre/merge_rmvd.csv")
Xtrain_md = pd.read_csv("/kaggle/input/loan-approval-prediction-pre/merged.csv")
_,_,Xtest_un, idtest_un = feature_engineering_y(Xtrain_un, fin_test.copy())
_,_,Xtest_mr, idtest_mr = feature_engineering_y(Xtrain_mr, fin_test.copy())
_,_,Xtest_md, idtest_md = feature_engineering_y(Xtrain_md, fin_test.copy(), impute=True)

## 1. `lgbm21`

In [23]:
fin_pred = model_pred(lgbm_model[1], Xtest_mr)
submission = pd.DataFrame([idtest_mr, fin_pred]).T
submission.columns = ['id','loan_status']
submission['id'] = submission['id'].astype("int")
display(submission.head())
print(submission.shape)

Unnamed: 0,id,loan_status
0,58645,0.99864
1,58646,0.012747
2,58647,0.558178
3,58648,0.006245
4,58649,0.047903


(39098, 2)


In [24]:
submission.to_csv("/kaggle/working/submission.csv", index=False)

## 2. `lgbm11` + `lgbm21` + Logistic Regression

In [25]:
Xpred = pd.DataFrame(y_pred[3:5]).T
fin_pred21 = model_pred(lgbm_model[0], Xtest_un)
fin_pred22 = model_pred(lgbm_model[1], Xtest_mr)
Xfin = pd.DataFrame([fin_pred21, fin_pred22]).T
lreg = LogisticRegression(random_state = random_state).fit(Xpred, ytest)
submission2 = lreg.predict_proba(Xfin)[:,1]
submission2 = pd.DataFrame([idtest_mr, submission2]).T
submission2.columns = ['id','loan_status']
submission2['id'] = submission2['id'].astype("int")
display(submission2.head())
print(submission2.shape)

Unnamed: 0,id,loan_status
0,58645,0.969551
1,58646,0.027374
2,58647,0.666359
3,58648,0.026029
4,58649,0.034279


(39098, 2)


In [26]:
submission2.to_csv("/kaggle/working/submission2.csv", index=False)

## 3. `xgb31` + `lgbm21` + Logistic Regression

In [27]:
Xpred = pd.DataFrame([y_pred[2],y_pred[4]]).T
fin_pred31 = model_pred(xgb_model[2], Xtest_md)
fin_pred32 = model_pred(lgbm_model[1], Xtest_mr)
Xfin = pd.DataFrame([fin_pred31, fin_pred32]).T
lreg = LogisticRegression(random_state = random_state).fit(Xpred, ytest)
submission3 = lreg.predict_proba(Xfin)[:,1]
submission3 = pd.DataFrame([idtest_mr, submission3]).T
submission3.columns = ['id','loan_status']
submission3['id'] = submission3['id'].astype("int")
display(submission3.head())
print(submission3.shape)

Unnamed: 0,id,loan_status
0,58645,0.968722
1,58646,0.027126
2,58647,0.580362
3,58648,0.026097
4,58649,0.036714


(39098, 2)


In [28]:
submission3.to_csv("/kaggle/working/submission3.csv", index=False)

## 4. `xgb31` + `lgbm21` + Weight Optimization

In [29]:
w = np.random.randn(Xpred.shape[1])
def loss_func(w):
    res = np.dot(Xpred, w) / np.sum(w)
    return roc_auc_score(ytest, res)
w = minimize(loss_func, w, tol=1e-12)['x']
submission4 = np.matmul(Xfin, w)/np.sum(w)
submission4 = pd.DataFrame([idtest_mr, submission4]).T
submission4.columns = ['id','loan_status']
submission4['id'] = submission4['id'].astype("int")
display(submission4.head())
print(submission4.shape)

Unnamed: 0,id,loan_status
0,58645,0.996701
1,58646,0.014332
2,58647,0.561394
3,58648,0.008813
4,58649,0.058578


(39098, 2)


In [30]:
submission4.to_csv("/kaggle/working/submission4.csv", index=False)

## 5. `xgb11` + `lgbm21` + Weight Optimization

In [31]:
Xpred = pd.DataFrame([y_pred[0], y_pred[4]]).T
w = np.random.randn(Xpred.shape[1])
def loss_func(w):
    res = np.dot(Xpred, w) / np.sum(w)
    return roc_auc_score(ytest, res)
w = minimize(loss_func, w, tol=1e-12)['x']
fin_pred41 = model_pred(xgb_model[0], Xtest_un)
Xfin = pd.DataFrame([fin_pred41, fin_pred22]).T
submission5 = np.matmul(Xfin, w)/np.sum(w)
submission5 = pd.DataFrame([idtest_mr, submission5]).T
submission5.columns = ['id','loan_status']
submission5['id'] = submission5['id'].astype("int")
display(submission5.head())
print(submission5.shape)

Unnamed: 0,id,loan_status
0,58645,0.998072
1,58646,0.016451
2,58647,0.597691
3,58648,0.006769
4,58649,0.050297


(39098, 2)


In [32]:
submission5.to_csv("/kaggle/working/submission5.csv", index=False)

# Final Variance Check (How Far Away Submissions are from 0.5)

In [33]:
for i, sub in enumerate([submission,submission2,submission3,submission4,submission5]):
    print("Submission " + str(i+1) + ": ")
    print(((sub["loan_status"]-0.5)**2).sum())

Submission 1: 
8342.152270201314
Submission 2: 
8273.392078564753
Submission 3: 
8263.092219130616
Submission 4: 
8240.49124627799
Submission 5: 
8308.605659901652
