In [None]:
import warnings
warnings.simplefilter('ignore')

import os
import re
import gc

import numpy as np
import pandas as pd
# Use fully-qualified option keys: a short pattern such as 'max_columns' can
# match more than one registered option, in which case pandas raises OptionError.
pd.set_option('display.max_columns', None)  # never truncate columns when displaying
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', lambda x: '%.3f' % x)  # three decimals

from tqdm.notebook import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Single place to edit when the competition files live somewhere else.
DATA_DIR = '/content/drive/MyDrive/CCF-contest/Personal_loan_default_forecast'

train_bank = pd.read_csv(os.path.join(DATA_DIR, 'train_public.csv'))
train_internet = pd.read_csv(os.path.join(DATA_DIR, 'train_internet.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test_public.csv'))
train_bank.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,loan_id,user_id,total_loan,year_of_loan,interest,monthly_payment,class,employer_type,industry,work_year,house_exist,censor_status,issue_date,use,post_code,region,debt_loan_ratio,del_in_18month,scoring_low,scoring_high,known_outstanding_loan,known_dero,pub_dero_bankrup,recircle_b,recircle_u,initial_list_status,app_type,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,early_return,early_return_amount,early_return_amount_3mon,isDefault
0,1040418,240418,31818.182,3,11.466,1174.91,C,政府机构,金融业,3 years,0,1,2016/10/1,2,193,13,2.43,0,556.364,649.091,3,0,0.0,7734.231,91.8,0,0,1-Dec,5,1,1.0,0.0,4.0,5.0,4.0,3,9927,0.0,0
1,1025197,225197,28000.0,5,16.841,670.69,C,政府机构,金融业,10+ years,0,2,2013/6/1,0,491,30,11.005,1,715.0,893.75,3,0,0.0,31329.0,54.8,1,0,Apr-90,40642,1,7.0,0.0,4.0,45.0,22.0,0,0,0.0,0
2,1009360,209360,17272.727,3,8.9,603.32,A,政府机构,公共服务、社会组织,10+ years,1,0,2014/1/1,4,459,8,6.409,0,774.545,903.636,5,0,0.0,18514.0,57.692,1,0,Oct-91,154,1,6.0,0.0,6.0,28.0,19.0,0,0,0.0,0
3,1039708,239708,20000.0,3,4.788,602.3,A,世界五百强,文化和体育业,6 years,0,1,2015/7/1,0,157,8,9.205,0,750.0,875.0,3,0,0.0,20707.0,42.6,0,0,1-Jun,0,1,5.0,0.0,10.0,15.0,9.0,0,0,0.0,0
4,1027483,227483,15272.727,3,12.79,470.31,C,政府机构,信息传输、软件和信息技术服务业,< 1 year,2,1,2016/7/1,0,38,21,15.578,0,609.091,710.606,15,0,0.0,14016.154,30.462,0,0,2-May,0,1,10.0,0.0,6.0,15.0,4.0,0,0,0.0,0


### 数据预处理

In [None]:
# Rename 'is_default' to 'isDefault', the label column name used by train_bank,
# so both training frames expose the label under the same name.
train_internet = train_internet.rename(columns={'is_default': 'isDefault'})

In [None]:
# Columns that exist in both training frames, kept in train_bank's column order.
internet_columns = train_internet.columns
common_cols = [name for name in train_bank.columns if name in internet_columns]
len(common_cols)

36

In [None]:
print(len(train_bank.columns))
print(len(train_internet.columns))

39
42


In [None]:
train_bank_left = list(set(list(train_bank.columns)) - set(common_cols))
train_internet_left = list(set(list(train_internet.columns)) - set(common_cols))

train_bank_left

['known_dero', 'known_outstanding_loan', 'app_type']

In [None]:
train_internet_left

['sub_class', 'marriage', 'work_type', 'offsprings', 'f5', 'house_loan_status']

In [None]:
# Keep only the columns shared by both training sets. `.copy()` makes each
# frame an independent object, so the column assignments in the following
# cells are not made on a slice of the raw data.
train1_data = train_internet[common_cols].copy()
train2_data = train_bank[common_cols].copy()
# The test frame is selected without the label; exclude it by name instead of
# relying on it being the last entry of common_cols.
test_data = test[[col for col in common_cols if col != 'isDefault']].copy()

In [None]:
import datetime

# issue_date: parse to pandas datetimes, then derive year, month and the number
# of days elapsed since a fixed reference date.
train1_data['issue_date'] = pd.to_datetime(train1_data['issue_date'])
train1_data['issue_date_y'] = train1_data['issue_date'].dt.year
train1_data['issue_date_m'] = train1_data['issue_date'].dt.month

# Reference date for the day offset.
base_time = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
# Vectorised subtraction; gives the same day counts as a per-row apply.
train1_data['issue_date_diff'] = (train1_data['issue_date'] - base_time).dt.days

# The raw date is no longer needed once the derived features exist.
train1_data = train1_data.drop(columns='issue_date')

In [None]:
# issue_date: parse to pandas datetimes, then derive year, month and the number
# of days elapsed since a fixed reference date (same recipe as for train1_data).
train2_data['issue_date'] = pd.to_datetime(train2_data['issue_date'])
train2_data['issue_date_y'] = train2_data['issue_date'].dt.year
train2_data['issue_date_m'] = train2_data['issue_date'].dt.month

# Reference date for the day offset.
base_time = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
# Vectorised subtraction; gives the same day counts as a per-row apply.
train2_data['issue_date_diff'] = (train2_data['issue_date'] - base_time).dt.days

# The raw date is no longer needed once the derived features exist.
train2_data = train2_data.drop(columns='issue_date')


In [None]:
employer_type = train1_data['employer_type'].value_counts().index
industry = train1_data['industry'].value_counts().index

In [None]:
# Give every category an integer code in value_counts() order. enumerate()
# covers however many categories exist, whereas zipping against a fixed-length
# code list silently leaves out any category beyond that length.
emp_type_dict = {name: code for code, name in enumerate(employer_type)}
industry_dict = {name: code for code, name in enumerate(industry)}

In [None]:
# Encodings applied identically to both training frames. A value that is not a
# key of the map (NaN included) becomes NaN after .map().
# (The former bare `...['work_year'].dropna()` calls were removed: their result
# was discarded, so they had no effect.)
work_year_map = {'10+ years': 10, '2 years': 2, '< 1 year': 0, '3 years': 3, '1 year': 1,
     '5 years': 5, '4 years': 4, '6 years': 6, '8 years': 8, '7 years': 7, '9 years': 9}
class_map = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6}

for frame in (train1_data, train2_data):
    frame['work_year'] = frame['work_year'].map(work_year_map)
    frame['class'] = frame['class'].map(class_map)
    # emp_type_dict / industry_dict were built from train1_data's categories.
    frame['employer_type'] = frame['employer_type'].map(emp_type_dict)
    frame['industry'] = frame['industry'].map(industry_dict)

In [None]:
# Apply to the test frame the same date features and encodings used for the
# training frames.
test_data['issue_date'] = pd.to_datetime(test_data['issue_date'])
test_data['issue_date_y'] = test_data['issue_date'].dt.year
test_data['issue_date_m'] = test_data['issue_date'].dt.month

# Reference date for the day offset.
base_time = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
# Vectorised subtraction; gives the same day counts as a per-row apply.
test_data['issue_date_diff'] = (test_data['issue_date'] - base_time).dt.days
test_data = test_data.drop(columns='issue_date')

# A value that is not a key of the map (NaN included) becomes NaN after .map().
work_year_map = {'10+ years': 10, '2 years': 2, '< 1 year': 0, '3 years': 3, '1 year': 1,
     '5 years': 5, '4 years': 4, '6 years': 6, '8 years': 8, '7 years': 7, '9 years': 9}
test_data['work_year'] = test_data['work_year'].map(work_year_map)
test_data['class'] = test_data['class'].map({'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6})
test_data['employer_type'] = test_data['employer_type'].map(emp_type_dict)
test_data['industry'] = test_data['industry'].map(industry_dict)

In [None]:
# Remove the identifier columns before modelling.
train1 = train1_data.drop(['loan_id', 'user_id'], axis=1)
train2 = train2_data.drop(['loan_id', 'user_id'], axis=1)

# Stack both sources, discard every row that has a missing value, and only then
# renumber the rows so the resulting index is contiguous (resetting before
# dropna leaves gaps where rows were removed).
total_data = pd.concat([train1, train2]).dropna().reset_index(drop=True)


In [None]:

def clean_mon(x):
    """Convert an ``earlies_credit_mon`` string into an integer ``YYYYMM``.

    The string is searched for its first run of digits (the year) and its
    first run of letters (the month name), in whichever order they appear,
    e.g. ``'Apr-90'`` or ``'1-Dec'``.

    Year rules:
      * 0..22   -> 2000..2022
      * 23..99  -> 1923..1999
      * >= 100  -> used unchanged
      * no digits at all -> 2022

    Month rules: the first three letters are looked up case-insensitively;
    when there are no letters, or the name is not a known month, the month
    is 0.

    Args:
        x: the raw string value.

    Returns:
        ``year * 100 + month`` as an int.
    """
    mons = {'jan':1, 'feb':2, 'mar':3, 'apr':4,  'may':5,  'jun':6,
            'jul':7, 'aug':8, 'sep':9, 'oct':10, 'nov':11, 'dec':12}

    # Raw strings keep `\d` a regex escape rather than a string escape.
    year_group = re.search(r'(\d+)', x)
    if year_group:
        year = int(year_group.group(1))
        if year <= 22:
            # Includes 22 itself, which previously matched neither branch and
            # was left as the two-digit value.
            year += 2000
        elif year < 100:
            year += 1900
    else:
        year = 2022

    month_group = re.search(r'([a-zA-Z]+)', x)
    if month_group:
        # Three-letter prefix so longer spellings ('Sept', 'June') still match;
        # unknown names fall back to 0 like a missing month does.
        month = mons.get(month_group.group(1).lower()[:3], 0)
    else:
        month = 0

    return year*100 + month

total_data['earlies_credit_mon'] = total_data['earlies_credit_mon'].apply(clean_mon)



In [None]:
test_data['earlies_credit_mon'] = test_data['earlies_credit_mon'].apply(clean_mon)

In [None]:
# Features are every column except the label; labels are cast to int.
# Test features are the test frame without its identifier columns.
X_train = total_data.drop(columns=['isDefault'])
y_train = total_data['isDefault'].astype(int)
X_test = test_data.drop(columns=['loan_id', 'user_id'])

In [None]:
# Presumably the columns to be treated as categorical features.
# NOTE(review): this list is not referenced anywhere else in the visible
# notebook — confirm whether it should be passed to the models or removed.
cate_cols = ['class', 'employer_type', 'industry','house_exist','censor_status','region','use','post_code','policy_code','initial_list_status']


## 模型使用
1) LightGBM
2) NN

##### 使用internet和bank数据共同特征总量训练

In [None]:
# !pip3 install catboost

In [None]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
import gc

In [None]:
# Refresh the package index before installing, so the installs resolve against
# current package lists; -y answers apt's confirmation prompt automatically.
!sudo apt-get update
!sudo apt-get install -y --no-install-recommends git cmake build-essential libboost-dev libboost-system-dev libboost-filesystem-dev
!sudo apt-get install -y --no-install-recommends nvidia-375
!sudo apt-get install -y --no-install-recommends nvidia-opencl-icd-375 nvidia-opencl-dev opencl-headers

Reading package lists... Done
Building dependency tree       
Reading state information... Done
build-essential is already the newest version (12.4ubuntu1).
libboost-dev is already the newest version (1.65.1.0ubuntu1).
libboost-dev set to manually installed.
libboost-filesystem-dev is already the newest version (1.65.1.0ubuntu1).
libboost-filesystem-dev set to manually installed.
libboost-system-dev is already the newest version (1.65.1.0ubuntu1).
libboost-system-dev set to manually installed.
cmake is already the newest version (3.10.2-1ubuntu2.18.04.2).
git is already the newest version (1:2.17.1-1ubuntu0.9).
The following packages were automatically installed and are no longer required:
  cuda-command-line-tools-10-0 cuda-command-line-tools-10-1
  cuda-command-line-tools-11-0 cuda-compiler-10-0 cuda-compiler-10-1
  cuda-compiler-11-0 cuda-cuobjdump-10-0 cuda-cuobjdump-10-1
  cuda-cuobjdump-11-0 cuda-cupti-10-0 cuda-cupti-10-1 cuda-cupti-11-0
  cuda-cupti-dev-11-0 cuda-documentation-

In [None]:
!git clone --recursive https://github.com/Microsoft/LightGBM

Cloning into 'LightGBM'...
remote: Enumerating objects: 25098, done.[K
remote: Counting objects: 100% (774/774), done.[K
remote: Compressing objects: 100% (502/502), done.[K
remote: Total 25098 (delta 488), reused 446 (delta 268), pack-reused 24324[K
Receiving objects: 100% (25098/25098), 18.79 MiB | 22.32 MiB/s, done.
Resolving deltas: 100% (18434/18434), done.
Submodule 'include/boost/compute' (https://github.com/boostorg/compute) registered for path 'external_libs/compute'
Submodule 'eigen' (https://gitlab.com/libeigen/eigen.git) registered for path 'external_libs/eigen'
Submodule 'external_libs/fast_double_parser' (https://github.com/lemire/fast_double_parser.git) registered for path 'external_libs/fast_double_parser'
Submodule 'external_libs/fmt' (https://github.com/fmtlib/fmt.git) registered for path 'external_libs/fmt'
Cloning into '/content/LightGBM/external_libs/compute'...
remote: Enumerating objects: 21733, done.        
remote: Counting objects: 100% (5/5), done.       

In [None]:
%cd /content/LightGBM
!mkdir -p build
# Configure in the repository root with the GPU backend switched on.
# A lone `-D` in front of `-DUSE_GPU=1` makes CMake take `-DUSE_GPU=1` as the
# definition itself, i.e. a variable literally named `-DUSE_GPU`, so USE_GPU
# is never set and the build is CPU-only.
!cmake -DUSE_GPU=1 .
!make -j$(nproc)

/content/LightGBM
-- The C compiler identification is GNU 7.5.0
-- The CXX compiler identification is GNU 7.5.0
-- Check for working C compiler: /usr/bin/cc
-- Check for working C compiler: /usr/bin/cc -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Found OpenMP_C: -fopenmp (found version "4.5") 
-- Found OpenMP_CXX: -fopenmp (found version "4.5") 
-- Found OpenMP: TRUE (found version "4.5")  
-- Performing Test MM_PREFETCH
-- Performing Test MM_PREFETCH - Success
-- Using _mm_prefetch
-- Performing Test MM_MALLOC
-- Performing Test MM_MALLOC - Success
-- Using _mm_malloc
-- Configuring done
-- Generating done
  Manually-specif

In [None]:
# !git clone https://github.com/guolinke/boosting_tree_benchmarks.git
# !cd boosting_tree_benchmarks/data
# !wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
# !gunzip HIGGS.csv.gz
# !ls -ltra boosting_tree_benchmarks/data
# !python boosting_tree_benchmarks/data/higgs2libsvm.py
# !cd ../..
# !ln -s boosting_tree_benchmarks/data/higgs.train
# !ln -s boosting_tree_benchmarks/data/higgs.test
with open('lightgbm_gpu.conf', 'w') as f:
  f.write('''max_bin = 63
num_leaves = 255
num_iterations = 50
learning_rate = 0.1
tree_learner = serial
task = train
is_training_metric = false
min_data_in_leaf = 1
min_sum_hessian_in_leaf = 100
ndcg_eval_at = 1,3,5,10
device = gpu
num_threads=2
''')
# !./lightgbm config=lightgbm_gpu.conf data=higgs.train valid=higgs.test objective=binary metric=auc

In [None]:
# !rm -rf 

# !./lightgbm config=lightgbm_gpu.conf data=higgs.train valid=higgs.test objective=binary metric=auc/
# !./LightGBM/lightgbm config=LightGBM/lightgbm_gpu.conf data=LightGBM/higgs.train valid=LightGBM/higgs.test objective=binary metric=auc
# !./lightgbm config=lightgbm_gpu.conf data=higgs.train objective=binary metric=auc

In [None]:
# !pwd
# !cd LightGBM/

In [None]:
# pip options belong after the sub-command (`pip install --upgrade ...`);
# placed before it, pip rejects them with "no such option: --upgrade".
!pip install --upgrade --force-reinstall lightgbm --install-option=--gpu


Usage:   
  pip3 <command> [options]

no such option: --upgrade


In [None]:
%cd /content/LightGBM/python-package
!ls -ltra
!pip install setuptools
!sudo python setup.py install --precompile

/content/LightGBM/python-package
total 48
-rw-r--r--  1 root root 14454 Feb 18 08:58 README.rst
-rw-r--r--  1 root root  2369 Feb 18 08:58 MANIFEST.in
drwxr-xr-x  2 root root  4096 Feb 18 08:58 lightgbm
-rw-r--r--  1 root root 15696 Feb 18 08:58 setup.py
drwxr-xr-x  3 root root  4096 Feb 18 08:58 .
drwxr-xr-x 22 root root  4096 Feb 18 09:01 ..
running install
running build
running build_py
INFO:root:Generating grammar tables from /usr/lib/python3.7/lib2to3/Grammar.txt
INFO:root:Generating grammar tables from /usr/lib/python3.7/lib2to3/PatternGrammar.txt
creating build
creating build/lib
creating build/lib/lightgbm
copying lightgbm/sklearn.py -> build/lib/lightgbm
copying lightgbm/callback.py -> build/lib/lightgbm
copying lightgbm/libpath.py -> build/lib/lightgbm
copying lightgbm/__init__.py -> build/lib/lightgbm
copying lightgbm/dask.py -> build/lib/lightgbm
copying lightgbm/plotting.py -> build/lib/lightgbm
copying lightgbm/compat.py -> build/lib/lightgbm
copying lightgbm/basic.py -> 

In [None]:
# LightGBM: many trees with a small learning rate; the number actually used is
# decided by early stopping during fit.
lgb_params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'n_estimators': 20000,
        'random_state': 42,
        'learning_rate': 8e-3,
        'subsample': 0.6,
        'subsample_freq': 1,
        'colsample_bytree': 0.4,
        # 'reg_alpha': 10.0,
        # 'reg_lambda': 1e-1,
        'min_child_weight': 256,
        'min_child_samples': 500,
        'device' : 'gpu',
        'lambda_l1': 5, # L1 regularization
        'lambda_l2': 10 # L2 regularization
}



# XGBoost. The regularisation terms use XGBoost's own names (reg_alpha /
# reg_lambda); 'lambda_l1' / 'lambda_l2' are LightGBM names that XGBoost does
# not recognise, so the intended penalties were not applied.
xgb_params = {'n_estimators': 10000,
        'learning_rate': 0.03689407512484644,
        'max_depth': 8,
        'colsample_bytree': 0.3723914688159835,
        'subsample': 0.780714581166012,
        'eval_metric': 'auc',
        'use_label_encoder': False,
        'gamma': 0,
        # 'reg_lambda': 50.0,
        'random_state': 42,
        # NOTE(review): whether 'device' is honoured depends on the installed
        # XGBoost version (older releases select the GPU through tree_method) —
        # confirm against the version in use.
        'device' : 'gpu',
        'reg_alpha': 5, # L1 regularization
        'reg_lambda': 10 # L2 regularization
}

# cat_params = {'iterations': 17298,
#         'learning_rate': 0.03429054860458741,
#         'reg_lambda': 0.3242286463210283,
#         'subsample': 0.9433911589913944,
#         'random_strength': 22.4849972385133,
#         'depth': 8,
#         'min_data_in_leaf': 4,
#         'leaf_estimation_iterations': 8,
#         'task_type':"GPU",
#         'bootstrap_type':'Poisson',
#         'verbose' : 500,
#         'early_stopping_rounds' : 200,
#         'eval_metric' : 'AUC'}

# NOTE: `lgb` here rebinds the name that the first cell used for
# `import lightgbm as lgb`; from this point on it is the classifier instance.
lgb = LGBMClassifier(**lgb_params)
xgb = XGBClassifier(**xgb_params)
# cat = CatBoostClassifier(**cat_params)
# cat = CatBoostClassifier(**cat_params)

In [None]:
def get_oof(feats, target, test, kfold, clf):
  """Train ``clf`` fold by fold and collect out-of-fold and test predictions.

  For each split from ``kfold.split(feats, target)`` the classifier is refit
  on the training rows, with the validation rows as ``eval_set`` and early
  stopping. It then predicts the positive-class probability for the
  validation rows and for ``test``.

  Args:
    feats: training features, indexed positionally via ``.iloc``.
    target: training labels, indexed positionally via ``.iloc``.
    test: test features passed to ``clf.predict_proba``.
    kfold: splitter exposing ``split(X, y)`` and ``n_splits``.
    clf: estimator exposing ``fit`` and ``predict_proba``.

  Returns:
    ``(oof_preds, sub_preds_result)``: out-of-fold probabilities (one per
    training row) and the test probabilities averaged over ``n_splits``.
    The overall ROC AUC of the out-of-fold predictions is printed.
  """
  oof_preds = np.zeros(feats.shape[0])
  test_pred_total = np.zeros(test.shape[0])

  fold_number = 0
  for fit_rows, holdout_rows in kfold.split(feats, target):
    fold_number += 1
    print("fold n°{}".format(fold_number))

    fit_X = feats.iloc[fit_rows]
    fit_y = target.iloc[fit_rows]
    holdout_X = feats.iloc[holdout_rows]
    holdout_y = target.iloc[holdout_rows]

    clf.fit(fit_X, fit_y, eval_set=[(holdout_X, holdout_y)],
            verbose=500, early_stopping_rounds=500)
    oof_preds[holdout_rows] = clf.predict_proba(holdout_X)[:, 1]
    test_pred_total = test_pred_total + clf.predict_proba(test)[:, 1]

    # Release this fold's slices before the next split.
    del fit_X, fit_y, holdout_X, holdout_y
    gc.collect()

  overall_auc = roc_auc_score(target, oof_preds)
  print('*' * 10)
  print('roc auc score:', overall_auc)
  print('*' * 20)
  return oof_preds, test_pred_total / kfold.n_splits



In [None]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
oof_preds_1, sub_preds_1 = get_oof(X_train, y_train, X_test, kfold, lgb)
oof_preds_2, sub_preds_2 = get_oof(X_train, y_train, X_test, kfold, xgb)
# oof_preds_3, sub_preds_3 = get_oof(X_train, y_train, X_test, kfold, cat)

NameError: ignored

In [None]:
from sklearn.linear_model import RidgeClassifier
import numpy as np

def stack_model(train_stack, test_stack, y, cv=None):
    """Fit a ridge-classifier meta-model on base-model predictions, fold by fold.

    Parameters
    ----------
    train_stack : DataFrame
        Base-model predictions for the training rows, one column per model
        (e.g. the out-of-fold predictions oof_1 ... oof_n).
    test_stack : DataFrame
        Base-model predictions for the test rows, same column layout.
    y : Series
        Binary target, aligned positionally with ``train_stack``.
    cv : splitter, optional
        Object exposing ``split(X, y)`` and ``n_splits``. When omitted, the
        notebook-level ``kfold`` is used, as before.

    Returns
    -------
    (oof, predictions)
        ``oof``: meta-model score for each training row, produced by the fold
        in which that row was held out. ``predictions``: test scores averaged
        over the folds. The mean per-fold ROC AUC is printed.
    """
    splitter = kfold if cv is None else cv

    def _to_prob(scores):
        # Logistic squashing of the decision values. For a binary target this
        # is what the private `_predict_proba_lr(...)[:, 1]` computes, without
        # depending on a private scikit-learn method.
        return 1.0 / (1.0 + np.exp(-scores))

    oof = np.zeros((train_stack.shape[0],))
    predictions = np.zeros((test_stack.shape[0],))
    scores = []

    for fold_, (trn_idx, val_idx) in enumerate(splitter.split(train_stack, y)):
        trn_data, trn_y = train_stack.iloc[trn_idx], y.iloc[trn_idx]
        val_data, val_y = train_stack.iloc[val_idx], y.iloc[val_idx]

        clf = RidgeClassifier(random_state=2099)
        clf.fit(trn_data, trn_y)

        oof[val_idx] = _to_prob(clf.decision_function(val_data))
        predictions += _to_prob(clf.decision_function(test_stack)) / splitter.n_splits

        score_single = roc_auc_score(val_y, oof[val_idx])
        scores.append(score_single)
    print('mean: ', np.mean(scores))

    return oof, predictions

In [None]:
# One column per base model: fold-averaged test predictions and out-of-fold
# training predictions, stacked side by side and wrapped as DataFrames.
pred_matrix = np.column_stack([sub_preds_1, sub_preds_2])
oof_matrix = np.column_stack([oof_preds_1, oof_preds_2])
pred_df, oof_df = pd.DataFrame(pred_matrix), pd.DataFrame(oof_matrix)
oof_stack, predictions_stack = stack_model(oof_df, pred_df, y_train)

In [None]:
# Write the stacked predictions next to their loan ids, without the row index.
submission = pd.DataFrame({
    'id': test_data['loan_id'],
    'isDefault': predictions_stack,
})
submission.to_csv('submission_new.csv', index=False)

In [None]:
import gc

from sklearn.model_selection import (train_test_split, GridSearchCV)
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from tqdm import tqdm
from sklearn.metrics import (roc_curve, auc, accuracy_score)

In [None]:
import lightgbm
from sklearn import metrics

In [None]:
# Parameter dictionary for the native LightGBM training call made in a later
# cell. Only some of these values are also forwarded to the sklearn-style
# `mdl` below (max_depth, max_bin, subsample_for_bin, subsample,
# min_split_gain, min_child_weight, min_child_samples).
params = {
    'application': 'binary', # for binary classification
#     'num_class' : 1, # used for multi-classes
    'boosting': 'dart', # DART boosting (not the plain 'gbdt' booster)
    'num_iterations': 500, 
    'learning_rate': 0.005,
    'num_leaves': 50,
    'device': 'cpu', # you can use GPU to achieve faster learning
    'max_depth': -1, # <0 means no limit
    'max_bin': 400, # Small number of bins may reduce training accuracy but can deal with over-fitting
    'lambda_l1': 5, # L1 regularization
    'lambda_l2': 10, # L2 regularization
    'metric' : 'binary_error',
    'subsample_for_bin': 200, # number of samples for constructing bins
    'subsample': 1, # subsample ratio of the training instance
    'colsample_bytree': 0.8, # subsample ratio of columns when constructing the tree
    'min_split_gain': 0.5, # minimum loss reduction required to make further partition on a leaf node of the tree
    'min_child_weight': 1, # minimum sum of instance weight (hessian) needed in a leaf
    'min_child_samples': 5# minimum number of data needed in a leaf
}

# sklearn-style classifier used as the estimator for the grid search below.
# NOTE(review): `lgb` must be the lightgbm module here; an earlier cell rebinds
# `lgb` to a classifier instance, so this relies on the re-import that precedes
# this cell.
mdl = lgb.LGBMClassifier(boosting_type= 'dart', 
          objective = 'binary', 
          n_jobs = 5, 
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'], 
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'], 
          min_split_gain = params['min_split_gain'], 
          min_child_weight = params['min_child_weight'], 
          min_child_samples = params['min_child_samples'])

# Show the names of all parameters the classifier accepts.
mdl.get_params().keys()


In [None]:
# Exhaustive grid search over a small LightGBM parameter grid (3-fold CV).
# NOTE(review): X_res / y_res are only assigned in the NN data-preparation
# cell further down the notebook, so on a fresh kernel this cell cannot run in
# top-to-bottom order.
gridParams = {
    'learning_rate': [0.005, 0.01],
    'n_estimators': [8,16,24],
    'num_leaves': [6,8,12,16], # large num_leaves helps improve accuracy but might lead to over-fitting
    # 'boosting_type' : ['gbdt', 'dart'], # for better accuracy -> try dart
    'objective' : ['binary'],
    'max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
    'random_state' : [500],
    # 'colsample_bytree' : [0.64, 0.65, 0.66],
    # 'subsample' : [0.7,0.75],
    # 'reg_alpha' : [1,1.2],
    # 'reg_lambda' : [1,1.2,1.4],
    }

# No `scoring` is given, so candidates are ranked by the estimator's own
# score method (presumably accuracy for a classifier — TODO confirm).
grid = GridSearchCV(mdl, gridParams, verbose=1, cv=3, n_jobs=5)
# Run the grid
grid.fit(X_res, y_res)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)


In [None]:
# Copy the winning grid-search values into `params` for the native training call.
# params['colsample_bytree'] = grid.best_params_['colsample_bytree']
params['learning_rate'] = grid.best_params_['learning_rate'] 
params['max_bin'] = grid.best_params_['max_bin']
params['num_leaves'] = grid.best_params_['num_leaves']
# NOTE(review): `params` already holds 'num_iterations'; 'n_estimators' looks
# like another name for the same LightGBM setting — confirm which one wins.
params['n_estimators'] = grid.best_params_['n_estimators']
# params['reg_alpha'] = grid.best_params_['reg_alpha']
# params['reg_lambda'] = grid.best_params_['reg_lambda']
# params['subsample'] = grid.best_params_['subsample']


# X_test = np.array(test.drop(['id'], axis=1))
# ids = test['id'].values


# Hold out 10% of the rows for validation. This rebinds X_train / y_train.
X_train, X_valid, y_train, y_valid = train_test_split(X_res, y_res, test_size=0.1, random_state = 42)
    
# X_res / y_res are deleted here; later cells that use them need the
# data-preparation cell to be run again first.
del X_res, y_res; gc.collect();

d_train = lgb.Dataset(X_train, label=y_train)
d_valid = lgb.Dataset(X_valid, label=y_valid) 

# Both sets are evaluated during training.
watchlist = [d_train, d_valid]


In [None]:
from sklearn.metrics import roc_auc_score
# Train with the native LightGBM API, evaluating on `watchlist` every 4 rounds.
# NOTE(review): `params` sets 'boosting': 'dart'; early stopping may not take
# effect with that booster — confirm against the LightGBM documentation.
model = lgb.train(params, train_set=d_train, num_boost_round=1000, valid_sets=watchlist, early_stopping_rounds=50, verbose_eval=4)

# Score the held-out validation rows and report their ROC AUC.
p_valid = model.predict(X_valid)
print('LGBM: ', roc_auc_score(y_valid, p_valid))

In [None]:
import matplotlib.pyplot as plt
lgbm_fpr, lgbm_tpr, lgbm_thresold = roc_curve(y_valid, p_valid)

def graph_roc_curve_multiple(lgbm_fpr, lgbm_tpr):
    """Plot the LGBM ROC curve on a new 8x8 figure with a chance diagonal.

    ``lgbm_fpr`` / ``lgbm_tpr`` are the curve's x / y values. The AUC shown in
    the legend is not derived from these arguments: it is recomputed from the
    notebook-level ``y_valid`` and ``p_valid``.
    """
    plt.figure(figsize=(8,8))
    plt.title('ROC Curve \n of LGBM', fontsize=18)
    plt.plot(lgbm_fpr, lgbm_tpr, label='LGBM Classifier Score: {:.4f}'.format(roc_auc_score(y_valid, p_valid)))
    # Dashed diagonal marks the chance line.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([-0.01, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.annotate('Minimum ROC Score of 50% \n (This is the minimum score to get)', xy=(0.5, 0.5), xytext=(0.6, 0.3),
                arrowprops=dict(facecolor='#6E726D', shrink=0.05),
                )
    plt.legend()
    
graph_roc_curve_multiple(lgbm_fpr, lgbm_tpr)
plt.show()

In [None]:
# clf_ex=lightgbm.LGBMRegressor(n_estimators = 200)
# clf_ex.fit(X = X_train, y = y_train)
# clf_ex.booster_.save_model('LGBMmode.txt')
# pred_train = clf_ex.predict(X_train)
# pred = clf_ex.predict(X_test)

In [None]:
# submission
# NOTE(review): `pred` is only assigned in commented-out code in the visible
# notebook, and 'loan_id' is dropped wherever X_test is built from test_data —
# confirm what this cell should write before running it.
submission = pd.DataFrame({'id':X_test['loan_id'], 'is_default':pred})
submission.to_csv('submission.csv', index = None)

In [None]:
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
print('LGBM: ', roc_auc_score(y_train, pred_train))

In [None]:
import matplotlib.pyplot as plt
lgbm_fpr, lgbm_tpr, lgbm_thresold = roc_curve(y_train, pred_train)

def graph_roc_curve_multiple(lgbm_fpr, lgbm_tpr):
    """Plot the LGBM ROC curve on a new 8x8 figure with a chance diagonal.

    ``lgbm_fpr`` / ``lgbm_tpr`` are the curve's x / y values. The AUC shown in
    the legend is recomputed from the notebook-level ``y_train`` and
    ``pred_train``. This redefines the function of the same name from an
    earlier cell.
    """
    plt.figure(figsize=(8,8))
    plt.title('ROC Curve \n of LGBM', fontsize=18)
    plt.plot(lgbm_fpr, lgbm_tpr, label='LGBM Classifier Score: {:.4f}'.format(roc_auc_score(y_train, pred_train)))
    # Dashed diagonal marks the chance line.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([-0.01, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.annotate('Minimum ROC Score of 50% \n (This is the minimum score to get)', xy=(0.5, 0.5), xytext=(0.6, 0.3),
                arrowprops=dict(facecolor='#6E726D', shrink=0.05),
                )
    plt.legend()
    
graph_roc_curve_multiple(lgbm_fpr, lgbm_tpr)
plt.show()

In [None]:
# submission
# NOTE(review): `pred` is only assigned in commented-out code in the visible
# notebook — confirm which predictions this cell should write.
submission = pd.DataFrame({'id':test['loan_id'], 'is_default':pred})
submission.to_csv('submission.csv', index = None)

In [None]:
# LogisticRegression is instantiated below, so its import must be active.
from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections
from sklearn.model_selection import train_test_split

# This is explicitly used for undersampling.
# NOTE: this rebinds X_train / X_test / y_train (and creates y_test) as an
# 80/20 split of X_res / y_res, replacing whatever those names held before.
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
# Work on plain arrays from here on.
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

classifiers = {
    "LogisiticRegression": LogisticRegression(),
    # "KNearest": KNeighborsClassifier(),
    # "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier()
}

from sklearn.model_selection import cross_val_score


for key, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    # Scale to percent before rounding, so the printed number carries no
    # floating-point artefacts from multiplying an already-rounded value.
    print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", round(training_score.mean() * 100, 2), "% accuracy score")




In [None]:
# Use GridSearchCV to find the best parameters.
from sklearn.model_selection import GridSearchCV


# Logistic Regression 
# log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}



# grid_log_reg = GridSearchCV(LogisticRegression(), log_reg_params)
# grid_log_reg.fit(X_train, y_train)
# We automatically get the logistic regression with the best parameters.
# log_reg = grid_log_reg.best_estimator_

# knears_params = {"n_neighbors": list(range(2,5,1)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

# grid_knears = GridSearchCV(KNeighborsClassifier(), knears_params)
# grid_knears.fit(X_train, y_train)
# # KNears best estimator
# knears_neighbors = grid_knears.best_estimator_

# # Support Vector Classifier
# svc_params = {'C': [0.5, 0.7, 0.9, 1], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}
# grid_svc = GridSearchCV(SVC(), svc_params)
# grid_svc.fit(X_train, y_train)

# # SVC best estimator
# svc = grid_svc.best_estimator_

# DecisionTree Classifier
# Grid: split criterion in {gini, entropy}, max_depth in {2, 3},
# min_samples_leaf in {5, 6}.
tree_params = {"criterion": ["gini", "entropy"], "max_depth": list(range(2,4,1)), 
              "min_samples_leaf": list(range(5,7,1))}
grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params)
grid_tree.fit(X_train, y_train)

# Keep the estimator refit with the best parameter combination.
tree_clf = grid_tree.best_estimator_

In [None]:
tree_score = cross_val_score(tree_clf, X_train, y_train, cv=5)
print('DecisionTree Classifier Cross Validation Score', round(tree_score.mean() * 100, 2).astype(str) + '%')

In [None]:
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
# Cross-validated predictions for each classifier (5 folds).

# NOTE(review): `log_reg` is only assigned in commented-out code in the visible
# notebook, so this line fails unless that grid search is restored.
log_reg_pred = cross_val_predict(log_reg, X_train, y_train.ravel(), cv=5, method="decision_function")

# knears_pred = cross_val_predict(knears_neighbors, X_train, y_train, cv=5)

# svc_pred = cross_val_predict(svc, X_train, y_train.ravel(), cv=5,
#                              method="decision_function")
# No `method` is given here, so this uses cross_val_predict's default output
# rather than the decision_function scores requested above.
tree_pred = cross_val_predict(tree_clf, X_train, y_train, cv=5)

In [None]:
from sklearn.metrics import roc_auc_score

print('Logistic Regression: ', roc_auc_score(y_train, log_reg_pred))
# print('KNears Neighbors: ', roc_auc_score(y_train, knears_pred))
# print('Support Vector Classifier: ', roc_auc_score(y_train, svc_pred))
print('Decision Tree Classifier: ', roc_auc_score(y_train, tree_pred))

In [None]:
import matplotlib.pyplot as plt
log_fpr, log_tpr, log_thresold = roc_curve(y_train, log_reg_pred)
# knear_fpr, knear_tpr, knear_threshold = roc_curve(y_train, knears_pred)
# svc_fpr, svc_tpr, svc_threshold = roc_curve(y_train, svc_pred)
tree_fpr, tree_tpr, tree_threshold = roc_curve(y_train, tree_pred)


def graph_roc_curve_multiple(log_fpr, log_tpr, tree_fpr, tree_tpr):
    """Plot the logistic-regression and decision-tree ROC curves on one figure.

    The ``*_fpr`` / ``*_tpr`` arguments are the x / y values of each curve. The
    AUC values in the legend are recomputed from the notebook-level
    ``y_train``, ``log_reg_pred`` and ``tree_pred``. This redefines the
    two-argument function of the same name from earlier cells.
    """
    plt.figure(figsize=(8,8))
    plt.title('ROC Curve \n Top 3 Classifiers', fontsize=18)
    plt.plot(log_fpr, log_tpr, label='Logistic Regression Classifier Score: {:.4f}'.format(roc_auc_score(y_train, log_reg_pred)))
    plt.plot(tree_fpr, tree_tpr, label='Decision Tree Classifier Score: {:.4f}'.format(roc_auc_score(y_train, tree_pred)))
    # Dashed diagonal marks the chance line.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([-0.01, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.annotate('Minimum ROC Score of 50% \n (This is the minimum score to get)', xy=(0.5, 0.5), xytext=(0.6, 0.3),
                arrowprops=dict(facecolor='#6E726D', shrink=0.05),
                )
    plt.legend()
    
graph_roc_curve_multiple(log_fpr, log_tpr, tree_fpr, tree_tpr)
plt.show()

In [None]:
# submission
# NOTE(review): `pred` is only assigned in commented-out code in the visible
# notebook — confirm which predictions this cell should write.
submission = pd.DataFrame({'id':test['loan_id'], 'is_default':pred})
submission.to_csv('submission.csv', index = None)

#### NN

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

from keras.models import Sequential
from keras.layers import Dense , Dropout , Lambda, Flatten
from tensorflow.keras.optimizers import Adam, RMSprop
from sklearn.model_selection import train_test_split
from keras import  backend as K
from keras import models
from keras.preprocessing.image import ImageDataGenerator

In [None]:
# Data preparation for the NN (the standardisation itself happens in later cells).
import numpy as np
# Same as the tree-model preparation, but 'earlies_credit_mon' is dropped too.
train1 = train1_data.drop(['earlies_credit_mon','loan_id','user_id'], axis = 1, inplace = False)
# y_train1 = train_bank[['loan_id','isDefault']]

train2 = train2_data.drop(['earlies_credit_mon','loan_id','user_id'], axis = 1, inplace = False)
# y_train2 = train_internet[['loan_id','isdefault']]
# X_train = pd.concat([X_train1, X_train2])
# y_train = pd.concat([y_train1, y_train2])
total_data=pd.concat([train1,train2])
total_data = total_data.dropna()
# Undersampling: all default rows plus the first 137888 non-default rows.
default_df = total_data.loc[total_data['isDefault'] == 1]
ndefault_df = total_data.loc[total_data['isDefault'] == 0][:137888]  ## 555177 not default rows and 137888 default rows
resample_df = pd.concat([default_df, ndefault_df])

# Shuffle dataframe rows
new_df = resample_df.sample(frac=1, random_state=42)
X_res = new_df.drop(['isDefault'], axis = 1, inplace = False)
y_res = new_df['isDefault']

# NOTE(review): these two assignments replace the undersampled X_res / y_res
# built just above with the full, unbalanced data, so the resampling has no
# effect on anything downstream — confirm which version is intended.
X_res = total_data.drop(['isDefault'], axis = 1, inplace = False)
y_res = total_data['isDefault']
X_test = test_data.drop(['earlies_credit_mon','loan_id','user_id'], axis = 1, inplace = False)


In [None]:
# Fill missing values with 0 in both feature frames.
X_res = X_res.fillna(0)
X_test = X_test.fillna(0)

In [None]:
# Convert both feature frames to NumPy arrays (rebinds X_train and X_test).
X_train = X_res.to_numpy()
X_test=X_test.to_numpy()
# mean_px = X_res.mean().astype(np.float32)
# Per-column mean of the training features.
mean_px = X_train.mean(axis=0)
# std_px = X_res.std().astype(np.float32)
# Per-column standard deviation of the training features.
std_px = X_train.std(axis=0)
# Standardise `x` with the training mean/std computed above. A column whose
# std is 0 makes this divide by zero.
# NOTE(review): not called anywhere in the visible notebook.
def standardize(x): 
    return (x-mean_px)/std_px

In [None]:
# Build the float32 inputs for the Keras model from *standardized* features.
# Previously the standardized arrays were computed and then overwritten with
# the raw values, so the network was trained on unscaled inputs.

# Zero-variance columns would divide by zero; treat their std as 1 so the
# column becomes all zeros instead of NaN/inf.
_safe_std = np.where(std_px == 0, 1.0, std_px)

X_train_NN = ((X_res.values - mean_px) / _safe_std).astype('float32')
y_train_NN = y_res

# The test rows are scaled with the training statistics, never their own.
X_test_NN = ((np.asarray(X_test) - mean_px) / _safe_std).astype('float32')

In [None]:
# Changes in this version: different weight initialization, added normalization layers,
# added dropout, switched to different metrics
seed = 43
np.random.seed(seed)  # seeds NumPy's global RNG
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from tensorflow.keras.regularizers import l2
from tensorflow.keras.metrics import AUC

def auroc(y_true, y_pred):
    """Wrap scikit-learn's `roc_auc_score` as a TensorFlow op.

    Parameters
    ----------
    y_true, y_pred : tensors handed to `roc_auc_score` as NumPy arrays.

    Returns
    -------
    A `tf.double` tensor holding the ROC AUC of the given values.

    `tf.numpy_function` is the TF2 replacement for the deprecated
    `tf.compat.v1.py_func` and keeps the same NumPy-in / NumPy-out semantics.
    """
    return tf.numpy_function(roc_auc_score, (y_true, y_pred), tf.double)

input_shape = X_train_NN.shape[1]  # number of input features (columns of X_train_NN)
b_size = 2000     # mini-batch size used by model.fit
max_epochs = 20   # number of training epochs used by model.fit

# NOTE: this rebinds `K`, which an earlier cell imported as `keras.backend`.
import tensorflow.keras as K

# np.random.seed alone does not control TensorFlow's weight initialization or
# dropout masks; seed TensorFlow as well so runs are repeatable.
tf.random.set_seed(seed)

# NOTE(review): `init` is not passed to any layer in this cell — every Dense
# layer below uses 'he_normal'. Kept in case a later cell refers to it.
init = K.initializers.glorot_uniform(seed=1)

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
simple_adam = K.optimizers.Adam(learning_rate=0.001)

# 256 -> 64 -> 1 fully connected network; each hidden layer is followed by
# layer normalization and 30% dropout, and the output is a sigmoid probability.
model = K.models.Sequential()
model.add(K.layers.Dense(units=256, input_dim=input_shape, kernel_initializer='he_normal', activation='relu',kernel_regularizer=l2(0.0001)))
model.add(K.layers.LayerNormalization())
model.add(K.layers.Dropout(0.3))
model.add(K.layers.Dense(units= 64, kernel_initializer='he_normal', activation='relu'))
model.add(K.layers.LayerNormalization())
model.add(K.layers.Dropout(0.3))
model.add(K.layers.Dense(units=1, kernel_initializer='he_normal', activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=simple_adam, metrics=['accuracy',AUC(name='auc')])

In [None]:
# Print the model's layer table (layers, output shapes, parameter counts).
model.summary()

In [None]:
print("Starting NN training")
# Train on the full NN training arrays; the returned History object is kept in `h`.
h = model.fit(
    x=X_train_NN,
    y=y_train_NN,
    batch_size=b_size,
    epochs=max_epochs,
    shuffle=True,
    verbose=1,
)
print("NN training finished")

In [None]:
# model.predict returns one row per test sample; keep the first (only) value of each row.
pred_NN = [row[0] for row in model.predict(X_test_NN)]

In [None]:
# Persist the trained Keras model.
model.save('NN_model.h5')

# Write the NN predictions as a submission file.
# NOTE(review): this is the same 'submission.csv' path the earlier submission
# cell writes to, so that file is replaced here.
nn_submission_columns = {'id': test['loan_id'], 'is_default': pred_NN}
submission = pd.DataFrame(nn_submission_columns)
submission.to_csv('submission.csv', index=None)

# 其他尝试...

PyTorch NN

In [None]:
# pytorch mlp for binary classification
from numpy import vstack
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch import Tensor
import torch.nn as nn
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Sigmoid
from torch.nn import Module
from torch.optim import SGD
from torch.nn import BCELoss
from torch.nn.init import kaiming_uniform_
from torch.nn.init import xavier_uniform_
from torch.optim import optimizer
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Show which device was selected.
print(device)

cpu


In [None]:
# Hold out 33% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_res,
    y_res,
    test_size=0.33,
    random_state=69,
)

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits so no validation information leaks into it.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)

In [None]:
EPOCHS = 50  # passes over the training loader; author's note: "100" (presumably an alternative value — confirm)
BATCH_SIZE = 100  # mini-batch size of the training DataLoader; author's note: "change to 1000"
LEARNING_RATE = 0.001  # learning rate handed to the Adam optimizer

In [None]:
# Tensor copies of the scaled training features and of the training labels.
X_train_tensor, y_train_tensor = (
    torch.Tensor(X_train),
    torch.Tensor(y_train.values),
)

In [None]:
## train data
class trainData(Dataset):
    """Map-style dataset that pairs each feature row with its label."""

    def __init__(self, X_data, y_data):
        # Both containers are stored as given and indexed with the same position.
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        # The sample count is taken from the feature container.
        return len(self.X_data)

    def __getitem__(self, index):
        features = self.X_data[index]
        label = self.y_data[index]
        return features, label


# Wrap the training split as float tensors inside the dataset class.
train_features = torch.FloatTensor(X_train)
train_labels = torch.FloatTensor(y_train.values)
train_data = trainData(train_features, train_labels)
## test data    
class testData(Dataset):
    """Map-style dataset over feature rows only (no labels)."""

    def __init__(self, X_data):
        # Stored as given; items are looked up by position.
        self.X_data = X_data

    def __len__(self):
        n_rows = len(self.X_data)
        return n_rows

    def __getitem__(self, index):
        row = self.X_data[index]
        return row
    

# NOTE(review): despite the name, this wraps the hold-out validation features (X_valid).
valid_features = torch.FloatTensor(X_valid)
test_data = testData(valid_features)

In [None]:
# Training batches of BATCH_SIZE rows, reshuffled on every pass.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
# One row per batch, served in the dataset's own order (no shuffling).
test_loader = DataLoader(test_data, batch_size=1)

In [None]:
class binaryClassification(nn.Module):
    """Two-hidden-layer MLP that emits one raw logit per sample.

    The forward pass does not apply a sigmoid, so the output is meant for a
    logit-based loss such as ``nn.BCEWithLogitsLoss``.

    Parameters
    ----------
    input_dim : int, default 34
        Number of input features per sample.
    hidden_dim : int, default 64
        Width of both hidden layers.
    dropout_p : float, default 0.1
        Dropout probability applied before the output layer.

    The defaults reproduce the previously hard-coded architecture, so
    ``binaryClassification()`` behaves exactly as before.
    """

    def __init__(self, input_dim=34, hidden_dim=64, dropout_p=0.1):
        super().__init__()
        # Layers are created in the same order and under the same attribute
        # names as before, so saved state_dict keys stay compatible.
        self.layer_1 = nn.Linear(input_dim, hidden_dim)
        self.layer_2 = nn.Linear(hidden_dim, hidden_dim)
        self.layer_out = nn.Linear(hidden_dim, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_p)
        self.batchnorm1 = nn.BatchNorm1d(hidden_dim)
        self.batchnorm2 = nn.BatchNorm1d(hidden_dim)

    def forward(self, inputs):
        """Map a (batch, input_dim) tensor to (batch, 1) logits."""
        # Each hidden block is Linear -> ReLU -> BatchNorm.
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        # Dropout is applied once, right before the output layer.
        x = self.dropout(x)
        x = self.layer_out(x)

        return x

In [None]:
# Build the network and move its parameters to the selected device.
# NOTE: this rebinds `model`, which earlier referred to the Keras network.
model = binaryClassification().to(device)
print(model)

# The network outputs raw logits, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
###################### OUTPUT ######################

binaryClassification(
  (layer_1): Linear(in_features=34, out_features=64, bias=True)
  (layer_2): Linear(in_features=64, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [None]:
def binary_acc(y_pred, y_test):
    """Return the batch accuracy as a whole-number percentage (0-dim tensor).

    `y_pred` holds raw logits: they are passed through a sigmoid and rounded
    to hard 0/1 labels, then compared element-wise with `y_test`. The hit
    count is divided by the size of `y_test`'s first dimension.
    """
    hard_labels = torch.sigmoid(y_pred).round()
    n_hits = torch.eq(hard_labels, y_test).sum().float()
    # Fraction correct -> percent, rounded to the nearest integer value.
    return torch.round(n_hits / y_test.size(0) * 100)

In [None]:
# Switch Dropout/BatchNorm to training behaviour before optimising.
model.train()
for e in range(1, EPOCHS+1):
    # Running sums over the batches of this epoch.
    epoch_loss, epoch_acc = 0, 0

    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        # The model emits (batch, 1) logits, so give the labels the same shape.
        targets = y_batch.unsqueeze(1)

        # Clear gradients left over from the previous step, then run forward.
        optimizer.zero_grad()
        y_pred = model(X_batch)

        loss = criterion(y_pred, targets)
        acc = binary_acc(y_pred, targets)

        # Backpropagate and update the weights.
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    # Report the per-batch averages for the epoch.
    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

Epoch 001: | Loss: 0.42281 | Acc: 80.707
Epoch 002: | Loss: 0.40558 | Acc: 81.368
Epoch 003: | Loss: 0.40391 | Acc: 81.470
Epoch 004: | Loss: 0.40289 | Acc: 81.492
Epoch 005: | Loss: 0.40199 | Acc: 81.571
Epoch 006: | Loss: 0.40132 | Acc: 81.561
Epoch 007: | Loss: 0.40106 | Acc: 81.575
Epoch 008: | Loss: 0.40056 | Acc: 81.606
Epoch 009: | Loss: 0.40019 | Acc: 81.610
Epoch 010: | Loss: 0.40002 | Acc: 81.677
Epoch 011: | Loss: 0.39960 | Acc: 81.655
Epoch 012: | Loss: 0.39932 | Acc: 81.671
Epoch 013: | Loss: 0.39930 | Acc: 81.683
Epoch 014: | Loss: 0.39914 | Acc: 81.716
Epoch 015: | Loss: 0.39887 | Acc: 81.733
Epoch 016: | Loss: 0.39869 | Acc: 81.714
Epoch 017: | Loss: 0.39857 | Acc: 81.699
Epoch 018: | Loss: 0.39856 | Acc: 81.712
Epoch 019: | Loss: 0.39842 | Acc: 81.701
Epoch 020: | Loss: 0.39829 | Acc: 81.726
Epoch 021: | Loss: 0.39808 | Acc: 81.727
Epoch 022: | Loss: 0.39800 | Acc: 81.726
Epoch 023: | Loss: 0.39767 | Acc: 81.705
Epoch 024: | Loss: 0.39782 | Acc: 81.707
Epoch 025: | Los

In [None]:
# Collect hard 0/1 predictions for every row served by test_loader.
y_pred_list = []
model.eval()  # inference behaviour for the Dropout and BatchNorm layers
with torch.no_grad():  # no gradients are needed for prediction
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        # Logits -> probabilities -> hard labels.
        y_test_pred = torch.sigmoid(model(X_batch))
        y_pred_tag = torch.round(y_test_pred)
        # Flatten every batch before appending so the result is a flat list of
        # floats for ANY loader batch size (previously only for batch_size=1).
        y_pred_list.extend(y_pred_tag.cpu().numpy().ravel().tolist())

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the hold-out labels against the hard predictions.
confusion_matrix(y_true=y_valid, y_pred=y_pred_list)

array([[176196,   6914],
       [ 35077,  10525]])

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the hold-out split.
validation_report = classification_report(y_true=y_valid, y_pred=y_pred_list)
print(validation_report)

              precision    recall  f1-score   support

           0       0.83      0.96      0.89    183110
           1       0.60      0.23      0.33     45602

    accuracy                           0.82    228712
   macro avg       0.72      0.60      0.61    228712
weighted avg       0.79      0.82      0.78    228712

