In [3]:
import numpy as np
import pandas as pd
import datetime as dt
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
import pickle
import datetime as dt
import scipy
from sklearn.metrics import roc_auc_score,mean_squared_error
def get_feature_importances(train_data,label, cat_cols,seed=None):
    """Fit a LightGBM multiclass booster and return per-feature importances.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Feature matrix. Its column names populate the ``feature`` column of
        the result.
    label : array-like
        Class labels aligned with the rows of ``train_data``. The model is
        configured with ``num_class=33``.
    cat_cols : list of str
        Column names handed to LightGBM as ``categorical_feature``.
    seed : int, optional
        Random seed for the booster. When omitted, 2019 is used, which is the
        value that was previously hard-coded (the argument used to be ignored).

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns ``feature``, ``importance_gain`` and
        ``importance_split``.
    """
    # Honour the caller's seed; fall back to the historical constant.
    random_state = 2019 if seed is None else seed

    # Plain gradient-boosted trees (no boosting type is set, so this is not
    # LightGBM's random-forest mode).
    dtrain = lgb.Dataset(train_data, label, free_raw_data=False, silent=True,categorical_feature=cat_cols)
    lgb_params = {'num_leaves': 50,
                'num_threads':8,
                'device_type':'cpu',
                'max_bin':150,
                 'min_data_in_leaf': 30, 
                 'objective':'multiclass',
                 'max_depth': 3,
                 "metric": 'multi_logloss',
                 "lambda_l1": 0.2,
                 'colsample_bytree': 0.7, 
                 "lambda_l2":0.2,
                 "verbosity": -1,
                 "random_state": random_state,
                 "num_class":33}
    
    # Fit the model for a fixed number of rounds (no validation set / early stopping).
    clf = lgb.train(params=lgb_params, train_set=dtrain, num_boost_round=300)

    # Collect both importance flavours side by side.
    imp_df = pd.DataFrame()
    imp_df["feature"] = list(train_data.columns)
    imp_df["importance_gain"] = clf.feature_importance(importance_type='gain')
    imp_df["importance_split"] = clf.feature_importance(importance_type='split')
    
    return imp_df

In [4]:
# Display the installed pandas version (the recorded output below shows 0.24.2).
pd.__version__

'0.24.2'

In [17]:
# Load a previously saved feature-importance table; the first CSV column is used as the index.
# NOTE(review): a later cell writes "feature_importance.csv" via imp_df.to_csv, so on a
# fresh checkout this read presumably needs that cell to have been run once — confirm.
feature_imp = pd.read_csv("feature_importance.csv",index_col=0)

In [67]:
# Read the tab-separated train / eval / test files, assigning column names from
# `data_cols`, and build:
#   total_train - train + eval rows (still carrying 'label')
#   train_num   - number of labelled rows, used later to split features back apart
#   total_data  - train + eval + test rows without 'label'
train_data = pd.read_csv("ppd_model/train/train_data.csv",sep='\t',names=data_cols)
eval_data = pd.read_csv("ppd_model/eval/eval_data.csv",sep='\t',names=data_cols)
total_train = pd.concat([train_data,eval_data],ignore_index=True)
train_num = len(total_train)

# The test file is read with the same names minus 'label'. Derive that list
# instead of calling data_cols.remove('label'): mutating the shared list made
# this cell fail (or silently misalign the train columns) when run a second time.
test_cols = [col for col in data_cols if col != 'label']
test_data = pd.read_csv("ppd_model/test/test_data.csv",sep='\t',names=test_cols)

total_data = pd.concat([total_train,test_data],ignore_index=True).drop('label',axis=1)
del train_data,eval_data,test_data

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from scipy.stats import kurtosis
import time
import warnings
warnings.filterwarnings('ignore')

# Bag-of-words counts over the `tag_list` text column of every row (train + eval + test).
# min_df=10 drops terms seen in fewer than 10 rows; max_df=0.9 drops terms seen in
# more than 90% of rows. The result is a sparse row-by-term count matrix.
tag_vectorizer = CountVectorizer(min_df=10, max_df=0.9)
tag_cv = tag_vectorizer.fit_transform(total_data['tag_list'])

In [68]:
# Remove the raw tag text in place; its information now lives in `tag_cv`.
# This must run after the CountVectorizer cell, which still reads this column.
del total_data['tag_list']

In [69]:
# Detach the target from the labelled frame: `pop` removes the column in place
# and returns it, so `total_train` is left without 'label'.
train_label = total_train.pop('label').values

In [70]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode each string-valued categorical column. A fresh encoder is fitted
# per column on the combined train + eval + test rows, so every category seen
# anywhere gets a code. After the loop `le` holds the encoder fitted on 'id_city'.
for encoded_col in ('gender', 'cell_province', 'id_province', 'id_city'):
    le = LabelEncoder()
    total_data[encoded_col] = le.fit_transform(total_data[encoded_col])


In [55]:
# Names of the columns that are NOT excluded from the log transform: everything
# except the label/id columns and the three explicit column groups.
# NOTE(review): cat_cols / no_log_cols / unused_cols are defined in a cell that
# appears further down in this export — that cell has to be run first.
_log_excluded = ['label','user_id','listing_id']+cat_cols+no_log_cols+unused_cols
log_cols = [name for name in total_data.columns if name not in _log_excluded]

In [72]:
# Column groups consulted by the preprocessing cells.
#
# no_log_cols - numeric columns left on their original scale (skipped by the
#               log(x + 1) transform in data_preprocess).
# unused_cols - also skipped by the log transform. NOTE(review): in the cells
#               shown here these columns are never dropped from the feature
#               frame, only exempted from the transform — confirm that
#               "unused" is really what happens to them.
# cat_cols    - skipped by the log transform and one-hot encoded via pd.get_dummies.
#
# NOTE(review): the groups overlap: 'min_rate_12m' and 'min_term_6m' appear in both
# unused_cols and cat_cols, and 'prepay_days_max' appears in both no_log_cols and
# cat_cols. Membership tests are unaffected, but the intent should be confirmed.
no_log_cols = ['age','default_amt_max','default_amt_mean','default_amt_min','default_times','default_times_pct',
              'first_prepay_days_mean','normal_times','normal_times_pct','prepay_days_max','prepay_days_mean',
              'prepay_time_pct','max_month_diff','reg_interval','mean_rate_12m','mean_rate_24m','mean_rate_6m','mean_term_12m',
              'mean_term_24m','mean_term_6m','last_1_bhmonth_type1_dawn_click_pct','last_3_bhmonth_type1_dawn_click_pct',
              'last_6_bhmonth_type1_dawn_click_pct','last_6_bhmonth_type23_dawn_click_pct','last_12_bhmonth_type1_dawn_click_pct',
              'last_12_bhmonth_type23_dawn_click_pct','tag_cnt','tag_list','max_interval']
# 'max_rate_24m' used to be listed twice here; the duplicate entry was removed.
unused_cols = ['max_rate_24m','max_rate_6m','max_term_12m','max_term_24m','min_rate_12m',
               'min_term_6m', 'min_rate_6m','min_term_12m','min_term_24m']
cat_cols = ['due_date_day','cell_change','max_rate_12m',
           'max_term_6m','min_rate_12m','min_rate_24m','min_term_6m','cell_province',
            'id_province','id_city','gender','first_prepay_days_max','first_prepay_days_min','first_prepay_days_mode',
            'prepay_days_max','prepay_days_mode','due_date_weekday']
def data_preprocess(df, skip_cols=None):
    """Return a copy of ``df`` with log(1 + x) applied to the non-excluded columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified; previously the transform was applied
        in place, so running the calling cell twice logged the data twice.
    skip_cols : iterable of str, optional
        Column names to leave untouched. When omitted, the historical default
        is used: 'label', 'user_id', 'listing_id' plus the module-level
        ``unused_cols``, ``no_log_cols`` and ``cat_cols`` lists.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order.
    """
    if skip_cols is None:
        skip_cols = ['label','user_id','listing_id']+unused_cols+no_log_cols+cat_cols
    skipped = set(skip_cols)

    out = df.copy()
    for col in out.columns:
        if col not in skipped:
            # Vectorised, and log1p stays accurate for values close to zero
            # where log(x + 1) would round to 0.
            out[col] = np.log1p(out[col])

    return out

In [73]:
# Apply the log(x + 1) transform to every column of the combined frame that is
# not on one of the exclusion lists used by data_preprocess.
total_features = data_preprocess(total_data)

In [None]:
# One-hot encode the categorical columns, append the tag count matrix and split
# the rows back into labelled (first `train_num`) and test parts.
#
# Intermediate results get their own names instead of overwriting
# `total_features`: rebinding it to a sparse matrix made this cell fail on a
# second run, because pd.get_dummies then received a matrix instead of a frame.
#
# NOTE(review): nothing in the visible cells removes 'user_id' / 'listing_id'
# from the frame, so they presumably enter the matrix as numeric features
# (and are cast to float32) — confirm that is intended.
features_onehot = pd.get_dummies(total_features, columns=cat_cols)
feature_matrix = sparse.hstack((features_onehot.values, tag_cv), format='csr', dtype='float32')
del features_onehot  # dense one-hot frame is no longer needed once stacked
train_values, test_values = feature_matrix[:train_num], feature_matrix[train_num:]

In [83]:
# Per-row amounts for the labelled rows, as plain NumPy arrays (both are `.values`).
# NOTE(review): `amt_labels` is read from the same 'due_amt' column as
# `train_due_amt_df`, so any error metric comparing a prediction against
# `amt_labels` is really comparing against the due amount — confirm whether a
# separate repayment-amount column was intended here.
from lightgbm.sklearn import LGBMClassifier
train_due_amt_df = total_train['due_amt'].values
amt_labels = total_train['due_amt'].values

In [None]:
# 5-fold stratified, shuffled split with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)

# LightGBM multiclass configuration (33 classes, up to 10000 trees; the fit call
# decides where to stop).
# NOTE(review): per the LightGBM parameter docs `feature_fraction` and
# `colsample_bytree` are aliases of one setting but are given different values
# here (0.8 vs 0.7), and `bagging_fraction` only takes effect together with a
# non-zero `bagging_freq` — confirm which values are actually in force.
lgb_clf_params = dict(
    num_leaves=50,
    num_threads=8,
    max_bin=150,
    min_data_in_leaf=500,
    objective='multiclass',
    learning_rate=0.04,
    boosting='gbdt',
    feature_fraction=0.8,
    bagging_fraction=0.8,
    metric='multi_logloss',
    lambda_l1=0.2,
    colsample_bytree=0.7,
    lambda_l2=0.2,
    verbosity=-1,
    random_state=2019,
    num_class=33,
    n_estimators=10000,
)
clf = LGBMClassifier(**lgb_clf_params)

# Metrics used for the CV summary at the bottom of this cell; they were never
# imported anywhere else in the notebook.
from sklearn.metrics import accuracy_score, log_loss

# Out-of-fold containers: predicted amount per labelled row, class probabilities
# per labelled row, and fold-averaged class probabilities for the test rows.
amt_oof = np.zeros(train_num)
prob_oof = np.zeros((train_num, 33))
test_pred_prob = np.zeros((test_values.shape[0], 33))
for fold, (trn_idx, val_idx) in enumerate(skf.split(train_values, train_label)):
    print(fold, 'fold...')
    t = time.time()
    trn_x, trn_y = train_values[trn_idx], train_label[trn_idx]
    val_x, val_y = train_values[val_idx], train_label[val_idx]
    val_repay_amt = amt_labels[val_idx]
    val_due_amt = train_due_amt_df[val_idx]

    clf.fit(
        trn_x, trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        early_stopping_rounds=500, verbose=50
    )
    # shape = (-1, 33)
    val_pred_prob_everyday = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
    prob_oof[val_idx] = val_pred_prob_everyday
    # Probability the model assigned to each validation row's true class.
    # (The original line referenced a mistyped variable name and raised NameError.)
    val_pred_prob_today = val_pred_prob_everyday[np.arange(len(val_y)), val_y]
    # `val_due_amt` is already a NumPy array (it comes from `.values`), so it is
    # multiplied directly; indexing it with ['due_amt'] raised an IndexError.
    val_pred_repay_amt = val_due_amt * val_pred_prob_today
    print('val rmse:', np.sqrt(mean_squared_error(val_repay_amt, val_pred_repay_amt)))
    amt_oof[val_idx] = val_pred_repay_amt
    # Average the test predictions over the folds.
    test_pred_prob += clf.predict_proba(test_values, num_iteration=clf.best_iteration_) / skf.n_splits

    print('runtime: {}\n'.format(time.time() - t))

# `train_label` is the target the folds were built from; the previously used
# name `clf_labels` is not defined anywhere in the notebook.
print('cv logloss:', log_loss(train_label, prob_oof))
print('cv acc:', accuracy_score(train_label, np.argmax(prob_oof, axis=1)))

In [None]:
# Ad-hoc inspection: class probabilities for the test rows from `clf` as it is
# currently fitted, divided by the number of folds. The value is only displayed,
# not stored.
clf.predict_proba(test_values, num_iteration=clf.best_iteration_) / skf.n_splits

In [None]:
# Write the underlying booster of `clf` to a LightGBM text model file.
# NOTE(review): `clf` is re-fitted inside the CV loop on every fold, so this
# presumably saves only the model from the last fold — confirm that is intended.
clf.booster_.save_model("lightgbm_model.txt")

In [None]:
# Build the submission: attach the 33 fold-averaged class probabilities to each
# test listing, then for every (listing, repay_date) row of the template pick the
# probability of the class equal to the day offset and scale the due amount by it.
# NOTE(review): `sub` is not defined in the cells shown here; it is expected to
# hold one row per test listing with 'listing_id', 'auditing_date' and 'due_amt'
# (those are the columns this cell reads after the merge) — confirm its origin.
prob_cols = ['prob_{}'.format(k) for k in range(33)]
for k, prob_col in enumerate(prob_cols):
    sub[prob_col] = test_pred_prob[:, k]

submission_template = pd.read_csv('dataset/submission.csv', parse_dates=['repay_date'])
sub_example = submission_template.merge(sub, on='listing_id', how='left')

# Day offset of each candidate repay date from the auditing date; used as class index.
day_offset = sub_example['repay_date'] - sub_example['auditing_date']
sub_example['days'] = day_offset.dt.days

# shape = (-1, 33)
test_prob = sub_example[prob_cols].values
test_labels = sub_example['days'].values
test_prob = [row_probs[day] for row_probs, day in zip(test_prob, test_labels)]

sub_example['repay_amt'] = sub_example['due_amt'] * test_prob
sub_example[['listing_id', 'repay_date', 'repay_amt']].to_csv('sub.csv', index=False)

In [10]:
# Fit the importance model on the labelled features (identifier columns removed)
# and persist the result; "feature_importance.csv" is the file an earlier cell
# reads back with pd.read_csv.
# NOTE(review): `train_features` is not defined in the cells shown here — confirm
# where it comes from before relying on a fresh Restart & Run All.
imp_df = get_feature_importances(train_features.drop(['user_id','listing_id'],axis=1),train_label,cat_cols,seed=2019)
imp_df.to_csv("feature_importance.csv")

In [64]:
data_cols= ['label',
'user_id',
'listing_id',
'due_amt',
'due_date_day',
'default_amt_max',
'default_amt_mean',
'default_amt_min',
'default_times',
'default_times_pct',
'first_prepay_days_max',
'first_prepay_days_mean',
'first_prepay_days_min',
'first_prepay_days_mode',
'first_prepay_due_amt_max',
'first_prepay_due_amt_mean',
'first_prepay_due_amt_min',
'max_continue_repay_times_once',
'mean_coninue_repay_times_once',
'normal_amt_max',
'normal_amt_mean',
'normal_amt_min',
'normal_times',
'normal_times_pct',
'prepay_days_max',
'prepay_days_mean',
'prepay_days_mode',
'prepay_time',
'prepay_time_pct',
'weighted_amt_max',
'weighted_amt_mean',
'weighted_amt_min',
'last_12_bhmonth_type1_cnt',
'last_12_bhmonth_type1_dawn_click_pct',
'last_12_bhmonth_type1_max_click_cnt_day',
'last_12_bhmonth_type1_max_cnt_day',
'last_12_bhmonth_type1_max_cnt_day_dawn',
'last_12_bhmonth_type1_max_cnt_hour',
'last_12_bhmonth_type1_mean_cnt_day',
'last_12_bhmonth_type1_mean_cnt_day_dawn',
'last_12_bhmonth_type1_mean_cnt_hour',
'last_12_bhmonth_type1_min_cnt_day',
'last_12_bhmonth_type1_std_cnt_day',
'last_12_bhmonth_type1_std_cnt_day_dawn',
'last_12_bhmonth_type1_std_cnt_hour',
'last_12_bhmonth_type23_cnt',
'last_12_bhmonth_type23_dawn_click_pct',
'last_12_bhmonth_type23_max_click_cnt_day',
'last_12_bhmonth_type23_max_cnt_day',
'last_12_bhmonth_type23_max_cnt_day_dawn',
'last_12_bhmonth_type23_max_cnt_hour',
'last_12_bhmonth_type23_mean_cnt_day',
'last_12_bhmonth_type23_mean_cnt_day_dawn',
'last_12_bhmonth_type23_mean_cnt_hour',
'last_12_bhmonth_type23_min_cnt_day',
'last_12_bhmonth_type23_std_cnt_day',
'last_12_bhmonth_type23_std_cnt_day_dawn',
'last_12_bhmonth_type23_std_cnt_hour',
'last_1_bhmonth_type1_cnt',
'last_1_bhmonth_type1_dawn_click_pct',
'last_1_bhmonth_type1_max_click_cnt_day',
'last_1_bhmonth_type1_max_cnt_day',
'last_1_bhmonth_type1_max_cnt_day_dawn',
'last_1_bhmonth_type1_max_cnt_hour',
'last_1_bhmonth_type1_mean_cnt_day',
'last_1_bhmonth_type1_mean_cnt_day_dawn',
'last_1_bhmonth_type1_mean_cnt_hour',
'last_1_bhmonth_type1_min_cnt_day',
'last_1_bhmonth_type1_std_cnt_day',
'last_1_bhmonth_type1_std_cnt_day_dawn',
'last_1_bhmonth_type1_std_cnt_hour',
'last_1_bhmonth_type23_cnt',
'last_1_bhmonth_type23_dawn_click_pct',
'last_1_bhmonth_type23_max_click_cnt_day',
'last_1_bhmonth_type23_max_cnt_day',
'last_1_bhmonth_type23_max_cnt_day_dawn',
'last_1_bhmonth_type23_max_cnt_hour',
'last_1_bhmonth_type23_mean_cnt_day',
'last_1_bhmonth_type23_mean_cnt_day_dawn',
'last_1_bhmonth_type23_mean_cnt_hour',
'last_1_bhmonth_type23_min_cnt_day',
'last_1_bhmonth_type23_std_cnt_day',
'last_1_bhmonth_type23_std_cnt_day_dawn',
'last_1_bhmonth_type23_std_cnt_hour',
'last_3_bhmonth_type1_cnt',
'last_3_bhmonth_type1_dawn_click_pct',
'last_3_bhmonth_type1_max_click_cnt_day',
'last_3_bhmonth_type1_max_cnt_day',
'last_3_bhmonth_type1_max_cnt_day_dawn',
'last_3_bhmonth_type1_max_cnt_hour',
'last_3_bhmonth_type1_mean_cnt_day',
'last_3_bhmonth_type1_mean_cnt_day_dawn',
'last_3_bhmonth_type1_mean_cnt_hour',
'last_3_bhmonth_type1_min_cnt_day',
'last_3_bhmonth_type1_std_cnt_day',
'last_3_bhmonth_type1_std_cnt_day_dawn',
'last_3_bhmonth_type1_std_cnt_hour',
'last_3_bhmonth_type23_cnt',
'last_3_bhmonth_type23_dawn_click_pct',
'last_3_bhmonth_type23_max_click_cnt_day',
'last_3_bhmonth_type23_max_cnt_day',
'last_3_bhmonth_type23_max_cnt_day_dawn',
'last_3_bhmonth_type23_max_cnt_hour',
'last_3_bhmonth_type23_mean_cnt_day',
'last_3_bhmonth_type23_mean_cnt_day_dawn',
'last_3_bhmonth_type23_mean_cnt_hour',
'last_3_bhmonth_type23_min_cnt_day',
'last_3_bhmonth_type23_std_cnt_day',
'last_3_bhmonth_type23_std_cnt_day_dawn',
'last_3_bhmonth_type23_std_cnt_hour',
'last_6_bhmonth_type1_cnt',
'last_6_bhmonth_type1_dawn_click_pct',
'last_6_bhmonth_type1_max_click_cnt_day',
'last_6_bhmonth_type1_max_cnt_day',
'last_6_bhmonth_type1_max_cnt_day_dawn',
'last_6_bhmonth_type1_max_cnt_hour',
'last_6_bhmonth_type1_mean_cnt_day',
'last_6_bhmonth_type1_mean_cnt_day_dawn',
'last_6_bhmonth_type1_mean_cnt_hour',
'last_6_bhmonth_type1_min_cnt_day',
'last_6_bhmonth_type1_std_cnt_day',
'last_6_bhmonth_type1_std_cnt_day_dawn',
'last_6_bhmonth_type1_std_cnt_hour',
'last_6_bhmonth_type23_cnt',
'last_6_bhmonth_type23_dawn_click_pct',
'last_6_bhmonth_type23_max_click_cnt_day',
'last_6_bhmonth_type23_max_cnt_day',
'last_6_bhmonth_type23_max_cnt_day_dawn',
'last_6_bhmonth_type23_max_cnt_hour',
'last_6_bhmonth_type23_mean_cnt_day',
'last_6_bhmonth_type23_mean_cnt_day_dawn',
'last_6_bhmonth_type23_mean_cnt_hour',
'last_6_bhmonth_type23_min_cnt_day',
'last_6_bhmonth_type23_std_cnt_day',
'last_6_bhmonth_type23_std_cnt_day_dawn',
'last_6_bhmonth_type23_std_cnt_hour',
'max_month_diff',
'mean_month_diff',
'min_month_diff',
'sum_month_diff',
'gender',
'age',
'cell_province',
'id_province',
'id_city',
'max_interval',
'reg_interval',
'cell_change',
'listing_num_12m',
'listing_num_24m',
'listing_num_6m',
'max_interset_per_12m',
'max_interset_per_24m',
'max_interset_per_6m',
'max_listing_per_day_12m',
'max_listing_per_day_24m',
'max_listing_per_day_6m',
'max_listing_per_month_12m',
'max_listing_per_month_24m',
'max_listing_per_month_6m',
'max_principal_12m',
'max_principal_24m',
'max_principal_6m',
'max_rate_12m',
'max_rate_24m',
'max_rate_6m',
'max_term_12m',
'max_term_24m',
'max_term_6m',
'mean_interset_per_12m',
'mean_interset_per_24m',
'mean_interset_per_6m',
'mean_principal_12m',
'mean_principal_24m',
'mean_principal_6m',
'mean_rate_12m',
'mean_rate_24m',
'mean_rate_6m',
'mean_term_12m',
'mean_term_24m',
'mean_term_6m',
'min_interset_per_12m',
'min_interset_per_24m',
'min_interset_per_6m',
'min_principal_12m',
'min_principal_24m',
'min_principal_6m',
'min_rate_12m',
'min_rate_24m',
'min_rate_6m',
'min_term_12m',
'min_term_24m',
'min_term_6m',
'sum_interset_per_12m',
'sum_interset_per_24m',
'sum_interset_per_6m',
'sum_principal_12m',
'sum_principal_24m',
'sum_principal_6m',
'tag_cnt',
'tag_list',
'due_date_weekday']