In [63]:
import os, sys, warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm_notebook as tqdm

import pandas as pd
import numpy as np
from datetime import datetime as dt
import re

import lightgbm as lgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from pygam import LinearGAM, s, f

import optuna
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedShuffleSplit as SSsplit
from sklearn.metrics import mean_squared_error as mse
def rmse(pred, true) : return np.sqrt(mse(true, pred))

import matplotlib.pyplot as plt
import seaborn as sns

## data Load

In [80]:
write_path = '../'

In [2]:
read_path = '../dataset/fin/'
write_path = '../dataset/result/'
train = pd.read_csv(read_path+'train_fe.csv')
test = pd.read_csv(read_path+'test_fe.csv')

In [3]:
files = os.listdir(read_path)
newFiles = []
histFiles = []
transFiles = []
isFile = re.compile(r'.*[.]csv')
isNew = re.compile(r'.*_new[.]csv')
isHist = re.compile(r'.*_hist[.]csv')
isTrans = re.compile(r'.*_trans[.]csv')
for file in tqdm(files):
    if re.match(isFile, file):
        locals()[file[:-4]] = pd.read_csv(read_path+file)
        if re.match(isNew, file): newFiles.append(file[:-4])
        elif re.match(isHist, file): histFiles.append(file[:-4])
        elif re.match(isTrans, file): transFiles.append(file[:-4])
        else : print('is it proper file name? : {}'.format(file))

HBox(children=(IntProgress(value=0, max=54), HTML(value='')))

is it proper file name? : test_fe.csv
is it proper file name? : train_fe.csv



In [4]:
tempCols = ['merchant_try_'+col if col!='card_id' else col for col in locals()['mertry_trans'].columns.tolist()]
locals()['mertry_trans'].columns = tempCols
tempCols = ['merchant_visit_'+col if col!='card_id' else col for col in locals()['mervisit_trans'].columns.tolist()]
locals()['mervisit_trans'].columns = tempCols

In [8]:
train_trans = train
transFiles.remove('sm_test_trans',)
for file in tqdm(transFiles):
    train_trans = train_trans.merge(locals()[file], how='left', on='card_id')
    if train_trans.shape[0] != train.shape[0] : print('it is wrong : {} : {}'.format(train_trans.shape, file))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))




In [9]:
test_trans = test
transFiles.append('sm_test_trans')
transFiles.remove('sm_train_trans',)
for file in tqdm(transFiles):
    test_trans = test_trans.merge(locals()[file], how='left', on='card_id')
    if test_trans.shape[0] != test.shape[0] : print('it is wrong : {} : {}'.format(test_trans.shape, file))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))




In [6]:
sm_test_trans.shape, regular_FE_trans.shape

((123623, 6), (325540, 8))

In [10]:
modelCols = pd.read_csv('../dataset/modelCols/trans_401.csv')['modelCols'].values.tolist()
modelCols+=sm_test_trans.columns[1:].tolist()
modelCols+=regular_FE_trans.columns[1:].tolist()

catCols = []
isFeature = re.compile(r'feature_[\d]')
isModeKey = re.compile(r'.*_modeKey')

for col in modelCols:
    if re.match(isFeature, col): catCols.append(col)
    elif re.match(isModeKey, col): catCols.append(col)

catCols2 = []
for col in catCols:
    if train_trans[col].isna().sum()==0:
        catCols2.append(modelCols.index(col))

## CV Data

In [113]:
split_rate = test.shape[0]/(train.shape[0]+test.shape[0])
split_y = train['outliers']
SSspliter = SSsplit(3, split_rate)

# train_trans2 = train_trans[modelCols].replace([np.inf, -np.inf], np.nan)
# train_trans2 = train_trans2.dropna(axis = 1)
for i, (train_index, test_index) in enumerate(SSspliter.split(train, split_y)):
    locals()['x_train_'+str(i)] = train_trans[modelCols].iloc[train_index]
    locals()['x_validate_'+str(i)] = train_trans[modelCols].iloc[test_index]
    locals()['y_train_'+str(i)] = train_trans['target'].iloc[train_index]
    locals()['y_validate_'+str(i)] = train_trans['target'].iloc[test_index]

## CV

In [116]:
params = {'num_leaves': 935, 
          'max_depth': 245, 
          'learning_rate': 0.040630726310134826, 
          'num_estimators': 1436, 
          'subsample_for_bin': 6342, 
          'min_split_gain': 0.0004120715698725839, 
          'min_child_samples': 200, 
          'reg_labmda': 0.30042871016116973, 
          'drop_rate': 0.3708866605608058, 
          'boosting': 'gbdt',
    'objective': 'regression',
    'num_threads': 8,
    }
train_score = 0
validate_score = 0
for i in tqdm(range(3)):
    lgb_data = lgb.Dataset(globals()['x_train_'+str(i)], label = globals()['y_train_'+str(i)].values, categorical_feature=catCols)
    bst = lgb.train(params, lgb_data)
    locals()['lgb_pred_'+str(i)] = bst.predict(globals()['x_train_'+str(i)])
    locals()['lgb_validate_'+str(i)] = bst.predict(globals()['x_validate_'+str(i)])
    train_score += rmse(locals()['lgb_pred_'+str(i)], globals()['y_train_'+str(i)])
    validate_score += rmse(locals()['lgb_validate_'+str(i)], globals()['y_validate_'+str(i)])
print('socres', train_score/3, validate_score/3, sep=' : ')

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

socres : 3.196229964215437 : 3.6811556138959425


In [119]:
params = {'num_leaves': 935, 
          'max_depth': 245, 
          'learning_rate': 0.040630726310134826, 
          'num_estimators': 1000, 
          'subsample_for_bin': 6342, 
          'min_split_gain': 0.0004120715698725839, 
          'min_child_samples': 200, 
          'reg_labmda': 0.30042871016116973, 
          'drop_rate': 0.3708866605608058, 
          'boosting': 'gbdt',
    'objective': 'regression',
    'num_threads': 8,
    }
train_score = 0
validate_score = 0
for i in tqdm(range(3)):
    lgb_data = lgb.Dataset(globals()['x_train_'+str(i)], label = globals()['y_train_'+str(i)].values, categorical_feature=catCols)
    bst = lgb.train(params, lgb_data)
    locals()['lgb_pred_'+str(i)] = bst.predict(globals()['x_train_'+str(i)])
    locals()['lgb_validate_'+str(i)] = bst.predict(globals()['x_validate_'+str(i)])
    train_score += rmse(locals()['lgb_pred_'+str(i)], globals()['y_train_'+str(i)])
    validate_score += rmse(locals()['lgb_validate_'+str(i)], globals()['y_validate_'+str(i)])
print('socres', train_score/3, validate_score/3, sep=' : ')

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

socres : 3.196229964215437 : 3.6811556138959425


## xgb

In [118]:
model = XGBRegressor(n_estimators=900,
              importance_type='gain',
              n_jobs=8,  
              silent=True, 
              objective='reg:linear',)
train_score = 0
validate_score = 0
for i in tqdm(range(3)):
    model.fit(globals()['x_train_'+str(i)], globals()['y_train_'+str(i)].values)
    locals()['xgb_pred_'+str(i)] = model.predict(globals()['x_train_'+str(i)])
    locals()['xgb_validate_'+str(i)] = model.predict(globals()['x_validate_'+str(i)])
    train_score += rmse(locals()['xgb_pred_'+str(i)], globals()['y_train_'+str(i)])
    validate_score += rmse(locals()['xgb_validate_'+str(i)], globals()['y_validate_'+str(i)])
print('socres', train_score/3, validate_score/3, sep=' : ')

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

socres : 3.3547971798718113 : 3.7101222066304764


## Cat Boost

In [117]:
model = CatBoostRegressor(iterations=878, learning_rate=0.042356140476210384,
#                         depth=trial.suggest_int('depth', 3, 400),
                        silent=True,
#                         early_stopping_rounds=10,
                        loss_function='RMSE',
                        thread_count=8,)
train_score = 0
validate_score = 0
for i in tqdm(range(3)):
    train_pool = Pool(globals()['x_train_'+str(i)], globals()['y_train_'+str(i)].values, cat_features=catCols2)
    test_pool = Pool(globals()['x_validate_'+str(i)], cat_features=catCols2) 
    model.fit(train_pool, silent=True)
    locals()['cat_pred_'+str(i)] = model.predict(train_pool)
    locals()['cat_validate_'+str(i)] = model.predict(test_pool)
    train_score += rmse(locals()['cat_pred_'+str(i)], globals()['y_train_'+str(i)])
    validate_score += rmse(locals()['cat_validate_'+str(i)], globals()['y_validate_'+str(i)])
print('socres : ', train_score/3, validate_score/3, sep=' : ')

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

socres : :3.5310458737970336:3.68123711406974


In [120]:
model = CatBoostRegressor(iterations=1078, learning_rate=0.052356140476210384,
#                         depth=trial.suggest_int('depth', 3, 400),
                        silent=True,
#                         early_stopping_rounds=10,
                        loss_function='RMSE',
                        thread_count=8,)
train_score = 0
validate_score = 0
for i in tqdm(range(3)):
    train_pool = Pool(globals()['x_train_'+str(i)], globals()['y_train_'+str(i)].values, cat_features=catCols2)
    test_pool = Pool(globals()['x_validate_'+str(i)], cat_features=catCols2) 
    model.fit(train_pool, silent=True)
    locals()['cat_pred_'+str(i)] = model.predict(train_pool)
    locals()['cat_validate_'+str(i)] = model.predict(test_pool)
    train_score += rmse(locals()['cat_pred_'+str(i)], globals()['y_train_'+str(i)])
    validate_score += rmse(locals()['cat_validate_'+str(i)], globals()['y_validate_'+str(i)])
print('socres : ', train_score/3, validate_score/3, sep=' : ')

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

socres :  : 3.457500047665159 : 3.6805167552495632


## Stacking + Blending

In [121]:
for i in tqdm(range(3)):
    print('-'*10, str(i), 'fold', '-'*10)
    pred = (locals()['lgb_validate_'+str(i)] + locals()['cat_validate_'+str(i)] + locals()['xgb_validate_'+str(i)])/3
    print('fin score', rmse(pred, locals()['y_validate_'+str(i)].values), sep=' :' )

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

---------- 0 fold ----------
fin score :3.6716416514523296
---------- 1 fold ----------
fin score :3.680018198965752
---------- 2 fold ----------
fin score :3.669020747529061


In [122]:
%%time
ela = ElasticNet()
lasso = Lasso()
ridge = Ridge()
linear = LinearRegression()
for i in tqdm(range(3)):
    print('-'*10, str(i), 'fold', '-'*10)
    preds = []
    preds.append(locals()['lgb_pred_'+str(i)])
    preds.append(locals()['xgb_pred_'+str(i)])
    preds.append(locals()['cat_pred_'+str(i)])
    preds = np.array(preds).T
    print('train score', rmse(locals()['lgb_pred_'+str(i)], locals()['y_train_'+str(i)].values),
          rmse(locals()['xgb_pred_'+str(i)], locals()['y_train_'+str(i)].values),
          rmse(locals()['cat_pred_'+str(i)], locals()['y_train_'+str(i)].values), sep=' : ')

    ela.fit(preds, locals()['y_train_'+str(i)].values)
    lasso.fit(preds, locals()['y_train_'+str(i)].values)
    ridge.fit(preds, locals()['y_train_'+str(i)].values)
    linear.fit(preds, locals()['y_train_'+str(i)].values)
    
    validate = []
    validate.append(locals()['lgb_validate_'+str(i)])
    validate.append(locals()['xgb_validate_'+str(i)])
    validate.append(locals()['cat_validate_'+str(i)])
    validate = np.array(validate).T
    print('validate score', rmse(locals()['lgb_validate_'+str(i)], locals()['y_validate_'+str(i)].values),
          rmse(locals()['xgb_validate_'+str(i)], locals()['y_validate_'+str(i)].values),
          rmse(locals()['cat_validate_'+str(i)], locals()['y_validate_'+str(i)].values), sep=' : ')
    
    ela_pred = ela.predict(validate)
    lasso_pred = ela.predict(validate)
    ridge_pred = ela.predict(validate)
    linear_pred = ela.predict(validate)
    
    ela_score = rmse(ela_pred, locals()['y_validate_'+str(i)].values)
    lasso_score = rmse(lasso_pred, locals()['y_validate_'+str(i)].values)
    ridge_score = rmse(ridge_pred, locals()['y_validate_'+str(i)].values)
    linear_score = rmse(linear_pred, locals()['y_validate_'+str(i)].values)
    print('linear score', ela_score, lasso_score, ridge_score, linear_score, sep=' : ')
    
    total_score = 40-ela_score-lasso_score-ridge_score-linear_score
    fin_pred = (ela_pred*(10-ela_score)+lasso_pred*(10-lasso_score)+ridge_pred*(10-ridge_score)+linear_pred*(10-linear_score))/total_score
    print('fin score', rmse(fin_pred, locals()['y_validate_'+str(i)].values), sep=' : ')
    
    fin_pred = (ela_pred+lasso_pred+ridge_pred+linear_pred)/4
    print('fin score', rmse(fin_pred, locals()['y_validate_'+str(i)].values), sep=' : ')

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

---------- 0 fold ----------
train score : 3.1941357686678176 : 3.3569454649697255 : 3.4500180816402426
validate score : 3.6803091602683757 : 3.710736799020011 : 3.6755508261961336
linear score : 3.700116778790319 : 3.700116778790319 : 3.700116778790319 : 3.700116778790319
fin score : 3.700116778790319
fin score : 3.700116778790319
---------- 1 fold ----------
train score : 3.1921141030482763 : 3.3518444847242046 : 3.4601880103823457
validate score : 3.687976903054129 : 3.7150565571580563 : 3.6893007498241457
linear score : 3.709841499363369 : 3.709841499363369 : 3.709841499363369 : 3.709841499363369
fin score : 3.709841499363369
fin score : 3.709841499363369
---------- 2 fold ----------
train score : 3.2024400209302177 : 3.3556015899215033 : 3.46229405097289
validate score : 3.6751807783653216 : 3.7045732637133617 : 3.67669868972841
linear score : 3.6891150775985415 : 3.6891150775985415 : 3.6891150775985415 : 3.6891150775985415
fin score : 3.6891150775985415
fin score : 3.689115077598

## Testing

In [11]:
params = {'num_leaves': 935, 
          'max_depth': 245, 
          'learning_rate': 0.040630726310134826, 
          'num_estimators': 1436, 
          'subsample_for_bin': 6342, 
          'min_split_gain': 0.0004120715698725839, 
          'min_child_samples': 200, 
          'reg_labmda': 0.30042871016116973, 
          'drop_rate': 0.3708866605608058, 
          'boosting': 'gbdt',
    'objective': 'regression',
    'num_threads': 8,
    }
lgb_data = lgb.Dataset(train_trans[modelCols], label = train_trans['target'].values, categorical_feature=catCols)
bst = lgb.train(params, lgb_data)
lgb_train_pred = bst.predict(train_trans[modelCols])
lgb_test_pred = bst.predict(test_trans[modelCols])
print('lgb socres', rmse(lgb_train_pred, train_trans['target'].values), sep=' : ')



lgb socres : 3.1766619527037068


In [12]:
%%time
model = CatBoostRegressor(iterations=878, 
                          learning_rate=0.042356140476210384,
                        silent=True,
                        early_stopping_rounds=10,
                        loss_function='RMSE',
                        thread_count=8,)
train_pool = Pool(train_trans[modelCols], train_trans['target'].values, cat_features=catCols2)
test_pool = Pool(test_trans[modelCols], cat_features=catCols2) 
model.fit(train_pool, silent=True)
cat_train_pred = model.predict(train_pool)
cat_test_pred = model.predict(test_pool)
# print('cat socres : ', rmse(train_pool, train_trans['target'].values), sep=' : ') 

Wall time: 3min 20s


In [18]:
%%time
model = XGBRegressor(n_estimators=900,n_jobs=8,objective='reg:linear',)
model.fit(train_trans[modelCols], train_trans['target'].values)
xgb_train_pred = model.predict(train_trans[modelCols])
xgb_test_pred = model.predict(test_trans[modelCols])
print('socres : ', rmse(xgb_train_pred, train_trans['target'].values), sep=':')

socres : :3.461744869726393
Wall time: 9min 35s


## outlier

In [20]:
params = {'num_leaves': 10, 
          'max_depth': 194, 
          'learning_rate': 0.04398858791379669, 
          'num_estimators': 757, 
          'subsample_for_bin': 9697, 
          'min_split_gain': 0.0003616307235817724, 
          'min_child_samples': 50, 
#           'reg_labmda': 0.30042871016116973, 
          'drop_rate': 0.3856214959567157, 
          'boosting': 'gbdt',
    'objective': 'binary',
    'num_threads': 8,
    }
lgb_data = lgb.Dataset(train_trans[modelCols], label = train_trans['outliers'].values, categorical_feature=catCols)
bst = lgb.train(params, lgb_data)
lgb_train_pred_outlier = bst.predict(train_trans[modelCols])
lgb_test_pred_outlier = bst.predict(test_trans[modelCols])
print('lgb socres', rmse(lgb_train_pred_outlier, train_trans['outliers'].values), sep=' : ')



lgb socres : 0.09733246894934568


In [30]:
%%time
model = XGBClassifier(n_estimators=700,n_jobs=8,objective='binary:logistic',)
model.fit(train_trans[modelCols], train_trans['outliers'].values)
xgb_train_pred_outlier = model.predict(train_trans[modelCols])
xgb_test_pred_outlier = model.predict(test_trans[modelCols])
print('socres : ', rmse(xgb_train_pred_outlier, train_trans['outliers'].values), sep=':')

socres : :0.0993498479611383
Wall time: 8min 3s


In [31]:
%%time
model = CatBoostClassifier(iterations=878, 
                          learning_rate=0.042356140476210384,
                        silent=True,
#                         early_stopping_rounds=10,
                        loss_function='Logloss',
                        thread_count=8,)
train_pool = Pool(train_trans[modelCols], train_trans['outliers'].values, cat_features=catCols2)
test_pool = Pool(test_trans[modelCols], cat_features=catCols2) 
model.fit(train_pool, silent=True)
cat_train_pred_outlier = model.predict(train_pool)
cat_test_pred_outlier = model.predict(test_pool)
# print('cat socres : ', rmse(train_pool, train_trans['target'].values), sep=' : ') 

Wall time: 3min 37s


## Pure

In [54]:
params = {'num_leaves': 236, 
          'max_depth': 348, 
          'learning_rate': 0.051166673097082824, 
          'num_estimators': 1315, 
          'subsample_for_bin': 18011, 
          'min_split_gain': 0.0008656977198460191, 
          'min_child_samples': 165, 
          'reg_labmda': 0.16127074250244233, 
          'drop_rate': 0.49499130640699673, 
          'boosting': 'gbdt',
    'objective': 'regression',
    'num_threads': 8,
    }
lgb_data = lgb.Dataset(train_trans[train_trans['outliers']==0][modelCols], label = train_trans[train_trans['outliers']==0]['target'].values, categorical_feature=catCols)
bst = lgb.train(params, lgb_data)
lgb_train_pred_pure = bst.predict(train_trans[train_trans['outliers']==0][modelCols])
lgb_test_pred_pure = bst.predict(test_trans[modelCols])
print('lgb socres', rmse(lgb_train_pred_pure, train_trans[train_trans['outliers']==0]['target'].values), sep=' : ')

lgb socres : 1.4239913408255898


In [55]:
%%time
model = CatBoostRegressor(iterations=1078, 
                          learning_rate=0.052356140476210384,
                        silent=True,
                        early_stopping_rounds=10,
                        loss_function='RMSE',
                        thread_count=8,)
train_pool = Pool(train_trans[train_trans['outliers']==0][modelCols], train_trans[train_trans['outliers']==0]['target'].values, cat_features=catCols2)
test_pool = Pool(test_trans[modelCols], cat_features=catCols2) 
model.fit(train_pool, silent=True)
cat_train_pred_pure = model.predict(train_pool)
cat_test_pred_pure = model.predict(test_pool)
# print('cat socres : ', rmse(train_pool, train_trans['target'].values), sep=' : ') 

Wall time: 4min 11s


In [56]:
%%time
model = XGBRegressor(n_estimators=1100,n_jobs=8,objective='reg:linear',)
model.fit(train_trans[train_trans['outliers']==0][modelCols], train_trans[train_trans['outliers']==0]['target'].values)
xgb_train_pred_pure = model.predict(train_trans[train_trans['outliers']==0][modelCols])
xgb_test_pred_pure = model.predict(test_trans[modelCols])
print('socres : ', rmse(xgb_train_pred_pure, train_trans[train_trans['outliers']==0]['target'].values), sep=':')

socres : :1.5205947145641507
Wall time: 11min 41s


In [None]:
%%time
rf1 = RandomForestRegressor(n_estimators=1000, n_jobs=8,)

rf1.fit(train_trans[modelCols], train_trans['target'])

rf_train_pred = rf1.predict(train_trans[modelCols])
rf_test_pred = rf1.predict(test_trans[modelCols])

## outlier Blending

## 블랜딩 한 다음에 치환

In [127]:
test_pred = pd.DataFrame(test_trans['card_id'])
test_pred['pred'] = (lgb_test_pred+xgb_test_pred+cat_test_pred)/3
test_pred['pure'] = (lgb_test_pred_pure+xgb_test_pred_pure+cat_test_pred_pure)/3
test_pred['outlier'] = (lgb_test_pred_outlier+xgb_test_pred_outlier+cat_test_pred_outlier)/3

In [129]:
test_pred = test_pred.sort_values(by='outlier', ascending=False)

## 치환 한 다음에 블랜딩

In [194]:
def merge_pred(pred, outlier, pure):
    ret = pd.DataFrame(train_trans['card_id'])
    ret['pred'] = pred
    ret['outlier'] = outlier
    temp = pd.DataFrame(train_trans[train_trans['outliers']==0]['card_id'])
    temp['true'] = pure
    ret = ret.merge(temp, how='left', on='card_id')
#     ret['true'] = pure
    return ret

lgb_train_pred_all = merge_pred(lgb_train_pred, lgb_train_pred_outlier, lgb_train_pred_pure).sort_values(by='outlier', ascending=False)
xgb_train_pred_all = merge_pred(xgb_train_pred, xgb_train_pred_outlier, xgb_train_pred_pure).sort_values(by='outlier', ascending=False)
cat_train_pred_all = merge_pred(cat_train_pred, cat_train_pred_outlier, cat_train_pred_pure).sort_values(by='outlier', ascending=False)

# rf_train_pred_all = merge_pred(rf_train_pred, rf_train_pred_outlier, rf_train_pred_pure).sort_values(by='outlier', ascending=False)
# ela_train_pred_all = merge_pred(ela_train_pred, ela_train_pred_outlier, ela_train_pred_pure).sort_values(by='outlier', ascending=False)
# ridge_train_pred_all = merge_pred(ridge_train_pred, ridge_train_pred_outlier, ridge_train_pred_pure).sort_values(by='outlier', ascending=False)
# lasso_train_pred_all = merge_pred(lasso_train_pred, lasso_train_pred_outlier, lasso_train_pred_pure).sort_values(by='outlier', ascending=False)

def merge_pred(pred, outlier, pure):
    ret = pd.DataFrame(test_trans['card_id'])
    ret['pred'] = pred
    ret['outlier'] = outlier
    ret['true'] = pure
    return ret

lgb_test_pred_all = merge_pred(lgb_test_pred, lgb_test_pred_outlier, lgb_test_pred_pure).sort_values(by='outlier', ascending=False)
xgb_test_pred_all = merge_pred(xgb_test_pred, xgb_test_pred_outlier, xgb_test_pred_pure).sort_values(by='outlier', ascending=False)
cat_test_pred_all = merge_pred(cat_test_pred, cat_test_pred_outlier, cat_test_pred_pure).sort_values(by='outlier', ascending=False)

# rf_test_pred_all = merge_pred(rf_test_pred, rf_test_pred_outlier, rf_test_pred_pure).sort_values(by='outlier', ascending=False)
# ela_test_pred_all = merge_pred(ela_test_pred, ela_test_pred_outlier, ela_test_pred_pure).sort_values(by='outlier', ascending=False)
# ridge_test_pred_all = merge_pred(ridge_test_pred, ridge_test_pred_outlier, ridge_test_pred_pure).sort_values(by='outlier', ascending=False)
# lasso_test_pred_all = merge_pred(lasso_test_pred, lasso_test_pred_outlier, lasso_test_pred_pure).sort_values(by='outlier', ascending=False)


In [195]:
cut_off= int(train.shape[0]*.2)
lgb_train_pred_all['target'] = np.append(lgb_train_pred_all['pred'][:cut_off], lgb_train_pred_all['true'][cut_off:])
xgb_train_pred_all['target'] = np.append(xgb_train_pred_all['pred'][:cut_off], xgb_train_pred_all['true'][cut_off:])
cat_train_pred_all['target'] = np.append(cat_train_pred_all['pred'][:cut_off], cat_train_pred_all['true'][cut_off:])

# rf_train_pred_all['target'] = np.append(rf_train_pred_all['outlier'][:cut_off], rf_train_pred_all['pred'][cut_off:])
# ela_train_pred_all['target'] = np.append(ela_train_pred_all['outlier'][:cut_off], ela_train_pred_all['pred'][cut_off:])
# lasso_train_pred_all['target'] = np.append(lasso_train_pred_all['outlier'][:cut_off], lasso_train_pred_all['pred'][cut_off:])
# ridge_train_pred_all['target'] = np.append(ridge_train_pred_all['outlier'][:cut_off], ridge_train_pred_all['pred'][cut_off:])

cut_off= int(test.shape[0]*.2)
lgb_test_pred_all['target'] = np.append(lgb_test_pred_all['pred'][:cut_off], lgb_test_pred_all['true'][cut_off:])
xgb_test_pred_all['target'] = np.append(xgb_test_pred_all['pred'][:cut_off], xgb_test_pred_all['true'][cut_off:])
cat_test_pred_all['target'] = np.append(cat_test_pred_all['pred'][:cut_off], cat_test_pred_all['true'][cut_off:])

# rf_test_pred_all['target'] = np.append(rf_test_pred_all['outlier'][:cut_off], rf_test_pred_all['pred'][cut_off:])
# ela_test_pred_all['target'] = np.append(ela_test_pred_all['outlier'][:cut_off], ela_test_pred_all['pred'][cut_off:])
# lasso_test_pred_all['target'] = np.append(lasso_test_pred_all['outlier'][:cut_off], lasso_test_pred_all['pred'][cut_off:])
# ridge_test_pred_all['target'] = np.append(ridge_test_pred_all['outlier'][:cut_off], ridge_test_pred_all['pred'][cut_off:])

In [196]:
lgb_train_pred_all = lgb_train_pred_all.sort_values(by='card_id')
xgb_train_pred_all = xgb_train_pred_all.sort_values(by='card_id')
cat_train_pred_all = cat_train_pred_all.sort_values(by='card_id')

# rf_train_pred_all = rf_train_pred_all.sort_values(by='card_id')
# ela_train_pred_all = ela_train_pred_all.sort_values(by='card_id')
# lasso_train_pred_all = lasso_train_pred_all.sort_values(by='card_id')
# ridge_train_pred_all = ridge_train_pred_all.sort_values(by='card_id')

lgb_test_pred_all = lgb_test_pred_all.sort_values(by='card_id')
xgb_test_pred_all = xgb_test_pred_all.sort_values(by='card_id')
cat_test_pred_all = cat_test_pred_all.sort_values(by='card_id')

# rf_test_pred_all = rf_test_pred_all.sort_values(by='card_id')
# ela_test_pred_all = ela_test_pred_all.sort_values(by='card_id')
# lasso_test_pred_all = lasso_test_pred_all.sort_values(by='card_id')
# ridge_test_pred_all = ridge_test_pred_all.sort_values(by='card_id')

In [140]:
# ela = ElasticNet()
lasso = Lasso()
ridge = Ridge()
linear = LinearRegression()
      
preds = []
preds.append(lgb_train_pred_all['target'].values)
preds.append(xgb_train_pred_all['target'].values)
preds.append(cat_train_pred_all['target'].values)
# preds.append(rf_train_pred_all['target'].values)
# preds.append(ela_train_pred_all['target'].values)
# preds.append(lasso_train_pred_all['target'].values)
# preds.append(ridge_train_pred_all['target'].values)
preds = np.array(preds).T

# ela.fit(preds, train_trans['target'].values)
lasso.fit(preds, train_trans['target'].values)
ridge.fit(preds, train_trans['target'].values)
linear.fit(preds, train_trans['target'].values)
    
validate = []
validate.append(lgb_test_pred_all['target'].values)
validate.append(xgb_test_pred_all['target'].values)
validate.append(cat_test_pred_all['target'].values)
# validate.append(rf_test_pred_all['target'].values)
# validate.append(ela_test_pred_all['target'].values)
# validate.append(lasso_test_pred_all['target'].values)
# validate.append(ridge_test_pred_all['target'].values)
validate = np.array(validate).T
    
# ela_pred = ela.predict(validate)
lasso_pred = lasso.predict(validate)
ridge_pred = ridge.predict(validate)
linear_pred = linear.predict(validate)

# fin_pred1 = (ela_pred*ela_score+lasso_pred*lasso_score+ridge_pred*ridge_score+linear_pred*linear_score)/total_score
fin_pred2 = (lasso_pred+ridge_pred+linear_pred)/3
# fin1_submission = submission(fin_pred1)
# fin1_submission.to_csv(write_path+'20190227_linear_rate.csv', index=False)
# fin2_submission = submission(fin_pred2)
# fin2_submission.to_csv('../20190227_linear_uniform.csv', index=False)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [201]:
validate = (lgb_test_pred_all['target'].values+xgb_test_pred_all['target'].values+cat_test_pred_all['target'].values)/3
# validate.append(rf_test_pred_all['target'].values)
# validate.append(ela_test_pred_all['target'].values)
# validate.append(lasso_test_pred_all['target'].values)
# validate.append(ridge_test_pred_all['target'].values)
# validate = np.array(validate).T

In [189]:
cut_off= int(test.shape[0]*.1)
test_pred['target'] = np.append(test_pred['pred'][:cut_off], test_pred['pure'][cut_off:])

In [65]:
def submission(pred):
    ret = pd.DataFrame(lgb_test_pred_all['card_id'])
    ret['target'] = pred
    return ret

In [202]:
test_pred2 = pd.DataFrame(test_trans['card_id'])
test_pred2['target'] = validate

In [204]:
sub_blend = test_pred2[['card_id', 'target']].reset_index(drop=True)
sub_blend.to_csv('../20190227_blend_fin.csv', index=False)