<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [None]:
import datetime
import gc
import os

import lightgbm
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm_notebook

# Directory holding the input CSVs read by get_prep_data
# (cc.csv, demographics.csv, kplus.csv, train.csv, test.csv).
# Edit data directory here
DATA_DIR = "./input/techjam"


def percentile(n):
    """Return a callable that computes the ``n``-th percentile of its argument.

    The returned function wraps ``np.percentile(x, n)`` and carries the name
    ``percentile_<n>``, so that several percentile callables stay
    distinguishable when they are used side by side as aggregation functions.
    """
    def nth_percentile(values):
        return np.percentile(values, n)

    # Give each generated callable its own, percentile-specific name.
    nth_percentile.__name__ = 'percentile_%s' % n
    return nth_percentile

def _cc_amount_stats(joined_cc, by):
    """Per-customer credit-card amount statistics, one set of columns per value of ``by``.

    Pivots ``cc_txn_amt`` with ``id`` as the index and the distinct values of
    column ``by`` as columns, computing mean/min/max/sum/count/var and the 10th
    and 90th percentiles. Columns are flattened to ``cc_<stat>_<value>``.
    """
    # String names are used for the built-in reductions: they yield the same
    # column labels ('mean', 'min', ...) as the equivalent callables and avoid
    # pandas' deprecation warning for passing np.mean/min/... to aggregation.
    aggfuncs = ['mean', 'min', 'max', 'sum', 'count', 'var',
                percentile(10), percentile(90)]
    stats = pd.pivot_table(joined_cc, index='id', columns=by,
                           values='cc_txn_amt', aggfunc=aggfuncs)
    stats.columns = ['cc_' + '_'.join(str(part) for part in col) for col in stats.columns]
    return stats


def get_prep_data(DATA_DIR):
    """Load the raw CSVs from ``DATA_DIR`` and build model-ready feature frames.

    Reads ``cc.csv``, ``demographics.csv``, ``kplus.csv``, ``train.csv`` and
    ``test.csv``; derives demographic crosses, mean/count encodings of the
    (log-transformed) income per category, K-Plus totals and credit-card
    spending statistics split by month, holiday flag, weekend flag and quarter;
    joins everything on customer ``id`` and fills missing values with 0.

    Parameters
    ----------
    DATA_DIR : str
        Directory containing the five CSV files.

    Returns
    -------
    X_train : pandas.DataFrame
        Features (indexed by ``id``) of rows whose transformed income is > 0.
    y_train : pandas.DataFrame
        Single column ``income`` holding ``log1p(income)`` for those rows.
    X_test : pandas.DataFrame
        Features of the remaining rows (income missing, hence 0 after filling).
        NOTE: a labelled row with income 0 would also land here.
    """
    # Reading files
    cc = pd.read_csv(os.path.join(DATA_DIR, 'cc.csv'), parse_dates=['pos_dt'])
    demo = pd.read_csv(os.path.join(DATA_DIR, 'demographics.csv'))
    kplus = pd.read_csv(os.path.join(DATA_DIR, 'kplus.csv'), parse_dates=['sunday'])
    train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
    test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

    # Set-up: one demographics row per customer, with the income label attached
    # (rows coming from test.csv have no income, i.e. NaN).
    cc_mapper = demo[['id', 'cc_no']].copy()
    demo = demo.drop('cc_no', axis=1).drop_duplicates().reset_index(drop=True)
    label = pd.concat([train, test], axis=0, ignore_index=True)
    demo = demo.merge(label, on='id')
    demo['ocp_cd'] = demo['ocp_cd'].fillna(0).astype(int)
    kplus = kplus.set_index('id')
    # Attach the customer id to every card transaction; the card number itself is not a feature.
    joined_cc = cc.merge(cc_mapper, on='cc_no', how='inner').drop('cc_no', axis=1)

    # Calendar buckets, stored as strings so they become readable column suffixes after pivoting.
    kplus['month'] = 'month' + kplus['sunday'].dt.month.astype(str)
    joined_cc['month'] = 'month' + joined_cc['pos_dt'].dt.month.astype(str)

    bank_holidays = ['2018-01-01', '2018-01-02', '2018-03-01', '2018-04-06', '2018-04-13',
                     '2018-04-14', '2018-04-15', '2018-04-16', '2018-05-01', '2018-05-29']
    holiday_dates = [datetime.datetime.strptime(day, '%Y-%m-%d') for day in bank_holidays]
    on_holiday = joined_cc['pos_dt'].isin(holiday_dates).astype(int)
    # pandas weekday numbering is Monday=0 ... Sunday=6, so the weekend is {5, 6}.
    on_weekend = joined_cc['pos_dt'].dt.weekday.isin([5, 6]).astype(int)
    joined_cc['is_holiday'] = 'holiday' + on_holiday.astype(str)
    joined_cc['is_weekend'] = 'weekend' + on_weekend.astype(str)
    # Two buckets only: q1 = January-March, q2 = April onwards.
    joined_cc['quarter'] = 'q' + ((joined_cc['pos_dt'].dt.month >= 4) + 1).astype(str)

    # The model target is log1p(income); invert with expm1.
    demo['income'] = np.log1p(demo['income'])

    # Simple per-customer counts/flags
    demo['cc_cnt'] = demo['id'].map(cc_mapper.groupby('id').cc_no.count())
    demo['has_kp'] = demo['id'].isin(kplus.index).astype(int)

    # Crossing categorical features as another feature [374 / 336]
    demo['age_gnd'] = demo['gender'].astype(str) + demo['age'].astype(str)
    demo['gnd_ocp'] = demo['gender'].astype(str) + demo['ocp_cd'].astype(str)
    demo['age_ocp'] = demo['age'].astype(str) + demo['ocp_cd'].astype(str)

    # Left age out of categorical features since it's ordinal
    categorical_features = ['gender', 'ocp_cd', 'age_gnd', 'gnd_ocp', 'age_ocp']

    # Target encoding, code modified from [374]: mean and count of the
    # transformed income per category, computed on labelled rows only
    # (NaN > 0 is False, so unlabelled rows are excluded).
    labelled = demo[demo['income'] > 0]
    for feature in categorical_features + ['age']:
        income_by_group = labelled.groupby(feature)['income']
        demo[feature + '_mean'] = demo[feature].map(income_by_group.mean())
        demo[feature + '_count'] = demo[feature].map(income_by_group.count())

    # Preparing the combined (train + test) feature frame
    train = demo.set_index('id')

    # K-Plus totals: overall and per month
    kplus_tot = kplus.groupby('id').agg({'kp_txn_count': 'sum', 'kp_txn_amt': 'sum'}).copy()
    kplus_mm_tot = (kplus.groupby(['id', 'month'])
                         .agg({'kp_txn_count': 'sum', 'kp_txn_amt': 'sum'})
                         .unstack(level=1)
                         .copy())
    kplus_mm_tot.columns = ['_'.join([str(c) for c in lst]) for lst in kplus_mm_tot.columns]

    # Credit-card totals
    cc_tot = joined_cc.groupby('id').agg({'cc_txn_amt': ['count', 'sum']}).copy()
    cc_tot.columns = ['_'.join(i) for i in cc_tot.columns]

    # Credit-card statistics per month / holiday flag / weekend flag / quarter
    combined_cc = _cc_amount_stats(joined_cc, 'month')
    combined_cc_holiday = _cc_amount_stats(joined_cc, 'is_holiday')
    combined_cc_weekend = _cc_amount_stats(joined_cc, 'is_weekend')
    combined_cc_quarter = _cc_amount_stats(joined_cc, 'quarter')

    # Joining all together; customers without any activity get 0, and so does the missing test income.
    train = (train.join(kplus_tot)
                  .join(kplus_mm_tot)
                  .join(cc_tot)
                  .join(combined_cc)
                  .join(combined_cc_holiday)
                  .join(combined_cc_weekend)
                  .join(combined_cc_quarter)
                  .fillna(0))

    has_income = train['income'] > 0
    X_train = train[has_income].drop('income', axis=1).copy()
    y_train = pd.DataFrame(train[has_income]['income']).copy()
    X_test = train[train['income'] <= 0].drop('income', axis=1).copy()

    return X_train, y_train, X_test

In [None]:
# Build the feature frames: labelled features, their log1p(income) targets,
# and the features of the rows without a usable income label.
X_train, y_train, X_test =  get_prep_data(DATA_DIR)

In [None]:
# Columns handed to LightGBM as categorical features (cast to int before the Dataset is built).
cat_feature = ['gender','ocp_cd','age_gnd','gnd_ocp','age_ocp', 'age']

In [None]:
def techjam_score(y_pred, y_true):
    """Competition score for income predictions (higher is better, 100 = perfect).

    Computes ``100 - 100 * mean(err**2 / (min(2 * y_true, y_pred) + y_true)**2)``
    with ``err = y_pred - y_true``.

    Parameters
    ----------
    y_pred, y_true : array-like
        Predicted and true values on the original (non-log) scale. Both are
        flattened to 1-D first, so a column-shaped ``(n, 1)`` input (e.g. a
        single-column DataFrame) is paired element-wise with an ``(n,)`` input
        instead of silently broadcasting to an ``(n, n)`` matrix.

    Returns
    -------
    numpy.float64
        The score; ``nan`` if any denominator is zero together with a zero error.
    """
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    y_true = np.asarray(y_true, dtype=float).ravel()

    squared_error = (y_pred - y_true) ** 2
    # Over-predictions are capped at 2 * y_true inside the normaliser.
    normaliser = (np.minimum(2 * y_true, y_pred) + y_true) ** 2
    return 100 - 100 * np.mean(squared_error / normaliser)

In [None]:
def techjam_feval_log(y_pred, dtrain):
    """Custom evaluation function: techjam score for models trained on log1p(income).

    Parameters
    ----------
    y_pred : array-like
        Raw model predictions on the log1p scale.
    dtrain : object exposing ``get_label()``
        Dataset whose labels are log1p-transformed incomes.

    Returns
    -------
    tuple
        ``('techjam_score', score, True)`` - metric name, value, and a flag
        stating that higher values are better.
    """
    y_true = dtrain.get_label()
    # The target is built with np.log1p, so np.expm1 (not np.exp) is its exact inverse.
    return 'techjam_score', techjam_score(np.expm1(y_pred), np.expm1(y_true)), True

In [None]:
# Cast the categorical columns to integer codes before handing them to LightGBM.
for cat in cat_feature:
    X_test[cat] = X_test[cat].astype(int)
    X_train[cat] = X_train[cat].astype(int)

# free_raw_data=False keeps the raw data attached so the same Dataset can be
# reused for every cross-validation run and for the final training.
train_data = lightgbm.Dataset(X_train, label=y_train, categorical_feature=cat_feature, free_raw_data=False)

# Search grid
num_leaves_choices = [15, 31, 63, 127, 200, 255, 300, 350, 400, 511, 600]
ft_frac_choices = [0.6, 0.7, 0.8, 0.9, 1.0]
bagging_frac_choices = [0.6, 0.7, 0.8, 0.9, 1.0]

# We will store the cross validation results in a simple list,
# with tuples in the form of (hyperparam dict, cv score):
cv_results = []

for num_lv in tqdm_notebook(num_leaves_choices):
    for bg_fac in bagging_frac_choices:
        for ft_fac in ft_frac_choices:
            hyperparams = {"boosting_type": 'gbdt',
                           "objective": 'mape',
                           # Disable built-in metrics so only the custom feval drives early stopping.
                           "metrics": 'None',
                           "num_leaves": num_lv,
                           "feature_fraction": ft_fac,
                           "bagging_fraction": bg_fac,
                           # LightGBM ignores bagging_fraction unless bagging_freq is non-zero;
                           # without this the bagging dimension of the grid has no effect.
                           "bagging_freq": 1,
                           "learning_rate": 0.01
                           }
            validation_summary = lightgbm.cv(hyperparams,
                                             train_data,
                                             num_boost_round=10000,
                                             nfold=5,
                                             feval=techjam_feval_log,
                                             stratified=False,
                                             shuffle=True,
                                             early_stopping_rounds=50,
                                             verbose_eval=10)

            cv_scores = validation_summary["techjam_score-mean"]

            # The length of the score history is taken as the number of trees to
            # train later; it is stored in the hyperparameter dictionary.
            optimal_num_trees = len(cv_scores)
            hyperparams["num_boost_round"] = optimal_num_trees

            # And we append results to cv_results:
            cv_results.append((hyperparams, cv_scores[-1]))

In [None]:
# Sort the (hyperparams, cv score) tuples ascending by score, so the best configuration ends up last.
sort_cv_result = sorted(cv_results, key=lambda tup:tup[1])

In [None]:
# Show the highest-scoring (hyperparams, cv score) pair - the last element of the ascending sort.
sort_cv_result[-1]

In [None]:
# Select the parameter sets with a CV score > 92.21

In [None]:
### select best 10 models
# Train one booster per configuration, taken from the ten last entries of
# sort_cv_result (sorted ascending by CV score, so these are the ten best).
MODELS = []
for params, _cv_score in tqdm_notebook(sort_cv_result[-10:]):
    MODELS.append(lightgbm.train(params, train_data))

In [None]:
### ensemble 10 models 
# Collect one prediction vector per model, converted back to the income scale.
pred = []
for model in MODELS:
    y_pred = model.predict(X_test)
    # The target was built with np.log1p, so np.expm1 (not np.exp) is its exact inverse.
    y_pred = np.expm1(y_pred)
    pred.append(y_pred)

In [None]:
# Stack the per-model predictions into a (n_models, n_rows) array ...
pred = np.array(pred)
# ... and ensemble by taking the plain average across models.
final_pred = np.mean(pred, axis=0)


In [None]:
### Create submission dataframe
# Take the ids from X_test itself (it is indexed by customer id) so every
# prediction is paired with the row it was computed for, instead of assuming
# the rows are exactly ids 50001..65000 in ascending order.
submission = pd.DataFrame({
    'id': X_test.index,
    'final_pred': final_pred,
})