<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [None]:
import os
import datetime
import gc

import numpy as np
import pandas as pd
import lightgbm
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm_notebook
### feature engineer part
import techjam_fe


In [None]:
# Edit data directory here.
# Built with os.path.join so the relative path works on any OS
# (it still evaluates to ".\\techjam" on Windows).
DATA_DIR = os.path.join(".", "techjam")

In [None]:
# Load the prepared training features, training target and test features.
# The bare name `get_prep_data` is never imported in this notebook, so the call
# is qualified with the feature-engineering module imported at the top.
# NOTE(review): assumes get_prep_data is defined in techjam_fe — confirm.
X_train, y_train, X_test = techjam_fe.get_prep_data(DATA_DIR)

In [None]:
# Columns cast to int and passed to LightGBM as categorical features.
cat_feature = [
    'gender',
    'ocp_cd',
    'age_gnd',
    'gnd_ocp',
    'age_ocp',
    'age',
]

In [None]:
def techjam_score(y_pred, y_true):
    """Return the competition score for a set of predictions (100 is perfect).

    Each sample contributes a squared error divided by the squared sum of
    ``min(2 * y_true, y_pred)`` and ``y_true``; the mean of those terms is
    scaled by 100 and subtracted from 100.

    Parameters
    ----------
    y_pred : array-like
        Predicted values.
    y_true : array-like
        Ground-truth values, same length as ``y_pred``.

    Returns
    -------
    float
        ``100 - 100 * mean(per-sample penalty)``.
    """
    predicted = np.array(y_pred)
    actual = np.array(y_true)

    squared_error = (predicted - actual) ** 2
    # The prediction is capped at twice the true value inside the denominator.
    squared_scale = (np.minimum(2 * actual, predicted) + actual) ** 2
    penalty = squared_error / squared_scale

    return 100 - 100 * np.mean(penalty)

In [None]:
def techjam_feval_log(y_pred, dtrain):
    """Custom LightGBM evaluation callback wrapping ``techjam_score``.

    Both the predictions and the dataset labels are passed through ``np.exp``
    before scoring (presumably the model is fit on log-transformed targets —
    confirm against the data-preparation code).

    Parameters
    ----------
    y_pred : array-like
        Raw model predictions.
    dtrain : object exposing ``get_label()``
        Dataset being evaluated (a ``lightgbm.Dataset`` when called by LightGBM).

    Returns
    -------
    tuple
        ``('techjam_score', score, True)``; the trailing ``True`` is the
        higher-is-better flag in LightGBM's ``feval`` convention.
    """
    labels = dtrain.get_label()
    score = techjam_score(np.exp(y_pred), np.exp(labels))
    return 'techjam_score', score, True

In [None]:
# Cast every categorical column to int in both the test and the train frame.
for frame in (X_test, X_train):
    for cat in cat_feature:
        frame[cat] = frame[cat].astype(int)

# free_raw_data=False keeps the raw data attached to the Dataset, which is
# reused for every cross-validation run and for the final model fits.
train_data = lightgbm.Dataset(
    X_train,
    label=y_train,
    categorical_feature=cat_feature,
    free_raw_data=False,
)

# Exhaustive grid: 11 * 5 * 5 = 275 cross-validation runs.
num_leaves_choices = [15, 31, 63, 127, 200, 255, 300, 350, 400, 511, 600]
ft_frac_choices = [0.6, 0.7, 0.8, 0.9, 1.0]
bagging_frac_choices = [0.6, 0.7, 0.8, 0.9, 1.0]

# We will store the cross validation results in a simple list,
# with tuples in the form of (hyperparam dict, cv score):
cv_results = []


for num_lv in tqdm_notebook(num_leaves_choices):
    for bg_fac in bagging_frac_choices:
        for ft_fac in ft_frac_choices:
            hyperparams = {
                "boosting_type": 'gbdt',
                "objective": 'mape',
                # No built-in metric: only the custom feval below is reported.
                "metrics": 'None',
                "num_leaves": num_lv,
                "feature_fraction": ft_fac,
                "bagging_fraction": bg_fac,
                # LightGBM ignores bagging_fraction unless bagging_freq is
                # non-zero; without it every bg_fac value trains the same model.
                "bagging_freq": 1,
                "learning_rate": 0.01,
            }
            validation_summary = lightgbm.cv(hyperparams,
                                             train_data,
                                             num_boost_round=10000,
                                             nfold=5,
                                             feval=techjam_feval_log,
                                             stratified=False,
                                             shuffle=True,
                                             early_stopping_rounds=50,
                                             verbose_eval=10)

            # Per-iteration mean CV score; with early stopping the history is
            # expected to end at the best iteration (see the lightgbm.cv docs),
            # so its length is used as the tree count.
            mean_scores = validation_summary["techjam_score-mean"]
            optimal_num_trees = len(mean_scores)

            # Add the selected number of boosting rounds
            # to the hyperparameter dictionary:
            hyperparams["num_boost_round"] = optimal_num_trees

            # And we append results to cv_results:
            cv_results.append((hyperparams, mean_scores[-1]))

In [None]:
# Rank the (hyperparams, score) tuples by score, ascending: the best is last.
sort_cv_result = sorted(cv_results, key=lambda entry: entry[1])

In [None]:
# Display the last entry of the score-sorted list, i.e. the
# (hyperparams, cv score) tuple with the highest score.
sort_cv_result[-1]

In [None]:
# Select the parameter sets with a CV score > 92.21

In [None]:
### Train one model for each of the 10 best-scoring parameter sets.
# sort_cv_result is ordered by score ascending, so the final 10 entries are the best.
MODELS = []
for params, _cv_score in tqdm_notebook(sort_cv_result[-10:]):
    # Each params dict already carries the "num_boost_round" chosen during CV.
    model = lightgbm.train(params, train_data)
    MODELS.append(model)

In [None]:
### Collect the test-set predictions of every model in the ensemble.
# Raw model outputs are passed through np.exp before being stored.
pred = [np.exp(model.predict(X_test)) for model in MODELS]

In [None]:
# Stack the per-model predictions into one array (first axis = model).
pred = np.array(pred)
# Ensemble: average across models for each test row.
final_pred = np.mean(pred, axis=0)


In [None]:
### Build the submission frame: one row per id with the ensembled prediction.
# The ids are hard-coded as 50001..65000 inclusive, so final_pred must hold 15000 values.
submission = pd.DataFrame({
    'id': list(range(50001, 65001)),
    'final_pred': final_pred,
})