# TEST LIGHTGBM REGRESSOR

In [1]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn import datasets
import lightgbm as lgb
from lightgbm import LGBMRegressor
import pandas as pd
from datetime import datetime
from time import time
from sklearn.metrics import mean_squared_error
import gc
import os
import pickle

In [2]:
import warnings
warnings.filterwarnings('ignore')

### FUNCTIONS

In [3]:
def on_kaggle():
    """Report whether the ``KAGGLE_KERNEL_RUN_TYPE`` environment variable is set.

    The notebook uses this as its signal for "running on Kaggle" when
    deciding where to write the submission file.

    Returns:
        bool: True if the variable exists in the environment, else False.
    """
    return os.environ.get("KAGGLE_KERNEL_RUN_TYPE") is not None

In [4]:
def plot_importance(booster, figsize=(10,5), feature_names=None, **kwargs):
    """Plot feature importances of a LightGBM model.

    This notebook imports LightGBM (``lgb``) only, so the plot is delegated to
    ``lgb.plot_importance`` instead of the xgboost helpers, which are not
    available here. ``lgb.plot_importance`` creates its own figure from
    ``figsize``, so no direct matplotlib call is needed.

    Args:
        booster: Trained LightGBM ``Booster`` or ``LGBMModel``.
        figsize: (width, height) of the figure, forwarded to LightGBM.
        feature_names: Optional sequence of display names, where position
            ``i`` names feature ``i``. Used to relabel the y-axis.
        **kwargs: Forwarded unchanged to ``lgb.plot_importance``.

    Returns:
        The matplotlib Axes returned by ``lgb.plot_importance``.
    """
    ax = lgb.plot_importance(booster, figsize=figsize, **kwargs)

    if feature_names is not None and len(feature_names) > 0:
        # NOTE(review): assumes the model was trained without explicit feature
        # names, in which case LightGBM labels them "Column_<i>" - confirm.
        # Labels that do not match this pattern are left untouched.
        default_to_given = {
            'Column_{0}'.format(i): name for i, name in enumerate(feature_names)
        }
        current = [tick.get_text() for tick in ax.get_yticklabels()]
        ax.set_yticklabels([default_to_given.get(label, label) for label in current])

    return ax

In [5]:
def make_submission(test, submission):
    """Reshape long-format predictions into submission layout and write a CSV.

    Args:
        test: DataFrame containing at least the columns ``id``, ``date`` and
            ``demand`` (one row per id/date pair).
        submission: Sample-submission DataFrame with an ``id`` column. Rows
            whose id ends with ``"evaluation"`` are appended as they are.
            NOTE(review): presumably these rows already carry the same
            ``F<n>`` columns - confirm against the sample file.

    Raises:
        AssertionError: If the assembled frame contains nulls outside ``id``
            or its ``id`` column differs from ``submission["id"]``.

    Side effects:
        Writes ``submission.csv`` when ``on_kaggle()`` is true, otherwise
        ``{OUTPUT_PATH}/{OUTPUT_NAME}.csv``.
    """
    preds = test[["id", "date", "demand"]]
    preds = preds.pivot(index="id", columns="date", values="demand").reset_index()

    # Take the number of day columns from the pivot itself rather than from a
    # DAYS_PRED global, which this notebook does not define. The checks below
    # still catch a width that does not match the sample submission.
    n_days = preds.shape[1] - 1
    preds.columns = ["id"] + ["F" + str(d + 1) for d in range(n_days)]

    # The inner merge keeps the sample submission's row order for matching ids.
    vals = submission[["id"]].merge(preds, how="inner", on="id")
    evals = submission[submission["id"].str.endswith("evaluation")]
    final = pd.concat([vals, evals])

    assert final.drop("id", axis=1).isnull().sum().sum() == 0, \
        "submission contains missing values"
    assert final["id"].equals(submission["id"]), \
        "submission ids do not match the sample submission"

    if on_kaggle():
        final.to_csv("submission.csv", index=False)
    else:
        final.to_csv(f"{OUTPUT_PATH}/{OUTPUT_NAME}.csv", index=False)

In [6]:
def rmse(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

### GLOBAL VARIABLES

In [9]:
# Directories (relative paths).
INPUT_PATH = "../../../data"
FEATURES_PATH = "../../../data/features"
MODEL_PATH = "../../../models/light_gbm"
OUTPUT_PATH = "../../../data/submission"

# Names of the artifacts read and written by this notebook.
FEATURES_NAME = "feateng_001"
MODEL_NAME = "lgb_reg_0001"
OUTPUT_NAME = "submission_001"

# Run settings; not all of them are read by the cells in this notebook.
SEED = 47
CV = 3
SCORE = "rmse"
NJOBS = -1
USEGPU = False
handlingnull = False

### LOAD DATA

In [19]:
# Test matrix stored as a NumPy .npy file under the train_test folder.
x_test_file = f'{INPUT_PATH}/train_test/X_test.npy'
X_test = np.load(x_test_file)

In [22]:
# Pickled pandas object; later extended with a `demand` column for the submission.
id_date_file = f'{INPUT_PATH}/train_test/id_date.pkl'
id_date = pd.read_pickle(id_date_file)

In [None]:
# Sample submission file; passed to make_submission() as the layout to match.
sample_submission_file = f'{INPUT_PATH}/raw/sample_submission.csv'
submission = pd.read_csv(sample_submission_file)

In [10]:
# Read the list of feature names from the 'features' column.
# Single-bracket selection returns a Series, which provides tolist();
# double brackets would return a DataFrame, which has no such method.
features = pd.read_csv(f'{INPUT_PATH}/features/features.csv', index_col=False)['features'].tolist()

In [11]:
# Restrict X_test to the entries named in `features`.
# NOTE(review): X_test is produced by np.load(); indexing an ndarray with a
# list of strings only works for structured arrays - confirm the dtype of the
# saved array, or select columns by position instead.
X_test = X_test[features]

In [14]:
# Keep X_test as a raw array instead of wrapping it in lgb.Dataset:
# Booster.predict() rejects Dataset inputs, and the scoring cell reads
# X_test.shape, which a Dataset does not provide.
X_test = np.asarray(X_test)

### LOAD MODEL

In [29]:
# NOTE: unpickling can execute arbitrary code - only load model files that
# come from a trusted source.
# The context manager closes the file handle once the models are loaded.
with open(f'{MODEL_PATH}/{MODEL_NAME}.sav', 'rb') as model_file:
    models = pickle.load(model_file)

In [34]:
# Show what was unpickled.
# NOTE(review): the recorded output below is a parameter dict, while the
# scoring cells iterate over `models` and call .predict() on each item -
# confirm the saved file really holds a collection of trained models.
print(models)

{'bagging_fraction': 0.8327365806346091,
 'feature_fraction': 0.8881760200078465,
 'max_depth': 13,
 'min_child_weight': 30,
 'min_split_gain': 0.004145814603445978,
 'num_leaves': 99,
 'n_estimators': 344,
 'seed': 47,
 'booster': 'gbdt',
 'objective': 'binary',
 'scale_pos_weight': 9.22705413575158,
 'num_threads': -1}

### SCORING

In [None]:
# Number of models to average over. len() works for a plain list as well as
# for a NumPy array, whereas .shape only exists on arrays.
nmodels = len(models)

In [None]:
# Average the predictions and the feature importances over all loaded models.
imp_type = "gain"
n_cols = X_test.shape[1]
n_rows = X_test.shape[0]
importances = np.zeros(n_cols)
y_pred = np.zeros(n_rows)

for booster in models:
    # Accumulate in place; the totals are divided by the model count below.
    y_pred += booster.predict(X_test)
    importances += booster.feature_importance(imp_type)

y_pred /= nmodels
importances /= nmodels

### SAVE RESULTS

In [None]:
# Attach the averaged predictions as the `demand` column and write the file.
# The scoring cell stores them in `y_pred`; no module-level `preds` exists.
make_submission(id_date.assign(demand=y_pred), submission)