In [65]:
import warnings
from pandas.core.common import SettingWithCopyWarning
import os
import pandas as pd
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import make_scorer
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Location of the training data, relative to the notebook's working directory.
train_csv_path = 'train/train.csv'
df = pd.read_csv(train_csv_path)

In [3]:
# Show the loaded training frame as this cell's output.
df

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
0,0,0,0,0,0,1.5,69.08,-12,0.0
1,0,0,30,0,0,1.5,69.06,-12,0.0
2,0,1,0,0,0,1.6,71.78,-12,0.0
3,0,1,30,0,0,1.6,71.75,-12,0.0
4,0,2,0,0,0,1.6,75.20,-12,0.0
...,...,...,...,...,...,...,...,...,...
52555,1094,21,30,0,0,2.4,70.70,-4,0.0
52556,1094,22,0,0,0,2.4,66.79,-4,0.0
52557,1094,22,30,0,0,2.2,66.78,-4,0.0
52558,1094,23,0,0,0,2.1,67.72,-4,0.0


### Sampling: build the features and the shifted target

In [5]:
# Features: every column except the time-of-day/day counters and the label itself.
feature_frame = df.drop(columns=['Minute', 'Hour', 'Day', 'TARGET'])

# Pair each feature row with the TARGET found 96 rows later:
# the features lose their last 96 rows, the target loses its first 96 rows.
X_train = feature_frame.iloc[:-96]
y_train = df['TARGET'].iloc[96:]

## Modeling: Grid Search

In [6]:
def pinball_loss(y_true, y_pred, tau=0.5):
    """Return the mean pinball (quantile) loss, ignoring NaN entries.

    Args:
        y_true: Observed values (array-like supporting element-wise arithmetic).
        y_pred: Predicted values, same shape as ``y_true``.
        tau: Quantile level; under-predictions are weighted by ``tau`` and
            over-predictions by ``1 - tau``.

    Returns:
        The NaN-ignoring mean of the per-element losses.
    """
    over_mask = y_pred >= y_true
    under_mask = y_pred < y_true

    # Each element contributes to exactly one of the two terms.
    over_penalty = over_mask * (1 - tau) * (y_pred - y_true)
    under_penalty = under_mask * tau * (y_true - y_pred)

    return np.nanmean(over_penalty + under_penalty)

# Expose the pinball loss to scikit-learn as a "lower is better" scorer
# (pinball_loss is called with its default tau here).
pinball_scorer = make_scorer(pinball_loss, greater_is_better=False)

# Candidate hyper-parameters; only max_depth actually varies.
param = dict(
    max_depth=[5, 6],
    n_estimators=[10000],
    colsample_bytree=[0.7],
    colsample_bylevel=[0.7],
)

# The max_depth given here is replaced by each grid candidate during the search.
model = xgb.XGBRegressor(max_depth=7)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param,
    scoring=pinball_scorer,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_estimator_)

In [61]:
# Predict on the same rows the grid search was fitted on (in-sample check).
pred_train = grid_search.predict(X_train)

train_pinball = pinball_loss(y_train, pred_train)
print('Pinball Loss score: ', train_pinball)

Pinball Loss score:  2.017757684194703


# Predict

In [58]:
# Load one test file and drop the same non-feature columns as for training.
X_test = (
    pd.read_csv('test/0.csv')
    .drop(['Minute', 'Hour', 'Day', 'TARGET'], axis=1)
)
# Keep only the rows from position 336-96 onwards.
X_test = X_test[336-96:]

pred_test = grid_search.predict(X_test)

# Floor negative predictions at zero, in place.
pred_test[pred_test < 0] = 0

In [116]:
def pinball_loss(y_true, y_pred, tau=0.5):
    """Return the mean pinball (quantile) loss, ignoring NaN entries.

    ``tau`` defaults to 0.5 so that two-argument calls (for example a scorer
    built without an explicit ``tau``) keep working after this cell is run.

    Args:
        y_true: Observed values (array-like).
        y_pred: Predicted values, same shape as ``y_true``.
        tau: Quantile level; under-predictions are weighted by ``tau`` and
            over-predictions by ``1 - tau``.

    Returns:
        The NaN-ignoring mean of the per-element losses.
    """
    # Work on plain arrays so values are paired by position, not by pandas
    # index labels, and so list inputs are accepted as well.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.nanmean((y_pred >= y_true) * (1 - tau) * (y_pred - y_true) + (y_pred < y_true) * tau * (y_true - y_pred))

def build_model(tau):
    """Grid-search an XGBoost regressor, scored with the pinball loss at ``tau``.

    Reads the notebook-level ``X_train`` / ``y_train`` and prints the chosen
    parameters and estimator.

    Args:
        tau: Quantile level forwarded to ``pinball_loss`` when scoring.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    # Forward tau to the loss: the scorer otherwise calls
    # pinball_loss(y_true, y_pred) without it and the quantile level is ignored.
    pinball_scorer = make_scorer(pinball_loss, greater_is_better=False, tau=tau)

    print("Tau : ", tau)

    param = {
        'max_depth':[6],
        'n_estimators':[10000],
        'colsample_bytree':[0.7],
        'colsample_bylevel':[0.7],
    }
    # NOTE(review): tau is used only by the scorer; the regressor below is
    # built without any tau-dependent setting, so the fitted trees presumably
    # do not differ between quantiles — confirm whether a quantile objective
    # was intended. max_depth=7 is replaced by the grid value during the search.
    model = xgb.XGBRegressor(max_depth=7)

    grid_search = GridSearchCV(estimator=model, param_grid=param,
                               scoring=pinball_scorer,
                               n_jobs=-1)

    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)
    print(grid_search.best_estimator_)

    return grid_search

In [113]:
def do_predict(model, n_files=81, test_dir='test', clip_min=0.0):
    """Predict on every test file and return all predictions as one flat array.

    For each ``{test_dir}/{i}.csv`` with ``i`` in ``range(n_files)`` the
    non-feature columns are dropped, the rows from position ``336-96`` onwards
    are kept, and ``model.predict`` is called on them.

    Args:
        model: Any object exposing ``predict(frame)``.
        n_files: Number of consecutively numbered test files to read.
        test_dir: Directory containing the test CSV files.
        clip_min: Lower bound applied to the predictions, matching the
            zero-flooring done for the single-file prediction earlier in the
            notebook. Pass ``None`` to return the raw predictions.

    Returns:
        1-D ``numpy.ndarray`` with the per-file predictions concatenated in
        file order.
    """
    outs = []

    for i in range(n_files):
        data = pd.read_csv(f"{test_dir}/{i}.csv")
        data = data.drop(['Minute', 'Hour', 'Day', 'TARGET'], axis=1)
        data = data[336-96:]

        outs.append(model.predict(data))

    # flatten() returns a copy, so the in-place clipping below cannot touch
    # arrays owned by the model.
    flat = np.array(outs).flatten()
    if clip_min is not None:
        flat[flat < clip_min] = clip_min

    return flat

In [114]:
# One fitted grid search per quantile level, stored in the same order as `taus`.
models = []
taus = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

for tau_level in taus:
    model = build_model(tau_level)
    models.append(model)

Tau :  0.1
{'colsample_bylevel': 0.7, 'colsample_bytree': 1, 'max_depth': 4, 'n_estimators': 300}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=4,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=300, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)
Tau :  0.2
{'colsample_bylevel': 0.7, 'colsample_bytree': 1, 'max_depth': 4, 'n_estimators': 300}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learnin

In [115]:
# Silence pandas' SettingWithCopyWarning from here on.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

df_sample = pd.read_csv('sample_submission.csv')

columns = ['q_0.1', 'q_0.2', 'q_0.3', 'q_0.4', 'q_0.5', 'q_0.6', 'q_0.7', 'q_0.8', 'q_0.9']

# models[k] provides the predictions written to columns[k].
for idx, column_name in enumerate(columns):
    df_sample[column_name] = do_predict(models[idx])

df_sample.to_csv(f'XGBoost_Second_Try_Tiny_change.csv', index=False)

In [None]:
# Draw one figure per element of `hiss`, plotting its 'loss' and 'val_loss'
# series from the element's `.history` mapping.
# NOTE(review): `hiss` is not defined in any cell visible in this notebook and
# this cell has no execution count — presumably a leftover from another
# notebook. Confirm it is still needed; if `hiss` is indeed undefined, the cell
# raises NameError on Restart & Run All.
for h in hiss:
    plt.plot(h.history['loss'])
    plt.plot(h.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()