## Import library

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import random

import warnings
warnings.filterwarnings("ignore")

## Import csv

In [2]:
train = pd.read_csv("../data/train/train.csv")
# train.tail()
print(train.shape )
# train.shape #(52560, 9)

submission = pd.read_csv("../data/sample_submission.csv")
# submission.tail()
print(submission.shape )
# submission.shape #(7776, 10)

(52560, 9)
(7776, 10)


In [9]:
def preprocess_data(data, is_train=True):
    
    temp = data.copy()
    temp = temp[['Hour', 'Minute', 'TARGET', 'DHI', 'DNI', 'WS', 'RH', 'T']]
    
    temp['Time_sin'] = np.sin(( 2*np.pi* (2*temp['Hour']+temp['Minute'].apply(lambda x: bool(x))- 12) / 48 ))
#     temp['Time_cos'] = np.cos(( 2*np.pi* (2*temp['Hour']+temp['Minute'].apply(lambda x: bool(x))) / 48 ))
#     temp = temp.drop(['Hour', 'Minute'], axis = 1)
    temp = temp.drop(['Minute'], axis = 1)
    
#     temp['new1'] = temp["WS"] * temp["T"]    
    if is_train==True:          
    
        temp['Target1'] = temp['TARGET'].shift(-48).fillna(method='ffill')
        temp['Target2'] = temp['TARGET'].shift(-96).fillna(method='ffill')
        temp = temp.dropna()
        
        # 타킷 컬럼 끝으로
        temp_t1 = temp['Target1']
        temp = temp.drop(['Target1'], axis = 1)
        temp['Target1'] = temp_t1
        temp_t2 = temp['Target2']
        temp = temp.drop(['Target2'], axis = 1)
        temp['Target2'] = temp_t1
        
        return temp.iloc[:-96]
    
    elif is_train==False:        
        
        return temp.iloc[-48:, :]


df_train = preprocess_data(train)
# df_train.shape #(52464, 9) # 52560 - 52464 = 96
# df_train.iloc[:48]

In [12]:
df_test = [] # 리스트

for i in range(81):
    file_path = '../data/test/' + str(i) + '.csv'
    temp = pd.read_csv(file_path) # 데이터 프레임 shape = (336, 9)
    temp = preprocess_data(temp, is_train=False) # 전처리  # 데이터 프레임 shape = (48, 7)
    df_test.append(temp) # 리스트에 전처리된 데이터 프레임 append

X_test = pd.concat(df_test) # 전처리된 데이터 프레임들 세로 병합
print(X_test.shape)
# X_test.shape # (3888, 7)

(3888, 8)


In [8]:
from sklearn.model_selection import train_test_split # 데이터셋 분리
# Target1, test_size 30%, random_state=0
X_train_1, X_valid_1, Y_train_1, Y_valid_1 = train_test_split(df_train.iloc[:, :-2], df_train.iloc[:, -2], test_size=0.3, random_state=0)
# Target2, test_size 30%, random_state=0
X_train_2, X_valid_2, Y_train_2, Y_valid_2 = train_test_split(df_train.iloc[:, :-2], df_train.iloc[:, -1], test_size=0.3, random_state=0)

print(X_train_1.shape)
print(X_valid_1.shape)
print(Y_train_1.shape)
print(Y_valid_1.shape)
# X_train_1.shape # (36724, 7)
# X_valid_1.shape # (15740, 7)
# Y_train_1.shape # (36724,)
# Y_valid_1.shape # (15740,)

(36724, 8)
(15740, 8)
(36724,)
(15740,)


In [13]:
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

In [22]:
from lightgbm import LGBMRegressor

# Get the model and the predictions in (a) - (b)
def LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test):
    
    # (a) Modeling  
    model = LGBMRegressor(objective='quantile', alpha=q,
                         n_estimators=10000, bagging_fraction=0.7, learning_rate=0.0165)                   
                         
                         
    model.fit(X_train, Y_train, eval_metric = ['quantile'], 
          eval_set=[(X_valid, Y_valid)], early_stopping_rounds=700, verbose=1000)

    # (b) Predictions
    # X_test shape = (3888,7)
    pred = pd.Series(model.predict(X_test).round(2))
    return pred, model

In [23]:
# Target 예측

def train_data(X_train, Y_train, X_valid, Y_valid, X_test):

    LGBM_models=[]
    LGBM_actual_pred = pd.DataFrame()

    # quantile별로 모델 생성
    for q in quantiles:
        print(q) # q는 0.1부터 0.9까지 9개
        pred , model = LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test)
        # 모델 생성 시마다 모델 저장. 
        LGBM_models.append(model)
        # 예측할 때마다 가로 병합
        LGBM_actual_pred = pd.concat([LGBM_actual_pred,pred],axis=1)

    LGBM_actual_pred.columns=quantiles
    # LGBM_actual_pred shape = 3888, 9
    return LGBM_models, LGBM_actual_pred

In [16]:
# Target1
models_1, results_1 = train_data(X_train_1, Y_train_1, X_valid_1, Y_valid_1, X_test)

0.1
Training until validation scores don't improve for 700 rounds
[500]	valid_0's quantile: 1.36288
[1000]	valid_0's quantile: 1.36204
[1500]	valid_0's quantile: 1.36254
Early stopping, best iteration is:
[928]	valid_0's quantile: 1.3618
0.2
Training until validation scores don't improve for 700 rounds
[500]	valid_0's quantile: 2.19308
[1000]	valid_0's quantile: 2.18805
[1500]	valid_0's quantile: 2.1844
[2000]	valid_0's quantile: 2.18341
[2500]	valid_0's quantile: 2.18242
[3000]	valid_0's quantile: 2.17856
[3500]	valid_0's quantile: 2.17844
[4000]	valid_0's quantile: 2.17716
[4500]	valid_0's quantile: 2.17694
[5000]	valid_0's quantile: 2.17689
[5500]	valid_0's quantile: 2.17511
[6000]	valid_0's quantile: 2.17511
[6500]	valid_0's quantile: 2.17367
[7000]	valid_0's quantile: 2.17256
[7500]	valid_0's quantile: 2.17215
[8000]	valid_0's quantile: 2.17259
Early stopping, best iteration is:
[7454]	valid_0's quantile: 2.17181
0.3
Training until validation scores don't improve for 700 rounds
[5

In [17]:
# Target2
models_2, results_2 = train_data(X_train_2, Y_train_2, X_valid_2, Y_valid_2, X_test)

0.1
Training until validation scores don't improve for 700 rounds
[500]	valid_0's quantile: 1.36288
[1000]	valid_0's quantile: 1.36204
[1500]	valid_0's quantile: 1.36254
Early stopping, best iteration is:
[928]	valid_0's quantile: 1.3618
0.2
Training until validation scores don't improve for 700 rounds
[500]	valid_0's quantile: 2.19308
[1000]	valid_0's quantile: 2.18805
[1500]	valid_0's quantile: 2.1844
[2000]	valid_0's quantile: 2.18341
[2500]	valid_0's quantile: 2.18242
[3000]	valid_0's quantile: 2.17856
[3500]	valid_0's quantile: 2.17844
[4000]	valid_0's quantile: 2.17716
[4500]	valid_0's quantile: 2.17694
[5000]	valid_0's quantile: 2.17689
[5500]	valid_0's quantile: 2.17511
[6000]	valid_0's quantile: 2.17511
[6500]	valid_0's quantile: 2.17367
[7000]	valid_0's quantile: 2.17256
[7500]	valid_0's quantile: 2.17215
[8000]	valid_0's quantile: 2.17259
Early stopping, best iteration is:
[7454]	valid_0's quantile: 2.17181
0.3
Training until validation scores don't improve for 700 rounds
[5

In [18]:
print(results_1.shape, results_2.shape)#(3888, 9) (3888, 9)

(3888, 9) (3888, 9)


In [19]:
print(submission.shape)

(7776, 10)


In [20]:
submission.loc[submission.id.str.contains("Day7"), "q_0.1":] = results_1.sort_index().values
submission.loc[submission.id.str.contains("Day8"), "q_0.1":] = results_2.sort_index().values
submission

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
0,0.csv_Day7_0h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.csv_Day7_0h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.csv_Day7_1h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.csv_Day7_1h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.csv_Day7_2h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7771,80.csv_Day8_21h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7772,80.csv_Day8_22h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7773,80.csv_Day8_22h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7774,80.csv_Day8_23h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
submission.to_csv('../submission/submission_LGBM_mybase.csv', index=False)