## 사용할 패키지

In [1]:
import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb
from lightgbm import LGBMClassifier

In [2]:
import optuna 
from optuna import Trial, visualization
from optuna.samplers import TPESampler

## 데이터 불러오기

In [3]:
site = pd.read_csv('data/site_info.csv')
energy = pd.read_csv('data/energy.csv')
dangjin_fcst = pd.read_csv('data/dangjin_fcst_wind_time.csv')
ulsan_fcst = pd.read_csv('data/ulsan_fcst_wind_time.csv')

In [4]:
energy

Unnamed: 0,time,dangjin_floating,dangjin_warehouse,dangjin,ulsan
0,2018-03-01 1:00:00,0.0,0.0,0,0
1,2018-03-01 2:00:00,0.0,0.0,0,0
2,2018-03-01 3:00:00,0.0,0.0,0,0
3,2018-03-01 4:00:00,0.0,0.0,0,0
4,2018-03-01 5:00:00,0.0,0.0,0,0
...,...,...,...,...,...
25627,2021-01-31 20:00:00,0.0,0.0,0,0
25628,2021-01-31 21:00:00,0.0,0.0,0,0
25629,2021-01-31 22:00:00,0.0,0.0,0,0
25630,2021-01-31 23:00:00,0.0,0.0,0,0


In [5]:
dangjin_fcst

Unnamed: 0,Forecast_time,Temperature,Humidity,WindSpeed,WindDirection,Cloud,Wind_X,Wind_Y,Day sin,Day cos,Year sin,Year cos
0,2018-03-02 00:00:00,-2.000000,55.000000,6.700000,336.000000,1.0,6.120755,-2.725136,-7.071068e-01,-7.071068e-01,0.858280,0.513182
1,2018-03-02 01:00:00,-2.333333,55.000000,6.133333,337.000000,1.0,5.645763,-2.396484,-8.660254e-01,-5.000000e-01,0.858648,0.512566
2,2018-03-02 02:00:00,-2.666667,55.000000,5.566667,338.000000,1.0,5.161323,-2.085310,-9.659258e-01,-2.588190e-01,0.859015,0.511951
3,2018-03-02 03:00:00,-3.000000,55.000000,5.000000,339.000000,1.0,4.667902,-1.791840,-1.000000e+00,-3.848660e-12,0.859382,0.511335
4,2018-03-02 04:00:00,-3.333333,56.666667,4.700000,343.666667,1.0,4.510317,-1.321758,-9.659258e-01,2.588190e-01,0.859748,0.510719
...,...,...,...,...,...,...,...,...,...,...,...,...
26299,2021-03-01 19:00:00,5.333333,83.333333,7.466667,20.333333,4.0,7.001396,2.594527,5.000000e-01,-8.660254e-01,0.858846,0.512234
26300,2021-03-01 20:00:00,4.666667,81.666667,7.333333,23.666667,4.0,6.716573,2.943710,2.588190e-01,-9.659258e-01,0.859213,0.511618
26301,2021-03-01 21:00:00,4.000000,80.000000,7.200000,27.000000,4.0,6.415247,3.268732,4.907520e-12,-1.000000e+00,0.859579,0.511002
26302,2021-03-01 22:00:00,4.000000,80.000000,7.200000,27.000000,4.0,6.415247,3.268732,-2.588190e-01,-9.659258e-01,0.859945,0.510386


In [6]:
ulsan_fcst.head()

Unnamed: 0,Forecast_time,Temperature,Humidity,WindSpeed,WindDirection,Cloud,Wind_X,Wind_Y,Day sin,Day cos,Year sin,Year cos
0,2018-03-02 00:00:00,0.0,40.0,1.5,318.0,2.0,1.114717,-1.003696,-0.707107,-0.7071068,0.85828,0.513182
1,2018-03-02 01:00:00,0.0,39.444444,1.511111,318.222222,1.888889,1.126888,-1.006768,-0.866025,-0.5,0.858648,0.512566
2,2018-03-02 02:00:00,0.0,38.888889,1.522222,318.444444,1.777778,1.139098,-1.00976,-0.965926,-0.258819,0.859015,0.511951
3,2018-03-02 03:00:00,0.0,38.333333,1.533333,318.666667,1.666667,1.151349,-1.012673,-1.0,-3.84866e-12,0.859382,0.511335
4,2018-03-02 04:00:00,0.0,37.777778,1.544444,318.888889,1.555556,1.16364,-1.015505,-0.965926,0.258819,0.859748,0.510719


기상 예보는 전날 14시(오후 2시) 예보를 선형보간하여 사용하였습니다.

※ 기상 예보 전치리 과정 : https://dacon.io/competitions/official/235720/codeshare/2499?page=1&dtype=recent

In [7]:
def transform_wind_to_vector(df_):
    df = df_.copy()

    wv = df['WindSpeed']

    # Convert to radians.
    wd_rad = df['WindDirection']*np.pi / 180

    # Calculate the wind x and y components.
    df['Wind_X'] = wv*np.cos(wd_rad)
    df['Wind_Y'] = wv*np.sin(wd_rad)
    
    return df

In [8]:
def transform_datetime_to_signal(df_):
    df = df_.copy()
    date_time = pd.to_datetime(df['Forecast_time'], format='%Y.%m.%d %H:%M')
    timestamp_s = date_time.map(datetime.datetime.timestamp)

    day = 24*60*60
    year = (365.2425)*day

    df['Day sin'] = np.sin(timestamp_s * (2 * np.pi / day))
    df['Day cos'] = np.cos(timestamp_s * (2 * np.pi / day))
    df['Year sin'] = np.sin(timestamp_s * (2 * np.pi / year))
    df['Year cos'] = np.cos(timestamp_s * (2 * np.pi / year))

    return df

In [9]:
dangjin_fcst = transform_wind_to_vector(dangjin_fcst)
dangjin_fcst = transform_datetime_to_signal(dangjin_fcst)

In [10]:
dangjin_fcst.head()

Unnamed: 0,Forecast_time,Temperature,Humidity,WindSpeed,WindDirection,Cloud,Wind_X,Wind_Y,Day sin,Day cos,Year sin,Year cos
0,2018-03-02 00:00:00,-2.0,55.0,6.7,336.0,1.0,6.120755,-2.725136,-0.707107,-0.7071068,0.85828,0.513182
1,2018-03-02 01:00:00,-2.333333,55.0,6.133333,337.0,1.0,5.645763,-2.396484,-0.866025,-0.5,0.858648,0.512566
2,2018-03-02 02:00:00,-2.666667,55.0,5.566667,338.0,1.0,5.161323,-2.08531,-0.965926,-0.258819,0.859015,0.511951
3,2018-03-02 03:00:00,-3.0,55.0,5.0,339.0,1.0,4.667902,-1.79184,-1.0,-3.84866e-12,0.859382,0.511335
4,2018-03-02 04:00:00,-3.333333,56.666667,4.7,343.666667,1.0,4.510317,-1.321758,-0.965926,0.258819,0.859748,0.510719


In [11]:
ulsan_fcst = transform_wind_to_vector(ulsan_fcst)
ulsan_fcst = transform_datetime_to_signal(ulsan_fcst)

In [12]:
ulsan_fcst.head()

Unnamed: 0,Forecast_time,Temperature,Humidity,WindSpeed,WindDirection,Cloud,Wind_X,Wind_Y,Day sin,Day cos,Year sin,Year cos
0,2018-03-02 00:00:00,0.0,40.0,1.5,318.0,2.0,1.114717,-1.003696,-0.707107,-0.7071068,0.85828,0.513182
1,2018-03-02 01:00:00,0.0,39.444444,1.511111,318.222222,1.888889,1.126888,-1.006768,-0.866025,-0.5,0.858648,0.512566
2,2018-03-02 02:00:00,0.0,38.888889,1.522222,318.444444,1.777778,1.139098,-1.00976,-0.965926,-0.258819,0.859015,0.511951
3,2018-03-02 03:00:00,0.0,38.333333,1.533333,318.666667,1.666667,1.151349,-1.012673,-1.0,-3.84866e-12,0.859382,0.511335
4,2018-03-02 04:00:00,0.0,37.777778,1.544444,318.888889,1.555556,1.16364,-1.015505,-0.965926,0.258819,0.859748,0.510719


In [13]:
dangjin_fcst

Unnamed: 0,Forecast_time,Temperature,Humidity,WindSpeed,WindDirection,Cloud,Wind_X,Wind_Y,Day sin,Day cos,Year sin,Year cos
0,2018-03-02 00:00:00,-2.000000,55.000000,6.700000,336.000000,1.0,6.120755,-2.725136,-7.071068e-01,-7.071068e-01,0.858280,0.513182
1,2018-03-02 01:00:00,-2.333333,55.000000,6.133333,337.000000,1.0,5.645763,-2.396484,-8.660254e-01,-5.000000e-01,0.858648,0.512566
2,2018-03-02 02:00:00,-2.666667,55.000000,5.566667,338.000000,1.0,5.161323,-2.085310,-9.659258e-01,-2.588190e-01,0.859015,0.511951
3,2018-03-02 03:00:00,-3.000000,55.000000,5.000000,339.000000,1.0,4.667902,-1.791840,-1.000000e+00,-3.848660e-12,0.859382,0.511335
4,2018-03-02 04:00:00,-3.333333,56.666667,4.700000,343.666667,1.0,4.510317,-1.321758,-9.659258e-01,2.588190e-01,0.859748,0.510719
...,...,...,...,...,...,...,...,...,...,...,...,...
26299,2021-03-01 19:00:00,5.333333,83.333333,7.466667,20.333333,4.0,7.001396,2.594527,5.000000e-01,-8.660254e-01,0.858846,0.512234
26300,2021-03-01 20:00:00,4.666667,81.666667,7.333333,23.666667,4.0,6.716573,2.943710,2.588190e-01,-9.659258e-01,0.859213,0.511618
26301,2021-03-01 21:00:00,4.000000,80.000000,7.200000,27.000000,4.0,6.415247,3.268732,4.907520e-12,-1.000000e+00,0.859579,0.511002
26302,2021-03-01 22:00:00,4.000000,80.000000,7.200000,27.000000,4.0,6.415247,3.268732,-2.588190e-01,-9.659258e-01,0.859945,0.510386


## 일사량 추가하기

In [14]:
def sind(degree):
    return np.sin(np.radians(degree))
def cosd(degree):
    return np.cos(np.radians(degree))
def arcsind(degree):
    return np.arcsin(np.radians(degree))
def arccosd(degree):
    return np.arccos(np.radians(degree))

In [15]:
def sunDeclination(day):                                    # 태양 적위
    return 23.45 * sind((360 / 365) * (day - 81))

In [16]:
def sunAltitude(latitude, declination, hour):               # 태양 고도
    l, d, H = latitude, declination, hour
    h = (sind(l) * sind(d)) + (cosd(l) * cosd(d) * cosd(H))
    h = np.degrees(np.arcsin(h))
    return h

In [17]:
def sunRiseSet(latitude, declination, longitude):           # 일출 일몰 시간
    lat, d, long = latitude, declination, longitude
    b = (-1 * sind(lat) * sind(d)) / (cosd(lat) * cosd(d))
    b = np.degrees(np.arccos(b)) / 15
    d = longitude / 15                                      # 경도에 따른 시간 보정 (GMT +9:00는 경도 135도) 
    r = 0.00625                                             # refraction_rate, 굴절에 따른 시간 보정 (약 9분)
    sunrise = 12 - b - d + r
    sunset = 12 + b - d - r
    return sunrise, sunset

In [18]:
def directInsolation(altitude):                             # 직사 일사량
    am = 1/cosd(90 - altitude)                              # airmass
    I = 1.353 * 0.7 ** (am ** 0.678)                        # direct insolation
    return I

In [19]:
tokyo_longitude = 135.0

dangjin_latitude = float(site.loc[site['Id'] == '당진태양광', 'Latitude'])
dangjin_longitude = float(site.loc[site['Id'] == '당진태양광', 'Longitude'])

ulsan_latitude = float(site.loc[site['Id'] == '울산태양광', 'Latitude'])
ulsan_longitude = float(site.loc[site['Id'] == '울산태양광', 'Longitude'])

dangjin_latitude, dangjin_longitude, ulsan_latitude, ulsan_longitude

(37.0507527, 126.5102993, 35.47765089999999, 129.380778)

In [20]:
dangjin_fcst

Unnamed: 0,Forecast_time,Temperature,Humidity,WindSpeed,WindDirection,Cloud,Wind_X,Wind_Y,Day sin,Day cos,Year sin,Year cos
0,2018-03-02 00:00:00,-2.000000,55.000000,6.700000,336.000000,1.0,6.120755,-2.725136,-7.071068e-01,-7.071068e-01,0.858280,0.513182
1,2018-03-02 01:00:00,-2.333333,55.000000,6.133333,337.000000,1.0,5.645763,-2.396484,-8.660254e-01,-5.000000e-01,0.858648,0.512566
2,2018-03-02 02:00:00,-2.666667,55.000000,5.566667,338.000000,1.0,5.161323,-2.085310,-9.659258e-01,-2.588190e-01,0.859015,0.511951
3,2018-03-02 03:00:00,-3.000000,55.000000,5.000000,339.000000,1.0,4.667902,-1.791840,-1.000000e+00,-3.848660e-12,0.859382,0.511335
4,2018-03-02 04:00:00,-3.333333,56.666667,4.700000,343.666667,1.0,4.510317,-1.321758,-9.659258e-01,2.588190e-01,0.859748,0.510719
...,...,...,...,...,...,...,...,...,...,...,...,...
26299,2021-03-01 19:00:00,5.333333,83.333333,7.466667,20.333333,4.0,7.001396,2.594527,5.000000e-01,-8.660254e-01,0.858846,0.512234
26300,2021-03-01 20:00:00,4.666667,81.666667,7.333333,23.666667,4.0,6.716573,2.943710,2.588190e-01,-9.659258e-01,0.859213,0.511618
26301,2021-03-01 21:00:00,4.000000,80.000000,7.200000,27.000000,4.0,6.415247,3.268732,4.907520e-12,-1.000000e+00,0.859579,0.511002
26302,2021-03-01 22:00:00,4.000000,80.000000,7.200000,27.000000,4.0,6.415247,3.268732,-2.588190e-01,-9.659258e-01,0.859945,0.510386


In [21]:
dangjin_insola = pd.DataFrame(data=[sunDeclination(i//24) for i in range(365 * 24 * 4 + 24)], columns=['declination'])

dangjin_insola['sunrise'], dangjin_insola['sunset'] = sunRiseSet(dangjin_latitude, dangjin_insola['declination'], dangjin_longitude - tokyo_longitude)

# (00시는 -180°, 06시는 -90°, 12시는 0°, 18시는 +90°, 24시는 +180°)
dangjin_insola['altitude'] = sunAltitude(dangjin_latitude, dangjin_insola['declination'], [(i % 24) * 15 - 180 for i in range(365 * 24 * 4 + 24)]) 
dangjin_insola.loc[dangjin_insola['altitude'] < 0, 'altitude'] = 0

dangjin_insola['Insolation'] = directInsolation(dangjin_insola['altitude'])

dangjin_insola['Forecast_time'] = pd.date_range('2018-01-01 00:00:00', '2021-12-31 23:00:00', freq='H')

start = '2018-03-02 00:00:00'
end = '2021-03-01 23:00:00'

start_idx = dangjin_insola[dangjin_insola['Forecast_time']==start].index[0]
end_idx = dangjin_insola[dangjin_insola['Forecast_time']==end].index[0]

dangjin_insola = dangjin_insola.loc[start_idx:end_idx, :].copy()
dangjin_insola.index = range(dangjin_insola.shape[0])

dangjin_insola['Forecast_time'] = dangjin_insola['Forecast_time'].astype(str)

dangjin_insola

Unnamed: 0,declination,sunrise,sunset,altitude,Insolation,Forecast_time
0,-8.293705,6.993442,18.138518,0.0,0.0,2018-03-02 00:00:00
1,-8.293705,6.993442,18.138518,0.0,0.0,2018-03-02 01:00:00
2,-8.293705,6.993442,18.138518,0.0,0.0,2018-03-02 02:00:00
3,-8.293705,6.993442,18.138518,0.0,0.0,2018-03-02 03:00:00
4,-8.293705,6.993442,18.138518,0.0,0.0,2018-03-02 04:00:00
...,...,...,...,...,...,...
26299,-8.293705,6.993442,18.138518,0.0,0.0,2021-03-01 19:00:00
26300,-8.293705,6.993442,18.138518,0.0,0.0,2021-03-01 20:00:00
26301,-8.293705,6.993442,18.138518,0.0,0.0,2021-03-01 21:00:00
26302,-8.293705,6.993442,18.138518,0.0,0.0,2021-03-01 22:00:00


In [22]:
ulsan_insola = pd.DataFrame(data=[sunDeclination(i//24) for i in range(365 * 24 * 4 + 24)], columns=['declination'])

ulsan_insola['sunrise'], dangjin_insola['sunset'] = sunRiseSet(ulsan_latitude, ulsan_insola['declination'], ulsan_longitude - tokyo_longitude)

# (00시는 -180°, 06시는 -90°, 12시는 0°, 18시는 +90°, 24시는 +180°)
ulsan_insola['altitude'] = sunAltitude(ulsan_latitude, ulsan_insola['declination'], [(i % 24) * 15 - 180 for i in range(365 * 24 * 4 + 24)]) 
ulsan_insola.loc[ulsan_insola['altitude'] < 0, 'altitude'] = 0

ulsan_insola['Insolation'] = directInsolation(ulsan_insola['altitude'])

ulsan_insola['Forecast_time'] = pd.date_range('2018-01-01 00:00:00', '2021-12-31 23:00:00', freq='H')

start = '2018-03-02 00:00:00'
end = '2021-03-01 23:00:00'

start_idx = ulsan_insola[ulsan_insola['Forecast_time']==start].index[0]
end_idx = ulsan_insola[ulsan_insola['Forecast_time']==end].index[0]

ulsan_insola = ulsan_insola.loc[start_idx:end_idx, :].copy()
ulsan_insola.index = range(ulsan_insola.shape[0])

ulsan_insola['Forecast_time'] = ulsan_insola['Forecast_time'].astype(str)

ulsan_insola

Unnamed: 0,declination,sunrise,altitude,Insolation,Forecast_time
0,-8.293705,6.778422,0.0,0.0,2018-03-02 00:00:00
1,-8.293705,6.778422,0.0,0.0,2018-03-02 01:00:00
2,-8.293705,6.778422,0.0,0.0,2018-03-02 02:00:00
3,-8.293705,6.778422,0.0,0.0,2018-03-02 03:00:00
4,-8.293705,6.778422,0.0,0.0,2018-03-02 04:00:00
...,...,...,...,...,...
26299,-8.293705,6.778422,0.0,0.0,2021-03-01 19:00:00
26300,-8.293705,6.778422,0.0,0.0,2021-03-01 20:00:00
26301,-8.293705,6.778422,0.0,0.0,2021-03-01 21:00:00
26302,-8.293705,6.778422,0.0,0.0,2021-03-01 22:00:00


In [23]:
dangjin_fcst = pd.merge(dangjin_fcst, dangjin_insola, on='Forecast_time', how='outer')
ulsan_fcst = pd.merge(ulsan_fcst, ulsan_insola, on='Forecast_time', how='outer')
# dangjin_fcst
# ulsan_fcst

In [24]:
dangjin_fcst

Unnamed: 0,Forecast_time,Temperature,Humidity,WindSpeed,WindDirection,Cloud,Wind_X,Wind_Y,Day sin,Day cos,Year sin,Year cos,declination,sunrise,sunset,altitude,Insolation
0,2018-03-02 00:00:00,-2.000000,55.000000,6.700000,336.000000,1.0,6.120755,-2.725136,-7.071068e-01,-7.071068e-01,0.858280,0.513182,-8.293705,6.993442,17.189351,0.0,0.0
1,2018-03-02 01:00:00,-2.333333,55.000000,6.133333,337.000000,1.0,5.645763,-2.396484,-8.660254e-01,-5.000000e-01,0.858648,0.512566,-8.293705,6.993442,17.189351,0.0,0.0
2,2018-03-02 02:00:00,-2.666667,55.000000,5.566667,338.000000,1.0,5.161323,-2.085310,-9.659258e-01,-2.588190e-01,0.859015,0.511951,-8.293705,6.993442,17.189351,0.0,0.0
3,2018-03-02 03:00:00,-3.000000,55.000000,5.000000,339.000000,1.0,4.667902,-1.791840,-1.000000e+00,-3.848660e-12,0.859382,0.511335,-8.293705,6.993442,17.189351,0.0,0.0
4,2018-03-02 04:00:00,-3.333333,56.666667,4.700000,343.666667,1.0,4.510317,-1.321758,-9.659258e-01,2.588190e-01,0.859748,0.510719,-8.293705,6.993442,17.189351,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26299,2021-03-01 19:00:00,5.333333,83.333333,7.466667,20.333333,4.0,7.001396,2.594527,5.000000e-01,-8.660254e-01,0.858846,0.512234,-8.293705,6.993442,17.189351,0.0,0.0
26300,2021-03-01 20:00:00,4.666667,81.666667,7.333333,23.666667,4.0,6.716573,2.943710,2.588190e-01,-9.659258e-01,0.859213,0.511618,-8.293705,6.993442,17.189351,0.0,0.0
26301,2021-03-01 21:00:00,4.000000,80.000000,7.200000,27.000000,4.0,6.415247,3.268732,4.907520e-12,-1.000000e+00,0.859579,0.511002,-8.293705,6.993442,17.189351,0.0,0.0
26302,2021-03-01 22:00:00,4.000000,80.000000,7.200000,27.000000,4.0,6.415247,3.268732,-2.588190e-01,-9.659258e-01,0.859945,0.510386,-8.293705,6.993442,17.189351,0.0,0.0


In [25]:
ulsan_fcst

Unnamed: 0,Forecast_time,Temperature,Humidity,WindSpeed,WindDirection,Cloud,Wind_X,Wind_Y,Day sin,Day cos,Year sin,Year cos,declination,sunrise,altitude,Insolation
0,2018-03-02 00:00:00,0.0,40.000000,1.500000,318.000000,2.000000,1.114717,-1.003696,-0.707107,-7.071068e-01,0.858280,0.513182,-8.293705,6.778422,0.0,0.0
1,2018-03-02 01:00:00,0.0,39.444444,1.511111,318.222222,1.888889,1.126888,-1.006768,-0.866025,-5.000000e-01,0.858648,0.512566,-8.293705,6.778422,0.0,0.0
2,2018-03-02 02:00:00,0.0,38.888889,1.522222,318.444444,1.777778,1.139098,-1.009760,-0.965926,-2.588190e-01,0.859015,0.511951,-8.293705,6.778422,0.0,0.0
3,2018-03-02 03:00:00,0.0,38.333333,1.533333,318.666667,1.666667,1.151349,-1.012673,-1.000000,-3.848660e-12,0.859382,0.511335,-8.293705,6.778422,0.0,0.0
4,2018-03-02 04:00:00,0.0,37.777778,1.544444,318.888889,1.555556,1.163640,-1.015505,-0.965926,2.588190e-01,0.859748,0.510719,-8.293705,6.778422,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26302,2021-03-01 22:00:00,12.0,80.000000,4.233333,27.000000,4.000000,3.771928,1.921893,-0.258819,-9.659258e-01,0.859945,0.510386,-8.293705,6.778422,0.0,0.0
26303,2021-03-01 23:00:00,10.0,80.000000,4.666667,22.000000,4.000000,4.326858,1.748164,-0.500000,-8.660254e-01,0.860311,0.509769,-8.293705,6.778422,0.0,0.0
26304,2021-03-02 00:00:00,8.0,80.000000,5.100000,17.000000,4.000000,4.877154,1.491096,-0.707107,-7.071068e-01,0.860676,0.509153,,,,
26305,2021-03-02 03:00:00,6.0,80.000000,7.200000,10.000000,4.000000,7.090616,1.250267,-1.000000,-7.470864e-12,0.861769,0.507301,,,,


In [26]:
dangjin_fcst.to_csv("data/add_dangjin_wind_time_insolation.csv", index=False)
ulsan_fcst.to_csv("data/add_ulsan_wind_time_insolation.csv", index=False)

## 학습 데이터 전처리

In [27]:
def make_train_data(energy_df, fcst_df, target):
    # 일기 예보 있는 날짜만 선택
    energy = energy_df.loc[24:]
    energy.index = range(energy.shape[0])
    
    # 발전량 데이터가 있는 날짜만 선택
    fcst = fcst_df.loc[:25608-1]
    fcst.index = range(fcst.shape[0])
    
    # 발전량과 일기예보 연결
    concat_df = pd.concat([energy, fcst], axis=1)
    
    # 예보 시간 및 날짜 정보 feature로 추가
    concat_df['date'] = concat_df['Forecast_time'].str.split(' ').str[0]
    concat_df['hour'] = concat_df['Forecast_time'].str.split(' ').str[1].str.split(':').str[0].astype(int)
    
    concat_df['year'] = concat_df['date'].str.split('-').str[0].astype(int)
    concat_df['month'] = concat_df['date'].str.split('-').str[1].astype(int)
    concat_df['day'] = concat_df['date'].str.split('-').str[2].astype(int)
    
    # 예보 시간, 날짜, 기상 예보 및 발전량 선택
    feature_df = concat_df[[#'year', 'month', 'day', 'hour', 
                            'Year sin', 'Year cos', 'Day sin', 'Day cos',
                            'Temperature', 'Humidity', 
                            #'WindSpeed', 'WindDirection',
                            'Wind_X', 'Wind_Y',
                            'Insolation',
                            'Cloud', target]]


    x_df = train_df.loc[:, :'Cloud'].to_numpy()
    y_df = train_df[target].to_numpy()

    return x_df, y_df

In [28]:
def train_dataset(energy_df, fcst_df, target):
    # 일기 예보 있는 날짜만 선택
    energy = energy_df.loc[24:]
    energy.index = range(energy.shape[0])
    
    # 발전량 데이터가 있는 날짜만 선택
    fcst = fcst_df.loc[:25608-1]
    fcst.index = range(fcst.shape[0])
    
    # 발전량과 일기예보 연결
    concat_df = pd.concat([energy, fcst], axis=1)
    
    # 예보 시간 및 날짜 정보 feature로 추가
    concat_df['date'] = concat_df['Forecast_time'].str.split(' ').str[0]
    concat_df['hour'] = concat_df['Forecast_time'].str.split(' ').str[1].str.split(':').str[0].astype(int)
    
    concat_df['year'] = concat_df['date'].str.split('-').str[0].astype(int)
    concat_df['month'] = concat_df['date'].str.split('-').str[1].astype(int)
    concat_df['day'] = concat_df['date'].str.split('-').str[2].astype(int)
    
    # 예보 시간, 날짜, 기상 예보 및 발전량 선택
    feature_df = concat_df[[#'year', 'month', 'day', 'hour', 
                            'Year sin', 'Year cos', 'Day sin', 'Day cos',
                            'Temperature', 'Humidity', 
                            #'WindSpeed', 'WindDirection',
                            'Wind_X', 'Wind_Y',
                            'Insolation',
                            'Cloud', target]]

    # 마지막 30일을 검증데이터셋으로 나머지를 학습 데이터셋으로 선택
    train_df = feature_df.iloc[:-24*30]
    val_df = feature_df.iloc[-24*30:]
    
    # 발전량이 0인 데이터를 제외
    train_df = train_df[train_df[target]!=0]
    
    #train_x = train_df.loc[:, 'year':'Cloud'].to_numpy()
    train_x = train_df.loc[:, :'Cloud'].to_numpy()
    train_y = train_df[target].to_numpy()
    
    #val_x = val_df.loc[:, 'year':'Cloud'].to_numpy()
    val_x = val_df.loc[:, :'Cloud'].to_numpy()
    val_y = val_df[target].to_numpy()
    
    return train_x, train_y, val_x, val_y

## LightGBM Custom Metric

In [29]:
def nmae_10(y_pred, dataset):
    y_true = dataset.get_label()
    
    absolute_error = abs(y_true - y_pred)
    absolute_error /= capacity
    
    target_idx = np.where(y_true>=capacity*0.1)
    
    nmae = 100 * absolute_error[target_idx].mean()
    
    return 'score', nmae, False

## Validation Metric

In [30]:
def sola_nmae(answer, pred):
    absolute_error = np.abs(answer - pred)
    
    absolute_error /= capacity
    
    target_idx = np.where(answer>=capacity*0.1)
    
    nmae = 100 * absolute_error[target_idx].mean()
    
    return nmae

## LightGBM Hyperparameter

In [31]:
# params = {
#     'seed':42,
#     'boosting': 'gbdt',
#     'objective': 'regression',
#     'metric':'mae',
#     'num_iterations': 10000,
#     'learning_rate': 0.01,
# #     'max_depth': -1, # 최대 깊이 트리를 만듬
#     'max_depth': 16,
#     'num_leaves': 39000,
#     'bagging_freq': 2,
#     'bagging_fraction': 0.7,
#     'feature_fraction': 0.8,
# }

In [32]:
# 2021년 1월 데이터를 테스트 데이터로
def preprocess_df(df_, test_length=31*24+23):
    target_columns = []
    df = df_copy()
    n = len(df)
    train_mean = df[0:int(n*0.9)][target_columns].mean()
    train_std = df[0:int(n*0.9)][target_columns].std()
    
    df[target_columns] = (df[target_columns] - train_mean) / train_std
    
    train_df = df[0:n-test_length]
    test_df = df[n-test_length:]
    
    return train_mean, train_std, train_df, test_df

In [33]:
def custom_nmae(answer_energy, submission_energy, site_name, train_mean, train_std):
    # 발전소 발전용량
    capacity = {
        'dangjin_floating':1000, # 당진수상태양광 발전용량
        'dnagjin_warehouse':700, # 당진자재창고태양광 발전용량
        'dangjin':1000, # 당진태양광 발전용량
        'ulsan':500 # 울산태양광 발전용량
    }
    
    sum_answer = answer_energy * train_std['energy'] + train_mean['energy']
    sum_submission = submission_energy * train_std['energy'] + train_mean['energy']
    
    # 총 발전용량 절대오차
    absolute_error = abs((sum_answer - sum_submission))
    
    # 발전용량으로 정규화
    total_capacity = capacity[site_name]
    absolute_error /= total_capacity
    
    # 총 발전용량의 10% 이상 발전한 데이터 인덱스 추출
    # NMAE(%)
    nmae = 100 * absolute_error[sum_answer>=total_capacity*0.1].mean()
    
    return nmae

## Optuna Hyperparameter Setting

In [34]:
# # 기본 패러미터
# params_default = {
#         'seed':42,
#         'boosting': 'gbdt',
#         'objective': 'regression',
#         'metric':'mae',
#         'num_iterations': 10000,
#         'learning_rate': 0.01,
# }

In [35]:
# 기본 패러미터
params_default = {
    'seed':42,
    'boosting': 'gbdt',
    'objective': 'regression',
    'metric':'mae',
    'num_iterations': 10000,
    'learning_rate': 0.09,
    # 'max_depth': 16,
    # 'num_leaves': 39000,
    # 'bagging_freq': 2,
    # 'bagging_fraction': 0.7,
    # 'feature_fraction': 0.8,
}

In [36]:
def objectiveLGBM(trial: Trial, energy, fcst, target):
    params = {
        'seed':42,
        'boosting': 'gbdt',
        'objective': 'regression',
        'metric':'mae',
        'num_iterations': 10000,
        'learning_rate': 0.09,
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        'num_leaves': trial.suggest_int('num_leaves', 2, 65535),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 200),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 10.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 5),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 1.5),
        'max_bin': trial.suggest_int('max_bin', 1, 1000)
    }

    train_x, train_y, val_x, val_y = train_dataset(energy, fcst, target)
    train_ds = lgb.Dataset(train_x, train_y)
    val_ds = lgb.Dataset(val_x, val_y)

    # model = LGBMClassifier(**params)
    # model.fit(train_ds, val_dataset, verbose=False)

    model = lgb.train(params, train_ds, valid_sets=val_ds, early_stopping_rounds=50, feval=nmae_10, verbose_eval=10)
    # model = lgb.train(params, train_ds, val_ds, verbose_eval=False)

    pred = model.predict(val_x)
    score = sola_nmae(val_y, pred)

    return score

In [37]:
# 패러미터 그래프가 궁금할때 쓰는 코드들
# # 하이퍼파라미터 최적화 과정을 확인
# optuna.visualization.plot_optimization_history(study)

# # 하이퍼파라미터별 중요도
# optuna.visualization.plot_param_importances(study)

# # 파라미터들관의 관계
# optuna.visualization.plot_parallel_coordinate(study)

In [38]:
def paramsTuning(energy, fcst, target):
    sampler = TPESampler()
    study = optuna.create_study(
        study_name="lgbm_parameter_opt",
        direction="minimize",
        sampler=sampler,
    )
    study.optimize(lambda trial: objectiveLGBM(trial, energy, dangjin_fcst, target='dangjin_floating'), n_trials=20)
    print()
    print("Best Score:", study.best_value)
    print("Best trial:", study.best_trial.params)
    return study

### 당진수상태양광 예측 모델 학습

In [39]:
capacity = 1000

study = paramsTuning(energy, dangjin_fcst, target='dangjin_floating')

[32m[I 2021-06-25 18:10:52,008][0m A new study created in memory with name: lgbm_parameter_opt[0m


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5211
[LightGBM] [Info] Number of data points in the train set: 12378, number of used features: 10
[LightGBM] [Info] Start training from score 247.345048
Training until validation scores don't improve for 50 rounds
[10]	valid_0's l1: 104.292	valid_0's score: 9.80344
[20]	valid_0's l1: 60.9957	valid_0's score: 8.88068
[30]	valid_0's l1: 43.4617	valid_0's score: 8.69008
[40]	valid_0's l1: 36.2634	valid_0's score: 8.49088
[50]	valid_0's l1: 34.6975	valid_0's score: 8.4695
[60]	valid_0's l1: 34.5679	valid_0's score: 8.39887
[70]	valid_0's l1: 34.732	valid_0's score: 8.44275
[80]	valid_0's l1: 35.5004	valid_0's score: 8.44272
[90]	valid_0's l1: 36.0871	valid_0's score: 8.47822
[100]	valid_0's l1: 36.3355	valid_0's score: 8.4706
Early stopping, best iteration is:
[56]	valid_0's l1: 34.3296	valid_0's score: 8.44814


[32m[I 2021-06-25 18:10:54,927][0m Trial 0 finished with value: 8.448135265236372 and parameters: {'max_depth': 47, 'num_leaves': 18190, 'min_data_in_leaf': 52, 'lambda_l1': 1.7202575625607563, 'lambda_l2': 7.985872875181379, 'bagging_freq': 2, 'bagging_fraction': 0.9412082247679963, 'feature_fraction': 0.9811385106558971, 'scale_pos_weight': 1.4254341953226353, 'max_bin': 715}. Best is trial 0 with value: 8.448135265236372.[0m
[32m[I 2021-06-25 18:10:55,404][0m Trial 1 finished with value: 7.998997471124417 and parameters: {'max_depth': 10, 'num_leaves': 26386, 'min_data_in_leaf': 62, 'lambda_l1': 2.5973165331323966, 'lambda_l2': 4.269743206075555, 'bagging_freq': 5, 'bagging_fraction': 0.20762098762230163, 'feature_fraction': 0.8436468499606081, 'scale_pos_weight': 1.4237762727367373, 'max_bin': 455}. Best is trial 1 with value: 7.998997471124417.[0m


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3385
[LightGBM] [Info] Number of data points in the train set: 12378, number of used features: 10
[LightGBM] [Info] Start training from score 247.345048
Training until validation scores don't improve for 50 rounds
[10]	valid_0's l1: 109.862	valid_0's score: 9.7957
[20]	valid_0's l1: 64.2103	valid_0's score: 8.36684
[30]	valid_0's l1: 44.2717	valid_0's score: 8.16292
[40]	valid_0's l1: 36.0781	valid_0's score: 8.21853
[50]	valid_0's l1: 34.9128	valid_0's score: 8.04496
[60]	valid_0's l1: 35.5498	valid_0's score: 8.1074
[70]	valid_0's l1: 37.1606	valid_0's score: 8.17052
[80]	valid_0's l1: 36.5623	valid_0's score: 8.24332
[90]	valid_0's l1: 37.3138	valid_0's score: 8.2313
Early stopping, best iteration is:
[46]	valid_0's l1: 34.5639	valid_0's score: 7.999




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3926
[LightGBM] [Info] Number of data points in the train set: 12378, number of used features: 10
[LightGBM] [Info] Start training from score 247.345048
Training until validation scores don't improve for 50 rounds
[10]	valid_0's l1: 150.352	valid_0's score: 11.369
[20]	valid_0's l1: 119.257	valid_0's score: 10.2968
[30]	valid_0's l1: 95.3361	valid_0's score: 9.35056
[40]	valid_0's l1: 70.841	valid_0's score: 8.79338
[50]	valid_0's l1: 63.6344	valid_0's score: 8.59648
[60]	valid_0's l1: 64.6293	valid_0's score: 8.57771
[70]	valid_0's l1: 64.5147	valid_0's score: 8.49029
[80]	valid_0's l1: 65.9207	valid_0's score: 8.57212
[90]	valid_0's l1: 67.693	valid_0's score: 8.59177
Early stopping, best iteration is:
[48]	valid_0's l1: 63.3155	valid_0's score: 8.64094


[32m[I 2021-06-25 18:11:00,677][0m Trial 2 finished with value: 8.640939209482678 and parameters: {'max_depth': 40, 'num_leaves': 32153, 'min_data_in_leaf': 59, 'lambda_l1': 8.758902054674953, 'lambda_l2': 7.580934225781113, 'bagging_freq': 4, 'bagging_fraction': 0.4787781038049873, 'feature_fraction': 0.2209992786441183, 'scale_pos_weight': 1.0331545532914297, 'max_bin': 532}. Best is trial 1 with value: 7.998997471124417.[0m


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4164
[LightGBM] [Info] Number of data points in the train set: 12378, number of used features: 10
[LightGBM] [Info] Start training from score 247.345048
Training until validation scores don't improve for 50 rounds
[10]	valid_0's l1: 112.386	valid_0's score: 9.91231
[20]	valid_0's l1: 64.3659	valid_0's score: 8.71375
[30]	valid_0's l1: 47.0149	valid_0's score: 8.46665
[40]	valid_0's l1: 38.2781	valid_0's score: 8.56967
[50]	valid_0's l1: 37.4906	valid_0's score: 8.57914
[60]	valid_0's l1: 37.5077	valid_0's score: 8.66921
[70]	valid_0's l1: 37.7829	valid_0's score: 8.43277
[80]	valid_0's l1: 38.6162	valid_0's score: 8.52255
[90]	valid_0's l1: 38.8344	valid_0's score: 8.50841
[100]	valid_0's l1: 38.9567	valid_0's score: 8.40442
Early stopping, best iteration is:
[56]	valid_0's l1: 37.2714	valid_0's score: 8.62983


[32m[I 2021-06-25 18:11:13,157][0m Trial 3 finished with value: 8.629829075082052 and parameters: {'max_depth': 17, 'num_leaves': 51446, 'min_data_in_leaf': 147, 'lambda_l1': 5.616300871915342, 'lambda_l2': 2.7631682826386905, 'bagging_freq': 2, 'bagging_fraction': 0.2486614862713453, 'feature_fraction': 0.7515666354218559, 'scale_pos_weight': 1.0091138230124652, 'max_bin': 566}. Best is trial 1 with value: 7.998997471124417.[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 852
[LightGBM] [Info] Number of data points in the train set: 12378, number of used features: 10
[LightGBM] [Info] Start training from score 247.345048
Training until validation scores don't improve for 50 rounds
[10]	valid_0's l1: 151.376	valid_0's score: 11.4957
[20]	valid_0's l1: 119.027	valid_0's score: 10.3986
[30]	valid_0's l1: 95.7774	valid_0's score: 9.56629
[40]	valid_0's l1: 71.0213	valid_0's score: 9.00197
[50]	valid_0's l1: 63.6992	valid_0's score: 8.81407
[60]	valid_0's l1: 64.785	valid_0's score: 8.83716
[70]	valid_0's l1: 66.094	valid_0's score: 8.77531
[80]	valid_0's l1: 67.065	valid_0's score: 8.79808
[90]	valid_0's l1: 69.1895	valid_0's score: 8.87237
Early stopping, best iteration is:
[48]	valid_0's l1: 63.5521	valid_0's score: 8.87549


[32m[I 2021-06-25 18:11:14,801][0m Trial 4 finished with value: 8.875485326714665 and parameters: {'max_depth': 30, 'num_leaves': 50872, 'min_data_in_leaf': 38, 'lambda_l1': 7.557774494092099, 'lambda_l2': 5.575845237323037, 'bagging_freq': 3, 'bagging_fraction': 0.4388368651220431, 'feature_fraction': 0.16136941016015702, 'scale_pos_weight': 1.2977095996204535, 'max_bin': 98}. Best is trial 1 with value: 7.998997471124417.[0m


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6240
[LightGBM] [Info] Number of data points in the train set: 12378, number of used features: 10
[LightGBM] [Info] Start training from score 247.345048
Training until validation scores don't improve for 50 rounds
[10]	valid_0's l1: 103.898	valid_0's score: 9.60341
[20]	valid_0's l1: 60.5087	valid_0's score: 8.85678
[30]	valid_0's l1: 44.0289	valid_0's score: 8.59739
[40]	valid_0's l1: 38.4769	valid_0's score: 8.67593
[50]	valid_0's l1: 37.7191	valid_0's score: 8.64923
[60]	valid_0's l1: 37.0468	valid_0's score: 8.50326
[70]	valid_0's l1: 37.0939	valid_0's score: 8.42438
[80]	valid_0's l1: 37.5645	valid_0's score: 8.25135
[90]	valid_0's l1: 38.0393	valid_0's score: 8.38038
[100]	valid_0's l1: 38.7071	valid_0's score: 8.38798
[110]	valid_0's l1: 38.6786	valid_0's score: 8.48166


Early stopping, best iteration is:
[63]	valid_0's l1: 36.9418	valid_0's score: 8.4784


[32m[I 2021-06-25 18:11:20,225][0m Trial 5 finished with value: 8.47840030583389 and parameters: {'max_depth': 24, 'num_leaves': 27525, 'min_data_in_leaf': 76, 'lambda_l1': 4.723907363372578, 'lambda_l2': 6.694608536927452, 'bagging_freq': 4, 'bagging_fraction': 0.8245781979064247, 'feature_fraction': 0.9806987113690147, 'scale_pos_weight': 1.0167874157844148, 'max_bin': 862}. Best is trial 1 with value: 7.998997471124417.[0m


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2589
[LightGBM] [Info] Number of data points in the train set: 12378, number of used features: 10
[LightGBM] [Info] Start training from score 247.345048
Training until validation scores don't improve for 50 rounds
[10]	valid_0's l1: 126.02	valid_0's score: 10.4368
[20]	valid_0's l1: 86.4386	valid_0's score: 9.37127
[30]	valid_0's l1: 64.8623	valid_0's score: 9.04347
[40]	valid_0's l1: 57.7717	valid_0's score: 8.80374
[50]	valid_0's l1: 55.9211	valid_0's score: 8.74819
[60]	valid_0's l1: 56.7141	valid_0's score: 8.78247
[70]	valid_0's l1: 56.9066	valid_0's score: 8.75641
[80]	valid_0's l1: 57.3928	valid_0's score: 8.75809
[90]	valid_0's l1: 57.4572	valid_0's score: 8.76635
[100]	valid_0's l1: 57.6446	valid_0's score: 8.76394
Early stopping, best iteration is:
[53]	valid_0's l1: 55.708	valid_0's score: 8.71774


[32m[I 2021-06-25 18:11:21,488][0m Trial 6 finished with value: 8.717741966886612 and parameters: {'max_depth': 27, 'num_leaves': 5490, 'min_data_in_leaf': 50, 'lambda_l1': 9.438445291866746, 'lambda_l2': 2.0728414388941774, 'bagging_freq': 4, 'bagging_fraction': 0.9949040361663629, 'feature_fraction': 0.32839466680336127, 'scale_pos_weight': 1.1229727584128615, 'max_bin': 342}. Best is trial 1 with value: 7.998997471124417.[0m


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4472
[LightGBM] [Info] Number of data points in the train set: 12378, number of used features: 10
[LightGBM] [Info] Start training from score 247.345048
Training until validation scores don't improve for 50 rounds
[10]	valid_0's l1: 144.759	valid_0's score: 11.4625
[20]	valid_0's l1: 120.772	valid_0's score: 11.2037
[30]	valid_0's l1: 107.522	valid_0's score: 10.2391
[40]	valid_0's l1: 90.9423	valid_0's score: 10.1803
[50]	valid_0's l1: 81.2366	valid_0's score: 9.68156
[60]	valid_0's l1: 79.1652	valid_0's score: 9.12224
[70]	valid_0's l1: 77.7929	valid_0's score: 9.07864
[80]	valid_0's l1: 71.9962	valid_0's score: 8.87076
[90]	valid_0's l1: 69.0798	valid_0's score: 8.95597
[100]	valid_0's l1: 69.4997	valid_0's score: 8.81956

[32m[I 2021-06-25 18:11:22,070][0m Trial 7 finished with value: 8.961789561630763 and parameters: {'max_depth': 9, 'num_leaves': 16024, 'min_data_in_leaf': 180, 'lambda_l1': 3.982953877918538, 'lambda_l2': 5.858311907636219, 'bagging_freq': 0, 'bagging_fraction': 0.537173098411144, 'feature_fraction': 0.13028931929088672, 'scale_pos_weight': 1.307319957913615, 'max_bin': 610}. Best is trial 1 with value: 7.998997471124417.[0m



[110]	valid_0's l1: 70.6353	valid_0's score: 8.70349
[120]	valid_0's l1: 70.2179	valid_0's score: 8.70334
[130]	valid_0's l1: 70.8231	valid_0's score: 8.68057
[140]	valid_0's l1: 71.1589	valid_0's score: 8.60977
Early stopping, best iteration is:
[91]	valid_0's l1: 68.458	valid_0's score: 8.96179




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6779
[LightGBM] [Info] Number of data points in the train set: 12378, number of used features: 10
[LightGBM] [Info] Start training from score 247.345048
Training until validation scores don't improve for 50 rounds
[10]	valid_0's l1: 149.818	valid_0's score: 11.5585
[20]	valid_0's l1: 119.072	valid_0's score: 10.4545
[30]	valid_0's l1: 94.4432	valid_0's score: 9.59667
[40]	valid_0's l1: 70.1307	valid_0's score: 8.99295
[50]	valid_0's l1: 62.7086	valid_0's score: 8.8293
[60]	valid_0's l1: 63.4629	valid_0's score: 8.80589
[70]	valid_0's l1: 64.5016	valid_0's score: 8.72879
[80]	valid_0's l1: 65.4132	valid_0's score: 8.74693

[32m[I 2021-06-25 18:11:22,630][0m Trial 8 finished with value: 8.839727102224007 and parameters: {'max_depth': 7, 'num_leaves': 64289, 'min_data_in_leaf': 43, 'lambda_l1': 6.990746792848432, 'lambda_l2': 9.434738190412423, 'bagging_freq': 1, 'bagging_fraction': 0.69506038195843, 'feature_fraction': 0.21187153761782995, 'scale_pos_weight': 1.470520156689057, 'max_bin': 939}. Best is trial 1 with value: 7.998997471124417.[0m



[90]	valid_0's l1: 67.5075	valid_0's score: 8.82106
Early stopping, best iteration is:
[48]	valid_0's l1: 62.4721	valid_0's score: 8.83973




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5071


KeyboardInterrupt: 

Number of data points in the train set: 12378, number of used features: 10
[LightGBM] [Info] Start training from score 247.345048
Training until validation scores don't improve for 50 rounds
[10]	valid_0's l1: 129.206	valid_0's score: 9.80486
[20]	valid_0's l1: 79.783	valid_0's score: 8.67201
[30]	valid_0's l1: 54.5694	valid_0's score: 8.12441
[40]	valid_0's l1: 45.8255	valid_0's score: 7.98183
[50]	valid_0's l1: 44.6306	valid_0's score: 8.09739
[60]	valid_0's l1: 44.6517	valid_0's score: 7.95967
[70]	valid_0's l1: 45.3161	valid_0's score: 7.94419
[80]	valid_0's l1: 46.3886	valid_0's score: 8.12
[90]	valid_0's l1: 47.8837	valid_0's score: 8.24485
[100]	valid_0's l1: 47.5527	valid_0's score: 8.13465
Early stopping, best iteration is:
[52]	valid_0's l1: 44.4779	valid_0's score: 8.04092


[32m[I 2021-06-25 18:11:30,261][0m Trial 9 finished with value: 8.040917725476985 and parameters: {'max_depth': 28, 'num_leaves': 34114, 'min_data_in_leaf': 71, 'lambda_l1': 1.653218420764344, 'lambda_l2': 2.435265055916862, 'bagging_freq': 3, 'bagging_fraction': 0.22325772956597156, 'feature_fraction': 0.3628407091516196, 'scale_pos_weight': 1.0648278545311094, 'max_bin': 695}. Best is trial 1 with value: 7.998997471124417.[0m
[32m[I 2021-06-25 18:11:30,708][0m Trial 10 finished with value: 8.52452080040508 and parameters: {'max_depth': 15, 'num_leaves': 978, 'min_data_in_leaf': 121, 'lambda_l1': 0.164168115659709, 'lambda_l2': 0.35518947710249016, 'bagging_freq': 5, 'bagging_fraction': 0.11240117457487209, 'feature_fraction': 0.7177304063285554, 'scale_pos_weight': 1.3904250528122912, 'max_bin': 281}. Best is trial 1 with value: 7.998997471124417.[0m


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2159
[LightGBM] [Info] Number of data points in the train set: 12378, number of used features: 10
[LightGBM] [Info] Start training from score 247.345048
Training until validation scores don't improve for 50 rounds
[10]	valid_0's l1: 117.298	valid_0's score: 10.4458
[20]	valid_0's l1: 71.334	valid_0's score: 9.25997
[30]	valid_0's l1: 50.3186	valid_0's score: 8.93978
[40]	valid_0's l1: 43.3758	valid_0's score: 8.69896
[50]	valid_0's l1: 42.0709	valid_0's score: 8.59269
[60]	valid_0's l1: 43.644	valid_0's score: 8.48943
[70]	valid_0's l1: 44.7307	valid_0's score: 8.79231
[80]	valid_0's l1: 44.9574	valid_0's score: 8.86259
[90]	valid_0's l1: 45.3871	valid_0's score: 8.8089
Early stopping, best iteration is:
[47]	valid_0's l1: 41.8848	valid_0's score: 8.52452




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2561
[LightGBM] [Info] Number of data points in the train set: 12378, number of used features: 10
[LightGBM] [Info] Start training from score 247.345048
Training until validation scores don't improve for 50 rounds
[10]	valid_0's l1: 118.707	valid_0's score: 10.2003
[20]	valid_0's l1: 71.6693	valid_0's score: 8.85462
[30]	valid_0's l1: 48.1556	valid_0's score: 8.40918
[40]	valid_0's l1: 39.9213	valid_0's score: 8.20583
[50]	valid_0's l1: 39.6759	valid_0's score: 7.97894
[60]	valid_0's l1: 40.253	valid_0's score: 8.02002
[70]	valid_0's l1: 42.7278	valid_0's score: 8.15128
[80]	valid_0's l1: 42.9315	valid_0's score: 8.38377
[90]	valid_0's l1: 42.8838	valid_0's score: 8.41102
Early stopping, best iteration is:
[45]	valid_0's l1: 39.0163	valid_0's score: 7.98008


[32m[I 2021-06-25 18:11:40,402][0m Trial 11 finished with value: 7.9800780102777065 and parameters: {'max_depth': 35, 'num_leaves': 44050, 'min_data_in_leaf': 90, 'lambda_l1': 2.223657279042289, 'lambda_l2': 3.4494128948290688, 'bagging_freq': 5, 'bagging_fraction': 0.25505389680246005, 'feature_fraction': 0.46650360691759557, 'scale_pos_weight': 1.1645783359349124, 'max_bin': 338}. Best is trial 11 with value: 7.9800780102777065.[0m


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2673


KeyboardInterrupt: 

Number of data points in the train set: 12378, number of used features: 10
[LightGBM] [Info] Start training from score 247.345048
Training until validation scores don't improve for 50 rounds
[10]	valid_0's l1: 112.628	valid_0's score: 9.86969
[20]	valid_0's l1: 68.5165	valid_0's score: 8.64618
[30]	valid_0's l1: 44.0703	valid_0's score: 8.22859
[40]	valid_0's l1: 37.1123	valid_0's score: 8.11819
[50]	valid_0's l1: 37.7914	valid_0's score: 8.01639
[60]	valid_0's l1: 38.8616	valid_0's score: 8.07937
[70]	valid_0's l1: 41.336	valid_0's score: 8.20854
[80]	valid_0's l1: 41.7065	valid_0's score: 8.31929
[90]	valid_0's l1: 42.6248	valid_0's score: 8.26562
Early stopping, best iteration is:
[43]	valid_0's l1: 36.4281	valid_0's score: 7.97077


[32m[I 2021-06-25 18:11:49,604][0m Trial 12 finished with value: 7.970770912056412 and parameters: {'max_depth': 37, 'num_leaves': 44151, 'min_data_in_leaf': 103, 'lambda_l1': 2.5920865455502464, 'lambda_l2': 4.252824937144327, 'bagging_freq': 5, 'bagging_fraction': 0.3182227575676504, 'feature_fraction': 0.5707244594428287, 'scale_pos_weight': 1.1886166610062943, 'max_bin': 354}. Best is trial 12 with value: 7.970770912056412.[0m


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 969
[LightGBM] [Info] Number of data points in the train set: 12378, number of used features: 10
[LightGBM] [Info] Start training from score 247.345048
Training until validation scores don't improve for 50 rounds
[10]	valid_0's l1: 118.013	valid_0's score: 10.0193
[20]	valid_0's l1: 69.4594	valid_0's score: 8.68071
[30]	valid_0's l1: 46.4481	valid_0's score: 8.31835
[40]	valid_0's l1: 38.6288	valid_0's score: 8.18287
[50]	valid_0's l1: 39.0674	valid_0's score: 7.99748
[60]	valid_0's l1: 39.6941	valid_0's score: 8.05191
[70]	valid_0's l1: 40.781	valid_0's score: 8.02303
[80]	valid_0's l1: 41.4958	valid_0's score: 8.1199
[90]	valid_0's l1: 42.983	valid_0's score: 8.20606
Early stopping, best iteration is:
[44]	valid_0's l1: 37.7621	valid_0's score: 7.98733


KeyboardInterrupt: 

Best Score: 7.835342227109835  
Best trial: {'seed': 42,
 'boosting': 'gbdt',
 'objective': 'regression',
 'metric': 'mae',
 'num_iterations': 10000,
 'learning_rate': 0.09,
 'max_depth': 16,
 'num_leaves': 32064,
 'min_data_in_leaf': 20,
 'lambda_l1': 8.608420719789926,
 'lambda_l2': 6.392889553947609,
 'bagging_freq': 1,
 'bagging_fraction': 0.6914217180575984,
 'feature_fraction': 0.4633404126275907,
 'scale_pos_weight': 1.0047629559940852,
 'max_bin': 33}

In [None]:
params_floating = params_default.copy()
params_floating.update(study.best_trial.params)
params_floating

In [None]:
# 하이퍼파라미터 최적화 과정을 확인
optuna.visualization.plot_optimization_history(study)

In [None]:
# 하이퍼파라미터별 중요도
optuna.visualization.plot_param_importances(study)

In [None]:
# 파라미터들관의 관계
optuna.visualization.plot_parallel_coordinate(study)

In [None]:

def kfold_lgbm(params, n_split, energy, fcst, target):
    cv_scores = []
    x_df, y_df = train_dataset(energy, fcst, target)

    folds = KFold(n_splits=n_split, shuffle=True, random_state=42)
    splits = folds.split(x_df, y_df)

    for index, (train_idx, valid_idx) in enumerate(splits):
        print(f"============ Fold {index} ============\n")
        train_x, val_x = x_df.iloc[train_idx], x_df.iloc[valid_idx]
        train_y, val_y = y_df.iloc[train_idx], y_df.iloc[valid_idx]

        # 발전량이 0인 데이터를 제외
        train_x = train_x[train_y[target]!=0]
        train_y = train_y[train_y[target]!=0]

        train_dataset = lgb.Dataset(train_x, train_y)
        val_dataset = lgb.Dataset(val_x, val_y)

    capacity = 1000
    model = lgb.train(params, train_dataset, val_dataset, feval=nmae_10, verbose_eval=10, early_stopping_rounds=50)

In [None]:
train_x, train_y, val_x, val_y = train_dataset(energy, dangjin_fcst, target='dangjin_floating')
train_dataset = lgb.Dataset(train_x, train_y)
val_dataset = lgb.Dataset(val_x, val_y)
capacity = 1000
dangjin_floating_model = lgb.train(params_floating, train_dataset, 10000, val_dataset, feval=nmae_10, verbose_eval=500, early_stopping_rounds=100)

In [None]:
pred = dangjin_floating_model.predict(val_x)

plt.figure(figsize=(20,5))
plt.plot(val_y, label='true')
plt.plot(pred, label='pred')
plt.legend()
plt.show()
print('CV Score : ', sola_nmae(val_y, pred))

### 당진자재창고태양광 예측 모델 학습

In [None]:
capacity = 700

sampler = TPESampler()
study = optuna.create_study(
    study_name="lgbm_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
study.optimize(lambda trial: objectiveLGBM(trial, energy, dangjin_fcst, target='dangjin_warehouse'), n_trials=20)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

In [None]:
# 하이퍼파라미터 최적화 과정을 확인
optuna.visualization.plot_optimization_history(study)

In [None]:
# 하이퍼파라미터별 중요도
optuna.visualization.plot_param_importances(study)

In [None]:
# 파라미터들관의 관계
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
params_warehouse = params_default.copy()
params_warehouse.update(study.best_trial.params)

In [None]:
params_warehouse

In [None]:
train_x, train_y, val_x, val_y = train_datast(energy, dangjin_fcst, target='dangjin_warehouse')
train_dataset = lgb.Dataset(train_x, train_y)
val_dataset = lgb.Dataset(val_x, val_y)
capacity = 700
dangjin_warehouse_model = lgb.train(params_warehouse, train_dataset, 10000, val_dataset, feval=nmae_10, verbose_eval=500, early_stopping_rounds=100)

In [None]:
pred = dangjin_warehouse_model.predict(val_x)

plt.figure(figsize=(20,5))
plt.plot(val_y, label='true')
plt.plot(pred, label='pred')
plt.legend()
plt.show()
print('CV Score : ', sola_nmae(val_y, pred))

### 당진태양광 예측 모델 학습

In [None]:
capacity = 1000

sampler = TPESampler()
study = optuna.create_study(
    study_name="lgbm_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
study.optimize(lambda trial: objectiveLGBM(trial, energy, dangjin_fcst, target='dangjin'), n_trials=20)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

In [None]:
# 하이퍼파라미터 최적화 과정을 확인
optuna.visualization.plot_optimization_history(study)

In [None]:
# 하이퍼파라미터별 중요도
optuna.visualization.plot_param_importances(study)

In [None]:
# 파라미터들관의 관계
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
params_dangjin = params_default.copy()
params_dangjin.update(study.best_trial.params)

In [None]:
params_dangjin

In [None]:
train_x, train_y, val_x, val_y = train_datast(energy, dangjin_fcst, target='dangjin')
train_dataset = lgb.Dataset(train_x, train_y)
val_dataset = lgb.Dataset(val_x, val_y)
capacity = 1000
dangjin_model = lgb.train(params_dangjin, train_dataset, 10000, val_dataset, feval=nmae_10, verbose_eval=500, early_stopping_rounds=100)

In [None]:
pred = dangjin_model.predict(val_x)

plt.figure(figsize=(20,5))
plt.plot(val_y, label='true')
plt.plot(pred, label='pred')
plt.legend()
plt.show()
print('CV Score : ', sola_nmae(val_y, pred))

### 울산태양광 예측 모델 학습

In [None]:
capacity = 500

sampler = TPESampler()
study = optuna.create_study(
    study_name="lgbm_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
study.optimize(lambda trial: objectiveLGBM(trial, energy, ulsan_fcst, target='ulsan'), n_trials=20)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

In [None]:
# 하이퍼파라미터 최적화 과정을 확인
optuna.visualization.plot_optimization_history(study)

In [None]:
# 하이퍼파라미터별 중요도
optuna.visualization.plot_param_importances(study)

In [None]:
# 파라미터들관의 관계
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
params_ulsan = params_default.copy()
params_ulsan.update(study.best_trial.params)

In [None]:
params_ulsan

In [None]:
train_x, train_y, val_x, val_y = train_datast(energy, ulsan_fcst, target='ulsan')
train_dataset = lgb.Dataset(train_x, train_y)
val_dataset = lgb.Dataset(val_x, val_y)
capacity = 500
ulsan_model = lgb.train(params_ulsan, train_dataset, 10000, val_dataset, feval=nmae_10, verbose_eval=500, early_stopping_rounds=100)

In [None]:
pred = ulsan_model.predict(val_x)

plt.figure(figsize=(20,5))
plt.plot(val_y, label='true')
plt.plot(pred, label='pred')
plt.legend()
plt.show()
print('CV Score : ', sola_nmae(val_y, pred))

## 테스트 데이터 전처리

In [None]:
def test_datast(fcst_df):
    start = '2021-02-01 00:00:00'
    end = '2021-02-28 23:00:00'
    
    start_idx = fcst_df[fcst_df['Forecast_time']==start].index[0]
    end_idx = fcst_df[fcst_df['Forecast_time']==end].index[0]
    
    test_df = fcst_df.loc[start_idx:end_idx, :].copy()
    
    test_df['date'] = test_df['Forecast_time'].str.split(' ').str[0]
    test_df['hour'] = test_df['Forecast_time'].str.split(' ').str[1].str.split(':').str[0].astype(int)
    
    test_df['year'] = test_df['date'].str.split('-').str[0].astype(int)
    test_df['month'] = test_df['date'].str.split('-').str[1].astype(int)
    test_df['day'] = test_df['date'].str.split('-').str[2].astype(int)
    
    #test_df = test_df[['year', 'month', 'day', 'hour', 'Temperature', 'Humidity', 'WindSpeed', 'WindDirection', 'Cloud']]
    test_df = test_df[[ #'year', 'month', 'day', 'hour', 
                        'Year sin', 'Year cos', 'Day sin', 'Day cos',
                        'Temperature', 'Humidity', 
#                         'WindSpeed', 'WindDirection',
                        'Wind_X', 'Wind_Y',
                        'Insolation',
                        'Cloud']]
    
    test_x = test_df.to_numpy()
    
    return test_x

In [None]:
dangjin_test = test_datast(dangjin_fcst)
ulsan_test = test_datast(ulsan_fcst)

## 각 발전소 발전량 추론

In [None]:
dangjin_floating_pred = dangjin_floating_model.predict(dangjin_test)
dangjin_warehouse_pred = dangjin_warehouse_model.predict(dangjin_test)
dangjin_pred = dangjin_model.predict(dangjin_test)
ulsan_pred = ulsan_model.predict(ulsan_test)

In [None]:
plt.figure(figsize=(20,5))
plt.plot(dangjin_floating_pred, label = 'dangjin_floating_pred')
plt.plot(dangjin_warehouse_pred, label = 'dangjin_warehouse_pred')
plt.plot(dangjin_pred, label = 'dangjin_pred')
plt.plot(ulsan_pred, label = 'ulsan_pred')
plt.legend()
plt.show()

## 제출

In [None]:
submission = pd.read_csv('data/sample_submission.csv')

In [None]:
submission.iloc[:24*28, 1] = dangjin_floating_pred
submission.iloc[:24*28, 2] = dangjin_warehouse_pred
submission.iloc[:24*28, 3] = dangjin_pred
submission.iloc[:24*28, 4] = ulsan_pred

In [None]:
submission

In [None]:
submission.to_csv('submission/wind_time_insolation_optuna_ensemble.csv', index=False)
# submission.to_csv('dacon_baseline_time_insolation.csv', index=False)

In [None]:
plt.figure(figsize=(20,5))
plt.plot(dangjin_fcst['Insolation'] * 1000, label='Insolation')
plt.plot(energy['dangjin_floating'], label='dangjin_floating')
#plt.plot([declination(i/10) for i in range(3650)], label='true')
plt.legend()
plt.show()

In [None]:
# 일기 예보 있는 날짜만 선택
energy_ = energy.loc[24:]
energy_.index = range(energy_.shape[0])

# 발전량 데이터가 있는 날짜만 선택
dangjin_fcst_ = dangjin_fcst.loc[:25608-1]
dangjin_fcst_.index = range(dangjin_fcst_.shape[0])

# 예보 시간 및 날짜 정보 feature로 추가
dangjin_fcst_['date'] = dangjin_fcst_['Forecast_time'].str.split(' ').str[0]
dangjin_fcst_['hour'] = dangjin_fcst_['Forecast_time'].str.split(' ').str[1].str.split(':').str[0].astype(int)

dangjin_fcst_['year'] = dangjin_fcst_['date'].str.split('-').str[0].astype(int)
dangjin_fcst_['month'] = dangjin_fcst_['date'].str.split('-').str[1].astype(int)
dangjin_fcst_['day'] = dangjin_fcst_['date'].str.split('-').str[2].astype(int)

# 발전량과 일기예보 연결
heat_df = pd.concat([energy_, dangjin_fcst_], axis=1)
heat_df = heat_df[[ #'month', 'hour', 'day', 'hour',
                    #'Year sin', 'Year cos', 'Day sin', 'Day cos',
                    'Temperature', 'Humidity', 
#                     'WindSpeed', 'WindDirection',
                    'Wind_X', 'Wind_Y',
                    'Insolation',
                    'Cloud',
                    'dangjin']]

In [None]:
heat_corr = heat_df.corr()

In [None]:
plt.figure(figsize=(16,10))
sns.heatmap(data = heat_corr, annot=True, fmt = '.2f')