#Import

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sys
!pip install -q sktime
!pip install optuna
from tqdm import tqdm
import xgboost as xgb
import pandas as pd
import numpy as np
from datetime import datetime
import math
import os
from xgboost import XGBRegressor
import platform
import optuna
from sktime.forecasting.model_selection import temporal_train_test_split

if platform.system() == "Linux":
    linux_version = platform.uname().release
    print("Linux version:", linux_version)

np.random.seed(42)

print("Python version:", sys.version)
print("xgboost version:", xgb.__version__)
print("pandas version:", pd.__version__)
print("numpy version:", np.__version__)
print("optuna version:", optuna.__version__)

base_path = '/content/drive/MyDrive/4-1/DScover7기/가이드프로젝트/dataset/' # 본인의 datapath를 입력

def SMAPE(true, pred): # Symmetric Mean Absolute Percentage Error(SMAPE)를 계산하는 함수

   # SMAPE는 실제 값과 예측 값 사이의 차이를 백분율로 나타내는 정확도의 지표
   # 이 메트릭은 0과 200 사이의 값을 가지며, 값이 낮을수록 예측의 정확도가 높다는 것을 의미한다.

   # true: 실제 값이 담긴 numpy 배열
   # pred: 예측 값이 담긴 numpy 배열

    return np.mean((np.abs(true - pred)) / (np.abs(true) + np.abs(pred))) * 200

def weighted_mse(alpha=1): # MSE 계산 함수

   # 이 함수는 예측 오차에 가중치를 적용하여 더 큰 오차에 대해 더 큰 패널티를 부과하거나, 반대로 더 작은 오차에 대해 더 작은 패널티를 부과할 수 있다.
   # alpha 매개변수를 통해 오차에 대한 가중치를 조정할 수 있습니다.

   # alpha: 실제 값과 예측 값 사이의 오차가 양수일 때 적용되는 가중치. 기본값은 1

   # 반환 값:
   # weighted_mse_fixed 함수: 이 함수는 실제 레이블과 예측된 값을 매개변수로 받아 Gradient와 2차 미분 값(hess)을 반환한다.

    def weighted_mse_fixed(label, pred):
        residual = (label - pred).astype("float")
        # 오차가 양수인 경우와 그렇지 않은 경우에 대해 다른 Gradient를 계산
        grad = np.where(residual > 0, -2 * alpha * residual, -2 * residual)
        # 오차가 양수인 경우와 그렇지 않은 경우에 대해 다른 hess을 계산
        hess = np.where(residual > 0, 2 * alpha, 2.0)
        return grad, hess

    return weighted_mse_fixed

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.9/21.9 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.8/128.8 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Installing collected p

#함수 선언

In [3]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
import math

def add_data(df):# 데이터프레임에 대해 각 관측치에 대해 무작위의 변화를 주어 데이터를 증식
    for i in range(2): # 아래의 과정을 2번 반복
        np.random.seed(i)
        num_rows = len(df) # 주어진 데이터프레임(df)의 각 행에 대해
        random_factors = ['temp', 'prec', 'wind', 'hum'] # 주어진 환경 요인('temp', 'prec', 'wind', 'hum')에 대해

        random_data = {
            factor: np.round(df[factor] * np.random.uniform(0.9, 1.1, num_rows), 1) # 0.9에서 1.1 사이의 난수를 곱하여 새로운 데이터를 생성
            for factor in random_factors
        }

        new_df = df.copy()
        new_df.update(pd.DataFrame(random_data))
        df = pd.concat([df, new_df], ignore_index=True) # 이를 원본 데이터프레임에 추가

    df = df.sort_values(by=['building', 'date_time']).reset_index(drop=True) # 최종적으로 'building'과 'date_time'을 기준으로 정렬된 데이터프레임을 반환
    return df

In [None]:
def weather(train): # 'prec'(강수량) 특성을 기반으로 'weather' 특성을 생성
    condition = train['prec'] > 0
    filtered_df = train[condition].index.tolist()
    train['weather'] = 0

    for idx in filtered_df:
        for offset in range(-3, 4): # 강수량이 0보다 큰 관측치 주변 (+/- 3시간)
            new_idx = idx + offset
            if 0 <= new_idx < len(train):
                train.loc[new_idx, 'weather'] = 1 # 'weather' 특성을 1로 설정하여 비가 온 것으로 표시

    return train

In [None]:
def time_features(data, mode): # 날짜와 시간에서 파생된 여러 시간 관련 특성들을 생성
    date = pd.to_datetime(data['date_time'])
    # 시간, 요일, 월, 주, 일
    data['hour'] = date.dt.hour
    data['dow'] = date.dt.weekday
    data['month'] = date.dt.month
    data['week'] = date.dt.isocalendar().week.astype(np.int32)
    data['day'] = date.dt.day

    # 시간의 사인 및 코사인 변환(시간의 주기성을 반영하기 위한 작업임)
    data['sin_time'] = np.sin(2 * np.pi * data['hour'] / 24)
    data['cos_time'] = np.cos(2 * np.pi * data['hour'] / 24)

    # 특정 건물과 날짜에 대한 공휴일 특성을 설정
    data['holiday'] = data['dow'].apply(lambda x: 0 if x < 5 else 1)
    data['date'] = date.dt.date

    building_dates = [['2022-06-07', '2022-06-17'], ['2022-07-31', '2022-07-23', '2022-07-20'], ['2022-08-16', '2022-08-17']]

    for index, b in enumerate([2, 3 ,54]):
        data.loc[data['building'] == b, 'holiday'] = 0
        data.loc[(data['building'] == b) & (data['dow'] == 0) , 'holiday'] = 1
        data.loc[(data['building'] == b) & (data['date'].isin([pd.to_datetime(i).date() for i in building_dates[index]])), 'holiday'] = 1

    data.loc[(data['building'] != 14) & (data['date'].isin([pd.to_datetime(i).date() for i in ['2022-06-01', '2022-06-06', '2022-08-15']])), 'holiday'] = 1
    data.loc[(data['building'] == 14) & (data['date'].isin([pd.to_datetime(i).date() for i in ['2022-06-14']])) , 'holiday'] = 1
    data.loc[data['building'] == 85, 'holiday'] = 0

    def week_of_month(date): # 입력날짜가 해당월의 짝수 주의 일요일인지를 판별(마트 휴일)
        first_day = date.replace(day=1)
        if (date.isocalendar().week - first_day.isocalendar().week + 1) % 2 == 0:
            if date.weekday() == 6:
                return 1
        return 0

    data['week_of_month'] = data['date'].apply(week_of_month)

    target_buildings = [87,88,89,90,91,92] # 마트 건물 번호
    data.loc[(data['building'].isin(target_buildings)) , 'holiday'] = 0
    data.loc[(data['building'].isin(target_buildings)) & (data['week_of_month'] == 1), 'holiday'] = 1

    building_dates = [['2022-06-20', '2022-07-11', '2022-08-08', '2022-06-17'], ['2022-06-13', '2022-07-25', '2022-08-01'],
                     ['2022-07-18', '2022-08-08'], ['2022-06-20', '2022-07-18', '2022-06-17', '2022-08-08'],
                     ['2022-06-27', '2022-07-25', '2022-08-08'], ['2022-06-13', '2022-07-11', '2022-08-22'],
                     ['2022-06-10', '2022-08-10', '2022-07-10', '2022-07-24', '2022-06-26', '2022-08-28']]

    # 'mode' 매개변수에 따라 특정 처리가 다르게 적용
    if mode == 'byb' or mode == 'gu_byb':
        for index, b in enumerate([37,38,39,40,41,42,86]):
            data.loc[data['building'] == b, 'holiday'] = 0
            data.loc[(data['building'] == b) & (data['date'].isin([pd.to_datetime(i).date() for i in building_dates[index]])), 'holiday'] = 1

    if mode == 'all' or mode == 'gu_all':
        data.loc[data['building'] == 86, 'holiday'] = 0
        data.loc[(data['building'] == 86) & (data['date'].isin([pd.to_datetime(i).date() for i in building_dates[-1]+['2022-07-30']])), 'holiday'] = 1

    data['date'] = pd.to_datetime(data['date_time'], format='%Y-%m-%d')
    return data

In [None]:
def side_indicator(data):
    data['THI'] = 9/5 * data['temp'] - 0.55 * (1 - data['hum']/100) * (9/5 * data['temp'] - 26) + 32 # 볼쾌지수(THI) 계산식
    data['WC'] = 13.12 + 0.6215 * data['temp'] - 13.947 * np.power(data['wind'], 0.16) + 0.486 * data['temp'] * np.power(data['wind'], 0.16) # 체감온도(WC) 계산식

    def calculate_cdh(xs): # 누적냉방도수(CDH) 계산식
        ys = []
        for i in range(len(xs)):
            if i < 11:
                ys.append(np.sum(xs[:(i+1)] - 26))
            else:
                ys.append(np.sum(xs[(i-11):(i+1)] - 26))
        return np.array(ys)

    cdhs = []
    for num in range(1, 101):
        temp = data[data['building'] == num]
        cdh = calculate_cdh(temp['temp'].values)
        cdhs.extend(cdh)
    data['CDH'] = cdhs

    return data

def temp_features(data): # 평균, 최대, 최소 온도 및 온도 차이를 기반으로 여러 온도 관련 특성을 계산
    # 평균, 최대, 최소 온도 계산 및 데이터프레임에 병합

    # 평균
    avg_temp = pd.pivot_table(data[data['hour']%3 == 0], values = 'temp', index = ['building', 'day', 'month'], aggfunc = np.mean).reset_index()
    avg_temp.rename(columns={'temp': 'avg_temp'}, inplace=True)
    data = pd.merge(data, avg_temp, on=['building', 'day', 'month'], how='left')

    # 최대
    max_temp = pd.pivot_table(data, values = 'temp', index = ['building', 'day', 'month'], aggfunc = np.max).reset_index()
    max_temp.rename(columns={'temp': 'max_temp'}, inplace=True)
    data = pd.merge(data, max_temp, on=['building', 'day', 'month'], how='left')

    # 최소
    min_temp = pd.pivot_table(data, values = 'temp', index = ['building', 'day', 'month'], aggfunc = np.min).reset_index()
    min_temp.rename(columns={'temp': 'min_temp'}, inplace=True)
    data = pd.merge(data, min_temp, on=['building', 'day', 'month'], how='left')

    data['temp_diff'] = data['max_temp'] - data['min_temp']

    return data

def process_data(data, mode): # 데이터 전처리의 주 함수로, 다른 함수들을 호출하여 데이터에 여러 특성을 추가하고 누락된 값들을 처리
    # 데이터 전처리 단계 호출
    data['wind'] = data['wind'].fillna(method='ffill')
    data['hum'] = data['hum'].fillna(method='ffill')
    data = data.fillna(0)

    data = time_features(data, mode)
    data = side_indicator(data)
    data = temp_features(data)

    data['summer_cos'] = data['date'].apply(summer_cos)
    data['summer_sin'] = data['date'].apply(summer_sin)

    return data



In [None]:
def mean_std(train, test, mode):
    ratio = np.array([0.985]+[0.98]*2+[0.995]*2+[0.99]*2)
    if mode == 'byb':
        train['target'] = train.apply(lambda row: row['target'] * ratio[row['dow']], axis=1)
    elif mode == 'all':
        ratio -= 0.005
        train['target'] = train.apply(lambda row: row['target'] * ratio[row['dow']], axis=1)

    # 평균 및 표준편차 계산 및 데이터프레임에 병합
    power_mean = pd.pivot_table(train, values = 'target', index = ['building', 'hour', 'dow'], aggfunc = np.mean).reset_index()
    power_mean.rename(columns={'target': 'dow_hour_mean'}, inplace=True)
    train = pd.merge(train, power_mean, on=['building', 'hour', 'dow'], how='left')
    test = pd.merge(test, power_mean, on=['building', 'hour', 'dow'], how='left')

    power_holiday_mean = pd.pivot_table(train, values = 'target', index = ['building', 'hour', 'holiday'], aggfunc = np.mean).reset_index()
    power_holiday_mean.rename(columns={'target': 'holiday_mean'}, inplace=True)
    train = pd.merge(train, power_holiday_mean, on=['building', 'hour', 'holiday'], how='left')
    test = pd.merge(test, power_holiday_mean, on=['building', 'hour', 'holiday'], how='left')

    power_holiday_std = pd.pivot_table(train, values = 'target', index = ['building', 'hour', 'holiday'], aggfunc = np.std).reset_index()
    power_holiday_std.rename(columns={'target': 'holiday_std'}, inplace=True)
    train = pd.merge(train, power_holiday_std, on=['building', 'hour', 'holiday'], how='left')
    test = pd.merge(test, power_holiday_std, on=['building', 'hour', 'holiday'], how='left')

    power_hour_mean = pd.pivot_table(train, values = 'target', index = ['building', 'hour',], aggfunc = np.mean).reset_index()
    power_hour_mean.rename(columns={'target': 'hour_mean'}, inplace=True)
    train = pd.merge(train, power_hour_mean, on=['building', 'hour', ], how='left')
    test = pd.merge(test, power_hour_mean, on=['building', 'hour', ], how='left')

    power_hour_std = pd.pivot_table(train, values = 'target', index = ['building', 'hour',], aggfunc = np.std).reset_index()
    power_hour_std.rename(columns={'target': 'hour_std'}, inplace=True)
    train = pd.merge(train, power_hour_std, on=['building', 'hour', ], how='left')
    test = pd.merge(test, power_hour_std, on=['building', 'hour', ], how='left')

    if mode == 'all' or mode == 'gu_all':
        train['date'] = pd.to_datetime(train['date_time'], format='%Y-%m-%d').dt.date
        test['date'] = pd.to_datetime(test['date_time'], format='%Y-%m-%d').dt.date
        building_dates = [['2022-06-20', '2022-07-11', '2022-08-08', '2022-06-17'], ['2022-06-13', '2022-07-25', '2022-08-01'],
                         ['2022-07-18', '2022-08-08'], ['2022-06-20', '2022-07-18', '2022-06-17', '2022-08-08'],
                         ['2022-06-27', '2022-07-25', '2022-08-08'], ['2022-06-13', '2022-07-11', '2022-08-22']]

        for index, b in enumerate([37,38,39,40,41,42]):
            train.loc[train['building'] == b, 'holiday'] = 0
            train.loc[(train['building'] == b) & (train['date'].isin([pd.to_datetime(i).date() for i in building_dates[index]])), 'holiday'] = 1
            test.loc[test['building'] == b, 'holiday'] = 0
            test.loc[(test['building'] == b) & (test['date'].isin([pd.to_datetime(i).date() for i in building_dates[index]])), 'holiday'] = 1

    return train, test

In [None]:
def process_info(data, is_train=True): # 건물 정보 데이터를 전처리하는 함수
    # 건물 정보 데이터 전처리
    data.columns = ['building', 'type', 'all_area', 'cool_area', 'sun']
    data['sun'] = data['sun'].replace('-', 0).astype('float')

    value_dict = {value: index for index, value in enumerate(data['type'].unique())}
    data['type'] = data['type'].map(value_dict)

    # 조건에 따라 'cool_area' 값을 조정
    filtered_data = data[(data['type'] == 7) & (data['cool_area'] != 0)]
    result = (filtered_data['all_area'].iloc[1:].sum() / filtered_data['cool_area'].iloc[1:].sum())
    condition = (data['type'] == 7) & (data['cool_area'] == 0)
    data.loc[condition, 'cool_area'] = (data.loc[condition, 'all_area'] / result).astype('int')

    filtered_data = data[(data['type'] == 9) & (data['cool_area'] > 500)]
    result = (filtered_data['all_area'].sum() / filtered_data['cool_area'].sum())
    condition = (data['type'] == 9) & (data['cool_area'] < 500)
    data.loc[condition, 'cool_area'] = round(data.loc[condition, 'all_area'] / result, 1)

    return data

In [None]:
def get_train_and_test_data(mode): # 훈련 데이터와 테스트 데이터를 불러오고, 전처리 과정을 수행한 뒤 반환하는 함수
    # 데이터 로딩 및 전처리 과정 수행
    building_info = pd.read_csv(os.path.join(base_path,'building_info.csv')).drop(['ESS저장용량(kWh)', 'PCS용량(kW)'], axis=1)
    building_info = process_info(building_info)

    train_data = pd.read_csv(os.path.join(base_path,'train.csv')).drop(['일조(hr)', '일사(MJ/m2)'], axis=1)
    train_data.columns = ['num_date_time', 'building', 'date_time', 'temp', 'prec', 'wind', 'hum', 'target']
    train_data = process_data(train_data, mode)

    test_data = pd.read_csv(os.path.join(base_path,'test.csv'))
    test_data.columns = ['num_date_time', 'building', 'date_time', 'temp', 'prec', 'wind', 'hum']
    test_data = process_data(test_data, mode)

    train_data, test_data = mean_std(train_data, test_data, mode)

    if mode == 'all' or mode == 'gu_all':
        train_data = train_data.merge(building_info, on='building', how='left')
        test_data = test_data.merge(building_info, on='building', how='left')

    return train_data, test_data

In [None]:
def summer_cos(date): # 주어진 날짜의 코사인 값을 계산하여 여름철의 주기성을 반영
    start_date = datetime.strptime("2022-06-01", "%Y-%m-%d")
    end_date = datetime.strptime("2022-09-14", "%Y-%m-%d")

    period = (end_date - start_date).total_seconds()

    return math.cos(2 * math.pi * (date - start_date).total_seconds() / period)

def summer_sin(date): # 주어진 날짜의 사인 값을 계산하여 여름철의 주기성을 반영
    start_date = datetime.strptime("2022-06-01", "%Y-%m-%d")
    end_date = datetime.strptime("2022-09-14", "%Y-%m-%d")

    period = (end_date - start_date).total_seconds()

    return math.sin(2 * math.pi * (date - start_date).total_seconds() / period)

In [None]:
gu_byb = ['num_date_time', 'building', 'date_time', 'temp', 'wind', 'hum',
       'dow', 'month', 'week', 'dow_hour_mean', 'holiday',
       'holiday_mean', 'holiday_std', 'hour_mean', 'hour_std', 'sin_time',
       'cos_time', 'THI', 'WC', 'CDH', 'target']

train, test = get_train_and_test_data('gu_byb')

train = train[gu_byb]
test = test[gu_byb[:-1]]

scores = []
best_it = []

score = pd.DataFrame({'building':range(1,101)})
for i in tqdm(range(100)):
    y = train.loc[train.building == i+1, 'target']
    x = train.loc[train.building == i+1, ].iloc[:, 3:].drop(['target'], axis=1)
    y_train, y_valid, x_train, x_valid = temporal_train_test_split(y = y, X = x, test_size = 168)

    xgb = XGBRegressor(colsample_bytree=0.8, eta=0.01, max_depth=5,
             min_child_weight=6,n_estimators=2000, subsample=0.9, early_stopping_rounds=50, eval_metric=SMAPE)

    xgb.set_params(**{'objective':weighted_mse(100)})

    xgb.fit(x_train, y_train, eval_set=[(x_train, y_train),
                                            (x_valid, y_valid)], verbose=False)

    y_pred = xgb.predict(x_valid)

    sm = SMAPE(y_valid, y_pred)
    scores.append(sm)
    best_it.append(xgb.best_iteration+1)

score['score'] = scores
print(sum(scores)/len(scores))
print(sum(best_it)/len(best_it))
# 4.404954637397799
# 451.67

preds = np.array([])

for i in tqdm(range(100)):
    pred_df = pd.DataFrame()

    for seed in [0,1,2,3,4]:
        y_train = train.loc[train.building == i+1, 'target']
        x_train = train.loc[train.building == i+1, ].iloc[:, 3:].drop(['target'], axis=1)
        x_test = test.loc[test.building == i+1, ].iloc[:,3:]

        xgb = XGBRegressor(colsample_bytree=0.8, eta=0.01, max_depth=5, seed=seed,
                 min_child_weight=6,n_estimators=best_it[i], subsample=0.9)

        xgb.fit(x_train, y_train)
        y_pred = xgb.predict(x_test)
        pred_df.loc[:,seed] = y_pred

    pred = pred_df.mean(axis=1)
    preds = np.append(preds, pred)

submission = pd.read_csv(os.path.join(base_path,'sample_submission.csv'))
submission['answer'] = preds
submission.to_csv('./gu_byb.csv', index = False)

del train, test, scores, best_it, submission

100%|██████████| 100/100 [03:01<00:00,  1.81s/it]


4.404954637397799
451.67


100%|██████████| 100/100 [05:16<00:00,  3.17s/it]


In [None]:
train, test = get_train_and_test_data('gu_all')

In [None]:
# 10시간 정도 소요됩니다.

# import optuna
# import optuna.logging
# from tqdm import tqdm

# optuna.logging.set_verbosity(optuna.logging.WARNING)
# train['date'] = pd.to_datetime(train['date'])
# train['building'] = train['building'].astype('category')
# train['type'] = train['type'].astype('category')

# x_train = train[train['date'] < '2022-08-18'].drop(['num_date_time', 'date_time', 'target', 'date'], axis=1)
# x_valid = train[train['date'] >= '2022-08-18'].drop(['num_date_time', 'date_time', 'target', 'date'], axis=1)
# y_train = train[train['date'] < '2022-08-18']['target']
# y_valid = train[train['date'] >= '2022-08-18']['target']

# dtrain = xgb.DMatrix(data=x_train, label=y_train, enable_categorical=True)
# dvalid = xgb.DMatrix(data=x_valid, label=y_valid, enable_categorical=True)

# def objective(trial):
#     param = {
#         'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),
#         'gamma': trial.suggest_float('gamma', 1e-3, 10),
#         'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0),
#         'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
#         'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 1.0]),
#         'max_depth': trial.suggest_categorical('max_depth', [3, 4, 5, 6]),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
#         'eta' : 0.1
#     }

#     model = xgb.train(params=param, dtrain=dtrain, num_boost_round=1000,
#                       evals=[(dvalid, 'valid')], early_stopping_rounds=50, verbose_eval=False)

#     preds = model.predict(dvalid)
#     smape = SMAPE(y_valid, preds)

#     return smape

# study = optuna.create_study(direction='minimize', study_name=None)
# with tqdm(total=500) as pbar:
#     def callback(study, trial):
#         pbar.update(1)

#     study.optimize(objective, n_trials=500, callbacks=[callback])

# df = study.trials_dataframe().sort_values(by=['value'], ascending=[True]).reset_index(drop=True)
# df.to_csv('parameters.csv', index=False)
# df.head(5)

In [None]:
colsample_bytree = [0.9,0.9,0.8,0.9,0.9]
gamma = [8.161415, 8.915918, 8.249356, 6.575258, 8.487247]
max_depth = [6,5,6,6,6]
min_child_weight = [47,8,30,54,77]
reg_alpha = [6.721675, 6.736361, 7.109872, 7.016596, 6.186162]
reg_lambda = [6.064121, 5.957454, 5.927528, 5.413840, 4.798086]
subsample = [1.0,1.0,1.0,1.0,1.0]
value = [5.010795, 5.016419, 5.017388, 5.023210, 5.028135]

df = pd.DataFrame({'params_colsample_bytree':colsample_bytree, 'params_gamma':gamma, 'params_max_depth':max_depth, 'params_min_child_weight':min_child_weight,
             'params_reg_alpha':reg_alpha, 'params_reg_lambda':reg_lambda, 'params_subsample':subsample, 'value':value})

In [None]:
import xgboost as xgb

gu_all = ['num_date_time', 'building', 'date_time', 'temp', 'wind', 'hum',
       'type', 'all_area', 'cool_area', 'dow', 'month', 'week',
       'dow_hour_mean', 'date', 'holiday', 'holiday_mean', 'holiday_std',
       'hour_mean', 'hour_std', 'sin_time', 'cos_time', 'THI', 'WC', 'CDH', 'target']

train, test = train[gu_all], test[gu_all[:-1]]

train['date'] = pd.to_datetime(train['date'])
train['building'] = train['building'].astype('category')
train['type'] = train['type'].astype('category')

x_train = train[train['date'] < f'2022-08-18'].drop(['num_date_time', 'date_time', 'target', 'date'], axis=1).reset_index(drop=True)
x_valid = train[train['date'] >= f'2022-08-18'].drop(['num_date_time', 'date_time', 'target', 'date'], axis=1).reset_index(drop=True)
y_train = train[train['date'] < f'2022-08-18']['target'].reset_index(drop=True)
y_valid = train[train['date'] >= f'2022-08-18']['target'].reset_index(drop=True)

dtrain = xgb.DMatrix(data=x_train, label=y_train, enable_categorical=True)
dvalid = xgb.DMatrix(data=x_valid, label=y_valid, enable_categorical=True)


param = {
    'reg_lambda': df['params_reg_lambda'][0] ,
    'gamma': df['params_gamma'][0],
    'reg_alpha': df['params_reg_alpha'][0] ,
    'colsample_bytree': df['params_colsample_bytree'][0] ,
    'subsample': df['params_subsample'][0] ,
    'max_depth': df['params_max_depth'][0],
    'min_child_weight': df['params_min_child_weight'][0],
}

model = xgb.train(params=param, dtrain=dtrain, num_boost_round=1000,
                  evals=[(dvalid, 'valid')], early_stopping_rounds=100, verbose_eval=False)

preds = model.predict(dvalid)
smape = SMAPE(y_valid, preds)

best_it = model.best_iteration+1
building_score = []
for i in range(100):
    building_score.append(SMAPE(y_valid[i*168:(i+1)*168], preds[i*168:(i+1)*168]))

score['score_all'] = building_score
score.to_csv('./score.csv', index=False)
print(smape, best_it)

5.010795060857856 251


In [None]:
preds = np.array([])

test = test.copy()
test['date'] = pd.to_datetime(test['date'])
test['building'] = test['building'].astype('category')
test['type'] = test['type'].astype('category')

x_train = train.drop(['num_date_time', 'date_time', 'target', 'date'], axis=1)
x_test = test.drop(['num_date_time', 'date_time', 'date'], axis=1)
y_train = train['target']

dtrain = xgb.DMatrix(data=x_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(data=x_test, enable_categorical=True)

pred_df = pd.DataFrame()

for seed in tqdm(range(5)):
    param = {
        'reg_lambda': df['params_reg_lambda'][0] ,
        'gamma': df['params_gamma'][0],
        'reg_alpha': df['params_reg_alpha'][0] ,
        'colsample_bytree': df['params_colsample_bytree'][0] ,
        'subsample': df['params_subsample'][0] ,
        'max_depth': df['params_max_depth'][0],
        'min_child_weight': df['params_min_child_weight'][0],
        'seed':seed,
    }

    model = xgb.train(params=param, dtrain=dtrain, num_boost_round=best_it)

    y_pred = model.predict(dtest)
    pred_df.loc[:,seed] = y_pred

pred = pred_df.mean(axis=1)
preds = np.append(preds, pred)

submission = pd.read_csv(os.path.join(base_path,'sample_submission.csv'))
submission['answer'] = preds
submission.to_csv('./gu_all.csv', index = False)

del train, test, df, submission

100%|██████████| 5/5 [02:05<00:00, 25.19s/it]


In [None]:
byb = ['num_date_time', 'building', 'date_time', 'temp', 'wind', 'hum',
       'dow', 'month', 'week', 'dow_hour_mean', 'holiday',
       'holiday_mean', 'holiday_std', 'hour_mean', 'hour_std', 'sin_time',
       'cos_time', 'THI', 'WC', 'CDH', 'summer_cos', 'summer_sin', 'target']

train, test = get_train_and_test_data('byb')
train, test = train[byb], test[byb[:-1]]

scores = []
best_it = []

for b in tqdm(range(100)):
    y = train.loc[train.building == b+1, 'target']
    x = train.loc[train.building == b+1, ].iloc[:, 3:].drop(['target'], axis=1)
    y_train, y_valid, x_train, x_valid = temporal_train_test_split(y = y, X = x, test_size = 168)

    xgb = XGBRegressor(colsample_bytree=0.8, eta=0.1, max_depth=5,
         min_child_weight=6,n_estimators=1000, subsample=0.9, early_stopping_rounds=50)

    xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)

    y_pred = xgb.predict(x_valid)


    sm = SMAPE(y_valid, y_pred)
    scores.append(sm)
    best_it.append(xgb.best_iteration+1)

print(sum(scores)/len(scores))
print(sum(best_it)/len(best_it))
# 5.17304185151095
# 107.93

100%|██████████| 100/100 [00:26<00:00,  3.71it/s]

5.17304185151095
107.93





In [None]:
preds = np.array([])

for i in tqdm(range(100)):
    pred_df = pd.DataFrame()

    for seed in [0,1,2,3,4,5,6,7,8,9,10]:
        y_train = train.loc[train.building == i+1, 'target']
        x_train = train.loc[train.building == i+1, ].iloc[:, 3:].drop(['target'], axis=1)
        x_test = test.loc[test.building == i+1, ].iloc[:,3:]

        xgb = XGBRegressor(colsample_bytree=0.8, eta=0.1, max_depth=5, seed=seed,
             min_child_weight=6,n_estimators=best_it[i], subsample=0.9)

        xgb.fit(x_train, y_train)
        y_pred = xgb.predict(x_test)
        pred_df.loc[:,seed] = y_pred

    pred = pred_df.mean(axis=1)
    preds = np.append(preds, pred)

submission = pd.read_csv(os.path.join(base_path,'sample_submission.csv'))
submission['answer'] = preds
submission.to_csv('./byb.csv', index = False)
submission

del train, test, scores, best_it, submission

100%|██████████| 100/100 [03:10<00:00,  1.90s/it]


In [None]:
# 10시간 정도 소요됩니다.

# import optuna
# import optuna.logging

# optuna.logging.set_verbosity(optuna.logging.WARNING)
# train['date'] = pd.to_datetime(train['date'])
# train['building'] = train['building'].astype('category')
# train['type'] = train['type'].astype('category')

# x_train = train[train['date'] < '2022-08-18'].drop(['num_date_time', 'date_time', 'target', 'date'], axis=1)
# x_valid = train[train['date'] >= '2022-08-18'].drop(['num_date_time', 'date_time', 'target', 'date'], axis=1)
# y_train = train[train['date'] < '2022-08-18']['target']
# y_valid = train[train['date'] >= '2022-08-18']['target']

# dtrain = xgb.DMatrix(data=x_train, label=y_train, enable_categorical=True)
# dvalid = xgb.DMatrix(data=x_valid, label=y_valid, enable_categorical=True)

# def objective(trial):
#     param = {
#         'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),
#         'gamma': trial.suggest_float('gamma', 1e-3, 10),
#         'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0),
#         'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
#         'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 1.0]),
#         'max_depth': trial.suggest_categorical('max_depth', [3, 4, 5, 6]),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
#         'eta' : 0.1
#     }

#     model = xgb.train(params=param, dtrain=dtrain, num_boost_round=1000,
#                       evals=[(dvalid, 'valid')], early_stopping_rounds=50, verbose_eval=False)

#     preds = model.predict(dvalid)
#     smape = SMAPE(y_valid, preds)

#     return smape

# study = optuna.create_study(direction='minimize', study_name=None)
# with tqdm(total=500) as pbar:
#     def callback(study, trial):
#         pbar.update(1)

#     study.optimize(objective, n_trials=500, callbacks=[callback])

# df = study.trials_dataframe().sort_values(by=['value'], ascending=[True]).reset_index(drop=True)
# df.to_csv('parameters2.csv', index=False)
# df.head(5)

In [None]:
data = [
    [61, 61, 4.946742, '2023-08-18 11:59:30.066695', '2023-08-18 12:01:19.655572', '0 days 00:01:49.588877', 0.7, 0.695975, 6, 43, 6.357773, 2.568356, 1.0, 'COMPLETE'],
    [23, 23, 4.950789, '2023-08-18 11:24:36.282096', '2023-08-18 11:25:56.779154', '0 days 00:01:20.497058', 0.7, 0.023800, 6, 30, 6.278440, 1.460707, 1.0, 'COMPLETE'],
    [14, 14, 4.975757, '2023-08-18 11:16:59.783126', '2023-08-18 11:18:02.962945', '0 days 00:01:03.179819', 0.7, 0.966742, 6, 36, 3.762862, 0.362957, 1.0, 'COMPLETE'],
    [51, 51, 4.986851, '2023-08-18 11:51:40.355474', '2023-08-18 11:52:48.657127', '0 days 00:01:08.301653', 0.7, 1.034617, 6, 49, 5.606792, 1.690612, 1.0, 'COMPLETE'],
    [31, 31, 4.994619, '2023-08-18 11:32:34.984062', '2023-08-18 11:33:29.901314', '0 days 00:00:54.917252', 0.7, 0.094354, 6, 36, 5.006265, 1.527805, 1.0, 'COMPLETE'],
    [71, 71, 5.450572, '2023-08-14 13:17:40.337706', '2023-08-14 13:20:33.336589', '0 days 00:02:52.998883', 1.0, 4.632731, 6, 18, 2.979220, 7.641126, 0.7, 'COMPLETE'],
    [11, 11, 5.451309, '2023-08-14 11:00:11.878935', '2023-08-14 11:02:55.559494', '0 days 00:02:43.680559', 0.9, 8.375491, 6, 1, 4.829945, 7.857255, 0.7, 'COMPLETE'],
    [32, 32, 5.451501, '2023-08-14 11:48:49.899305', '2023-08-14 11:51:31.199312', '0 days 00:02:41.300007', 0.9, 4.436363, 6, 1, 3.906413, 6.982815, 0.7, 'COMPLETE'],
    [21, 21, 5.451785, '2023-08-14 11:22:30.440656', '2023-08-14 11:25:13.331292', '0 days 00:02:42.890636', 0.9, 3.917213, 6, 3, 4.037673, 8.432115, 0.7, 'COMPLETE'],
    [89, 89, 5.451788, '2023-08-14 14:07:31.956578', '2023-08-14 14:10:27.640167', '0 days 00:02:55.683589', 1.0, 4.768440, 6, 9, 3.470846, 8.564962, 0.7, 'COMPLETE']
]

columns = ['number', 'number', 'value', 'datetime_start', 'datetime_complete', 'duration', 'params_colsample_bytree', 'params_gamma', 'params_max_depth', 'params_min_child_weight', 'params_reg_alpha', 'params_reg_lambda', 'params_subsample', 'state']

df = pd.DataFrame(data, columns=columns)

df['datetime_start'] = pd.to_datetime(df['datetime_start'])
df['datetime_complete'] = pd.to_datetime(df['datetime_complete'])

In [None]:
def ratio_2(ratio):
    score = pd.read_csv('./score.csv')
    gu_byb = pd.read_csv('./gu_byb.csv')

    ratio = np.array(ratio)

    category_mappings = {
        'one': ([32, 33, 34, 35, 36], [0.997, 0.999, 0.996, 0.997, 0.995, 0.994, 0.996], [], True),
        'one_': ([56, 58], [0.997, 0.999, 0.998, 0.999, 0.995, 0.994, 0.996], [], True),
        'two': ([24, 25, 26, 27, 48, 49, 50], [0.987, 0.987, 0.985, 0.987, 0.984, 0.982, 0.985], [], True),
        'two_': ([23, 55], [0.987, 0.987, 0.995, 0.998, 0.984, 0.982, 0.985], [], True),
        'depart': ([37, 38, 39, 40, 41, 42, 43, 44, 85], [0.998]*7, [], False),
        'mart': ([86, 87, 88, 89, 90, 91, 92], [0.998]*7, [4], False),
        'aprt': ([64, 65, 66, 67, 68, 61, 62, 63], [0.985, 0.985, 0.985, 0.987, 0.98, 0.98, 0.987], [], True),
        'mon': ([2, 3, 54], [0.998]*7, [4], False),
        '5': ([5], [0.998]*7, [0,4,5,6], False),
        '8': ([8], [0.998]*7, [3], False)
    }

    for i in range(100):
        rest =  True
        for category, (ids, ratios, d, a_a) in category_mappings.items():
            if i + 1 in ids:
                rest = False
                for j in range(7):
                    if j in d:
                        continue
                    ran,ge = 168*i+24*j,168*i+24*(j+1)
                    if a_a:
                        if category == 'aprt':
                            ratio[ran:ge] *= ratios[j]
                        else:
                            ratio[ran:ge] = gu_byb.answer[ran:ge]*ratios[j]
                    else:
                        ratio[ran+9:ge-3] *= ratios[j]
                break
        if rest:
            for j in range(7):
                ran,ge = 168*i+24*j,168*i+24*(j+1)
                if j in [2, 3]:
                    continue
                ratio[ran+9:ge-3] *= ratios[j]

    return ratio

In [None]:
import xgboost as xgb

_all = ['num_date_time', 'building', 'date_time', 'temp', 'prec', 'wind', 'hum',
       'type', 'all_area', 'cool_area', 'sun', 'dow', 'month',
       'week', 'avg_temp', 'max_temp', 'min_temp', 'temp_diff',
       'dow_hour_mean', 'holiday', 'holiday_mean', 'holiday_std',
       'hour_mean', 'hour_std', 'sin_time', 'cos_time', 'THI', 'WC', 'CDH', 'target']

train, test = get_train_and_test_data('all')
train, test = train[_all], test[_all[:-1]]

train['date'] = pd.to_datetime(train['date_time'])
train['building'] = train['building'].astype('category')
train['type'] = train['type'].astype('category')

x_train = train[train['date'] < '2022-08-18'].drop(['num_date_time', 'date_time', 'target', 'date'], axis=1)
x_valid = train[train['date'] >= '2022-08-18'].drop(['num_date_time', 'date_time', 'target', 'date'], axis=1)
y_train = train[train['date'] < '2022-08-18']['target']
y_valid = train[train['date'] >= '2022-08-18']['target']

dtrain = xgb.DMatrix(data=x_train, label=y_train, enable_categorical=True)
dvalid = xgb.DMatrix(data=x_valid, label=y_valid, enable_categorical=True)

param = {
    'reg_lambda': df['params_reg_lambda'][0] ,
    'gamma': df['params_gamma'][0],
    'reg_alpha': df['params_reg_alpha'][0] ,
    'colsample_bytree': df['params_colsample_bytree'][0] ,
    'subsample': df['params_subsample'][0] ,
    'max_depth': df['params_max_depth'][0],
    'min_child_weight': df['params_min_child_weight'][0],
    'eta' : 0.1,
}


model = xgb.train(params=param, dtrain=dtrain, num_boost_round=1000,
                  evals=[(dvalid, 'valid')], early_stopping_rounds=50, verbose_eval=False)

preds = model.predict(dvalid)

smape = SMAPE(y_valid, preds)

score = smape
best_it = model.best_iteration+1
print(score, best_it) # 4.946742493265771 [844]

4.946742493265771 844


In [None]:
preds = np.array([])

test['date'] = pd.to_datetime(test['date_time'])
test['building'] = test['building'].astype('category')
test['type'] = test['type'].astype('category')

x_train = train.drop(['num_date_time', 'date_time', 'target', 'date'], axis=1)
x_test = test.drop(['num_date_time', 'date_time', 'date'], axis=1)
y_train = train['target']

dtrain = xgb.DMatrix(data=x_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(data=x_test, enable_categorical=True)

pred_df = pd.DataFrame()

i = 0
for seed in tqdm(range(5)):
    param = {
        'reg_lambda': df['params_reg_lambda'][0] ,
        'gamma': df['params_gamma'][0],
        'reg_alpha': df['params_reg_alpha'][0] ,
        'colsample_bytree': df['params_colsample_bytree'][0] ,
        'subsample': df['params_subsample'][0] ,
        'max_depth': df['params_max_depth'][0],
        'min_child_weight': df['params_min_child_weight'][0],
        'seed':seed,
        'eta':0.1
    }

    model = xgb.train(params=param, dtrain=dtrain, num_boost_round=best_it-100)

    y_pred = model.predict(dtest)
    pred_df.loc[:,seed] = y_pred

pred = pred_df.mean(axis=1)
preds = np.append(preds, pred)

submission = pd.read_csv(os.path.join(base_path,'sample_submission.csv'))
submission['answer'] = preds
submission.to_csv('./all.csv', index = False)

del train, test
del dtrain, dtest, x_train, x_test, y_train, pred, preds

100%|██████████| 5/5 [06:40<00:00, 80.12s/it]


In [None]:
test = pd.read_csv(os.path.join(base_path, 'test.csv'))
test.columns = ['num_date_time', 'building', 'date_time', 'temp', 'prec', 'wind', 'hum']
date = pd.to_datetime(test.date_time)
test['dow'] = date.dt.weekday
test['hour'] = date.dt.hour

score = pd.read_csv('./score.csv')
gu_byb = pd.read_csv('./gu_byb.csv')
gu_all = pd.read_csv('./gu_all.csv')
byb = pd.read_csv('./byb.csv')
xg_all = pd.read_csv('./all.csv')

ratio = []
for i in range(100):
    s1, s2 = score.score[i], score.score_all[i]
    b1 = gu_all.answer[i * 168 : (i + 1) * 168] * (s1 / (s1 + s2))
    b2 = gu_byb.answer[i * 168 : (i + 1) * 168] * (s2 / (s1 + s2)) * 0.965
    building = [i + j for i, j in zip(b1, b2)]
    ratio += building

test['ratio'] = ((gu_all.answer + gu_byb.answer) * 0.5 * 0.98 + ratio) * 0.5
test['gu_all'] = gu_all.answer

ratio_values = [0.98, 0.975, 0.975, 0.99, 0.99, 0.985, 0.985]
test['target'] = test['ratio'] * 0.5 + gu_all['answer'] * 0.5
test['target'] = test.apply(lambda row: row['target'] * ratio_values[row['dow']], axis=1)

ratio = ratio_2(np.array(test.target * 0.5 + (byb.answer * 0.5 + xg_all.answer * 0.5) * 0.5))

submission = pd.read_csv(os.path.join(base_path, 'sample_submission.csv'))
submission['answer'] = ratio
submission.to_csv('submission.csv', index=False)
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1802.679429
1,1_20220825 01,1753.577321
2,1_20220825 02,1636.800049
3,1_20220825 03,1567.376650
4,1_20220825 04,1611.254585
...,...,...
16795,100_20220831 19,857.328171
16796,100_20220831 20,799.987912
16797,100_20220831 21,718.364396
16798,100_20220831 22,617.148597
