## Valid_Set 설정

In [None]:
#Library Imports
import random
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
from tqdm import tqdm

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Let pandas display up to 200 rows before truncating the output.
pd.options.display.max_rows = 200

In [None]:
# Load the training table (tt) and the test table (te) from the local data directory.
tt=pd.read_csv('./data/train.csv')
te=pd.read_csv('./data/test.csv')

In [None]:
# Replace missing precipitation values with 0 in both the training and test tables.
for frame in (tt, te):
    frame['강수량(mm)'] = frame['강수량(mm)'].fillna(0)

In [None]:
# Drop the columns that the rest of this script does not use.
# The test table only has its row identifier removed.
tt = tt.drop(['num_date_time', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)'], axis=1)
te = te.drop(['num_date_time'], axis=1)

In [None]:
# Building numbers in order of first appearance in the training table.
건물번호_list = list(tt.건물번호.unique())

# Fill remaining missing values one building at a time, so that a gap at the
# start of one building's rows is never filled with the previous building's
# last reading.
# NOTE: the fill is a forward fill (last valid observation carried forward),
# not a numeric interpolation. `ffill()` is the direct spelling of what
# `interpolate(method='pad')` did; pandas has deprecated the latter.
# Building the result with a single concat (instead of growing an empty
# template frame inside the loop) avoids quadratic copying and keeps the
# original column dtypes.
filled_frames = [
    tt.query(f'건물번호=={building}').ffill() for building in 건물번호_list
]
# pd.concat raises on an empty list, so fall back to an empty slice of tt.
tt_fine = pd.concat(filled_frames) if filled_frames else tt.iloc[0:0]

tt = tt_fine.copy()

In [None]:
# Keep standalone parsed copies of the timestamp columns ...
tt_date = pd.to_datetime(tt['일시'])
te_date = pd.to_datetime(te['일시'])
# ... and convert the timestamp column of both tables to datetime in place.
for frame in (tt, te):
    frame['일시'] = pd.to_datetime(frame['일시'])

In [None]:
# Remove every training row whose timestamp falls on 2022-06-01.
timestamps = tt['일시'].dt
is_first_day = (
    (timestamps.year == 2022)
    & (timestamps.month == 6)
    & (timestamps.day == 1)
)
tt = tt[~is_first_day]

In [None]:
# Single scaler instance; it is re-fitted for every building further below.
scaler = MinMaxScaler()

In [None]:
# Work on independent copies so the per-building splits below do not alias tt / te.
X_train, X_test = tt.copy(), te.copy()

In [None]:
# Columns that are min-max scaled per building further below.
건물번호_features = ['기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)']

In [None]:
# One sub-frame per building number 1..100, keyed 'X_train_001' .. 'X_train_100'.
data_frames_train = {
    f'X_train_{building:03d}': X_train.query(f'건물번호 == {building}')
    for building in range(1, 101)
}

# Same split for the test table, keyed 'X_test_001' .. 'X_test_100'.
data_frames_test = {
    f'X_test_{building:03d}': X_test.query(f'건물번호 == {building}')
    for building in range(1, 101)
}

# Lists of the dictionary keys (strings), in insertion order.
train_all = [*data_frames_train]
test_list = [*data_frames_test]

In [None]:
# Replace each of the 100 entries of train_all / test_list with the rows of
# the corresponding building (list position p holds building p + 1).
for position in range(100):
    building = position + 1
    train_all[position] = X_train.query(f'건물번호 =={building}')
    test_list[position] = X_test.query(f'건물번호 =={building}')

In [None]:
# Will hold one combined (train followed by test) frame per building.
all_list = []

In [None]:
# For every building, stack its training rows on top of its test rows and
# renumber the index from 0.
for position in range(100):
    stacked = pd.concat([train_all[position], test_list[position]])
    all_list.append(stacked.reset_index(drop=True))

In [None]:
# Re-fit the scaler on each building's combined frame and overwrite the
# feature columns with their scaled values.
for position in range(100):
    building_frame = all_list[position]
    scaled_values = scaler.fit_transform(building_frame[건물번호_features])
    building_frame[건물번호_features] = scaled_values

In [None]:
# Start over with two separate empty frames; they are refilled from all_list below.
tt, te = pd.DataFrame(), pd.DataFrame()

In [None]:
# Training part of each building = everything except its last 168 rows.
for position in range(100):
    train_part = all_list[position].iloc[:-168, :]
    tt = pd.concat([tt, train_part.reset_index(drop=True)])

In [None]:
# Test part of each building = its last 168 rows.
for position in range(100):
    test_part = all_list[position].iloc[-168:, :]
    te = pd.concat([te, test_part.reset_index(drop=True)])

In [None]:
# First and last timestamp of the training table.
start_date, end_date = tt['일시'].min(), tt['일시'].max()
# Offset from the first to the last hour of a 7-day window (167 hours).
delta = pd.Timedelta(hours=7 * 24 - 1)

In [None]:
# Cut the training table into consecutive windows of `delta` + 1 hour.
# Both window edges are inclusive; the next window starts one hour after
# the previous one ends.
weekly_dataframes = []
while start_date <= end_date:
    end_of_week = start_date + delta
    in_window = tt['일시'].between(start_date, end_of_week)
    weekly_dataframes.append(tt[in_window])
    start_date = end_of_week + pd.Timedelta(hours=1)

In [None]:
# Twelve independent, initially empty lists (one per weekly window).
(
    num_dataframes_01,
    num_dataframes_02,
    num_dataframes_03,
    num_dataframes_04,
    num_dataframes_05,
    num_dataframes_06,
    num_dataframes_07,
    num_dataframes_08,
    num_dataframes_09,
    num_dataframes_10,
    num_dataframes_11,
    num_dataframes_12,
) = ([] for _ in range(12))

In [None]:
# Index w of this list is the per-building list for weekly window w.
num_date_frame = [
    num_dataframes_01,
    num_dataframes_02,
    num_dataframes_03,
    num_dataframes_04,
    num_dataframes_05,
    num_dataframes_06,
    num_dataframes_07,
    num_dataframes_08,
    num_dataframes_09,
    num_dataframes_10,
    num_dataframes_11,
    num_dataframes_12,
]

In [None]:
# For each of the first 12 weekly windows, append the rows of buildings
# 1..100 (in that order) to the window's list.
for week_idx in range(12):
    week_rows = weekly_dataframes[week_idx]
    per_building = num_date_frame[week_idx]
    for building in range(1, 101):
        per_building.append(week_rows.query(f'건물번호=={building}'))

In [None]:
# Will hold the test rows of each building, in building order.
te_frame = []

In [None]:
# Append the test rows of buildings 1..100, one frame per building.
te_frame.extend(te.query(f'건물번호=={building}') for building in range(1, 101))

In [None]:
def cosine_similarity(u1, u2):
    """Return the cosine similarity of two vectors.

    The result is ``np.dot(u1, u2)`` divided by the product of the two
    norms (``np.linalg.norm``). For 1-D inputs of equal length this is the
    usual cosine of the angle between them.

    Returns 0 when either vector has a norm of exactly zero, where the
    quotient would otherwise be undefined.
    """
    numerator = np.dot(u1, u2)
    magnitudes = (np.linalg.norm(u1), np.linalg.norm(u2))

    # Guard clause: a zero-length vector has no direction.
    if magnitudes[0] == 0 or magnitudes[1] == 0:
        return 0

    return numerator / (magnitudes[0] * magnitudes[1])

In [None]:
# Make sure the four weather columns of every (week, building) frame are float.
weather_columns = ['기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)']
for building_idx in range(100):
    for week_idx in range(12):
        frame = num_date_frame[week_idx][building_idx]
        frame[weather_columns] = frame[weather_columns].astype(float)

In [None]:
# Renumber the rows of every (week, building) frame from 0.
for building_idx in range(100):
    for week_idx in range(12):
        week_frames = num_date_frame[week_idx]
        week_frames[building_idx] = week_frames[building_idx].reset_index(drop=True)

In [None]:
# Renumber the rows of every per-building test frame from 0.
for building_idx in range(100):
    building_test = te_frame[building_idx]
    te_frame[building_idx] = building_test.reset_index(drop=True)

In [None]:
# One score table per weather column: a list of 100 empty dicts (one dict per
# building), filled in afterwards.
기온_list = [{} for _ in range(100)]
강수_list = [{} for _ in range(100)]
풍속_list = [{} for _ in range(100)]
습도_list = [{} for _ in range(100)]

In [None]:
# For every building and each of the 12 weekly windows, score how similar the
# window's series is to the building's test series, separately per weather column.
score_tables = (
    (기온_list, '기온(C)'),
    (강수_list, '강수량(mm)'),
    (풍속_list, '풍속(m/s)'),
    (습도_list, '습도(%)'),
)
for building_idx in range(100):
    for week_idx in range(12):
        for table, column in score_tables:
            table[building_idx][week_idx] = cosine_similarity(
                num_date_frame[week_idx][building_idx][column],
                te_frame[building_idx][column],
            )

In [None]:
# Start the combined score table from the temperature scores.
# Each per-building dict is copied individually: list.copy() is shallow, so
# the inner dicts would otherwise be shared with 기온_list, and accumulating
# into 전체_list in place would also overwrite the temperature-only scores.
전체_list = [dict(building_scores) for building_scores in 기온_list]

In [None]:
# Add the precipitation, wind-speed and humidity scores (in that order) onto
# each building's combined score, for every key present in the other table.
for building_idx in range(100):
    combined = 전체_list[building_idx]
    other_tables = (
        강수_list[building_idx],
        풍속_list[building_idx],
        습도_list[building_idx],
    )
    for week_key in combined:
        for other in other_tables:
            if week_key in other:
                combined[week_key] += other[week_key]

In [None]:
# Will hold, per building, the combined scores ordered from highest to lowest.
sorted_전체_list = []

In [None]:
# Order each building's {week: score} mapping by score, highest first.
for building_idx in range(100):
    ranked_pairs = sorted(
        전체_list[building_idx].items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    sorted_전체_list.append(dict(ranked_pairs))

In [None]:
# Will hold, per building, the keys of its three highest-scoring weekly windows.
valid_list = []

In [None]:
# Keep the first three keys of each building's score-ordered mapping.
for building_idx in range(100):
    ranked_weeks = list(sorted_전체_list[building_idx])
    valid_list.append(ranked_weeks[:3])

In [None]:
#pd.DataFrame(valid_list).to_csv('valid_list.csv', index=False)