<a href="https://colab.research.google.com/github/JHyuk2/DACON/blob/main/DACON_%EC%A0%84%EB%A0%A5%EC%98%88%EC%B8%A12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/myDrive')

Mounted at /content/myDrive


In [None]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm
import warnings
warnings.filterwarnings(action='ignore')

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from math import sqrt

In [None]:
# Folder on the mounted Google Drive that holds the competition CSV files.
path = '/content/myDrive/MyDrive/data/datasets/전력예측/'
# Hourly readings per building; the train file also carries the power-consumption target.
train_df = pd.read_csv(path + 'train.csv')
test_df = pd.read_csv(path + 'test.csv')
# One row of static metadata per building; merged into train/test on 건물번호 later.
building_info_df = pd.read_csv(path +'building_info.csv')
# Submission template whose 'answer' column is filled with predictions later.
sample_submission = pd.read_csv(path +'sample_submission.csv')

# torch로 하는 lstm

In [None]:
# seed 고정하기.
def seed_everything(seed):
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), exports ``PYTHONHASHSEED`` and puts cuDNN
    into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the already running
    # interpreter is not changed by setting this variable.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed the remaining GPUs; a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick kernels per run, which can
    # select different algorithms between runs. It must be off for the
    # deterministic flag above to be meaningful.
    torch.backends.cudnn.benchmark = False

seed_everything(42) # Seed 고정

In [None]:
building_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   건물번호          100 non-null    int64  
 1   건물유형          100 non-null    object 
 2   연면적(m2)       100 non-null    float64
 3   냉방면적(m2)      100 non-null    float64
 4   태양광용량(kW)     100 non-null    object 
 5   ESS저장용량(kWh)  100 non-null    object 
 6   PCS용량(kW)     100 non-null    object 
dtypes: float64(2), int64(1), object(4)
memory usage: 5.6+ KB


In [None]:
building_info_df.head()

Unnamed: 0,건물번호,건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW)
0,1,건물기타,110634.0,39570.0,-,-,-
1,2,건물기타,122233.47,99000.0,-,-,-
2,3,건물기타,171243.0,113950.0,40,-,-
3,4,건물기타,74312.98,34419.62,60,-,-
4,5,건물기타,205884.0,150000.0,-,2557,1000


In [None]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204000 entries, 0 to 203999
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   num_date_time  204000 non-null  object 
 1   건물번호           204000 non-null  int64  
 2   일시             204000 non-null  object 
 3   기온(C)          204000 non-null  float64
 4   강수량(mm)        43931 non-null   float64
 5   풍속(m/s)        203981 non-null  float64
 6   습도(%)          203991 non-null  float64
 7   일조(hr)         128818 non-null  float64
 8   일사(MJ/m2)      116087 non-null  float64
 9   전력소비량(kWh)     204000 non-null  float64
dtypes: float64(7), int64(1), object(2)
memory usage: 15.6+ MB


In [None]:
train_df.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,,,1047.36
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,,,974.88
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,,,953.76
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,,,986.4


## 1. 결측값 처리

In [None]:
# 기상데이터는 전력수요랑 연관이 있다.
# 일조량 강수량 일사량 결측값이 너무 많다.
train_df.isna().sum()

num_date_time         0
건물번호                  0
일시                    0
기온(C)                 0
강수량(mm)          160069
풍속(m/s)              19
습도(%)                 9
일조(hr)            75182
일사(MJ/m2)         87913
전력소비량(kWh)            0
dtype: int64

In [None]:
# 결측값 처리 및 시간 columns 추가
import datetime

def preprocessing(df):
    """Fill missing rainfall and derive calendar features from ``일시``.

    The frame is modified in place and also returned, so both
    ``preprocessing(df)`` and ``df = preprocessing(df)`` work.

    Args:
        df: DataFrame with a ``강수량(mm)`` column and an ``일시`` column of
            strings formatted as ``'YYYYMMDD HH'``.

    Returns:
        The same DataFrame with ``강수량(mm)`` NaNs replaced by 0 and the
        integer columns ``month``, ``day``, ``hour``, ``weekday``
        (Monday=0 ... Sunday=6) and ``weekend`` (1 for Saturday/Sunday) added.
    """
    # A missing rainfall reading is treated as "no rain".
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection, which is deprecated and may not write through.
    df['강수량(mm)'] = df['강수량(mm)'].fillna(0)

    # Parse the whole column once; the year is taken from the data rather than
    # being hard-coded, so weekdays stay correct for any year.
    timestamps = pd.to_datetime(df['일시'], format='%Y%m%d %H')

    df['month'] = timestamps.dt.month.astype('int64')
    df['day'] = timestamps.dt.day.astype('int64')
    df['hour'] = timestamps.dt.hour.astype('int64')

    # weekday: Monday=0 ... Sunday=6; weekend flags Saturday (5) and Sunday (6).
    df['weekday'] = timestamps.dt.weekday.astype('int64')
    df['weekend'] = (df['weekday'] > 4).astype('int64')

    return df

In [None]:
# Sunshine duration and solar radiation are missing in a large share of the
# training rows (see the isna() counts above), so they are dropped rather
# than imputed.
columns_to_drop = ['일조(hr)', '일사(MJ/m2)']

# errors='ignore' keeps this cell re-runnable: a second execution no longer
# raises KeyError because the columns are already gone.
train_df = train_df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
train_df.columns

Index(['num_date_time', '건물번호', '일시', '기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)',
       '전력소비량(kWh)'],
      dtype='object')

In [None]:
train_df = preprocessing(train_df)
test_df = preprocessing(test_df)

In [None]:
# Median-impute the few missing humidity / wind-speed readings
# (another strategy can be substituted here if it suits the data better).
imputer = SimpleImputer(strategy='median')
columns_to_fill = ['습도(%)', '풍속(m/s)',]
train_df[columns_to_fill] = imputer.fit_transform(train_df[columns_to_fill])
# Reuse the medians learned on train for the test set so that no NaN reaches
# model.predict and no statistic is estimated from test data.
test_df[columns_to_fill] = imputer.transform(test_df[columns_to_fill])

In [None]:
# train_df와 building_info_df를 건물번호를 기준으로 merge
train_df = pd.merge(train_df, building_info_df, on='건물번호', how='left')

# test_df와 building_info_df를 건물번호를 기준으로 merge
test_df = pd.merge(test_df, building_info_df, on='건물번호', how='left')

In [None]:
train_df.isnull().sum()

num_date_time    0
건물번호             0
일시               0
기온(C)            0
강수량(mm)          0
풍속(m/s)          0
습도(%)            0
전력소비량(kWh)       0
month            0
day              0
hour             0
weekday          0
weekend          0
건물유형             0
연면적(m2)          0
냉방면적(m2)         0
태양광용량(kW)        0
ESS저장용량(kWh)     0
PCS용량(kW)        0
dtype: int64

In [None]:
train_df.columns

Index(['num_date_time', '건물번호', '일시', '기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)',
       '전력소비량(kWh)', 'month', 'day', 'hour', 'weekday', 'weekend', '건물유형',
       '연면적(m2)', '냉방면적(m2)', '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)'],
      dtype='object')

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Initialise the scaler; the output range is (0.1, 0.9) instead of the default (0, 1).
custom_feature_range = (0.1, 0.9)
scaler = MinMaxScaler(feature_range=custom_feature_range)

# Fit on the training cooling area and store the scaled value in a new column.
train_df['냉방면적(m2)_scaled'] = scaler.fit_transform(train_df[['냉방면적(m2)']])

# Transform the test cooling area with the scaler fitted on train.
test_df['냉방면적(m2)_scaled'] = scaler.transform(test_df[['냉방면적(m2)']])

# The raw cooling-area column is no longer needed once the scaled copy exists.
train_df.drop(columns=['냉방면적(m2)'], inplace=True)
test_df.drop(columns=['냉방면적(m2)'], inplace=True)

In [None]:
# correlation check

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# sMAPE 손실 함수 정의
def smape_loss(true, predicted):
    """Return the symmetric mean absolute percentage error as a fraction.

    Computed with NumPy as ``mean(2 * |predicted - true| / (|true| + |predicted| + eps))``.
    The result is not multiplied by 100.

    Args:
        true: Array-like of observed values.
        predicted: Array-like of predicted values, broadcastable to ``true``.

    Returns:
        The mean sMAPE over all elements.
    """
    eps = 1e-10  # keeps the denominator non-zero when both values are 0
    denominator = np.abs(true) + np.abs(predicted) + eps
    per_element = np.abs(predicted - true) / denominator * 2.0
    return np.mean(per_element)

In [None]:
train_df.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),전력소비량(kWh),month,day,hour,weekday,weekend,건물유형,연면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW),냉방면적(m2)_scaled
0,1_20220601 00,1,20220601 00,18.6,0.0,0.9,42.0,1085.28,6,1,0,2,0,건물기타,110634.0,-,-,-,0.102893
1,1_20220601 01,1,20220601 01,18.0,0.0,1.1,45.0,1047.36,6,1,1,2,0,건물기타,110634.0,-,-,-,0.102893
2,1_20220601 02,1,20220601 02,17.7,0.0,1.5,45.0,974.88,6,1,2,2,0,건물기타,110634.0,-,-,-,0.102893
3,1_20220601 03,1,20220601 03,16.7,0.0,1.4,48.0,953.76,6,1,3,2,0,건물기타,110634.0,-,-,-,0.102893
4,1_20220601 04,1,20220601 04,18.4,0.0,2.8,43.0,986.4,6,1,4,2,0,건물기타,110634.0,-,-,-,0.102893


In [None]:
# 피처 선택
feature_columns = ['기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)',
                   'month', 'day', 'hour', 'weekday', 'weekend', '냉방면적(m2)_scaled']

target_column = '전력소비량(kWh)'  # 예측할 타겟 컬럼

X_train = train_df[feature_columns]
y_train = train_df[target_column]

# 이거로 y값에 대해 예측해야함.
X_test = test_df[feature_columns]

In [None]:
pd.concat([X_train, y_train], axis=1).corr()

Unnamed: 0,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,hour,weekday,weekend,냉방면적(m2)_scaled,전력소비량(kWh)
기온(C),1.0,-0.068643,0.220174,-0.389834,0.4271998,0.06949836,0.2914288,0.04288145,0.09065367,-0.003055486,0.17119
강수량(mm),-0.068643,1.0,0.048094,0.19957,0.04343621,0.02710738,0.01372893,-0.05596441,-0.05948565,-0.002753116,0.015083
풍속(m/s),0.220174,0.048094,1.0,-0.219093,-0.08634987,0.1002209,0.1770952,-0.1065863,-0.09137064,-0.05379001,0.109953
습도(%),-0.389834,0.19957,-0.219093,1.0,0.253921,0.1184135,-0.2805908,-0.01338743,-0.03368183,-0.02051841,-0.129548
month,0.4272,0.043436,-0.08635,0.253921,1.0,-0.1333186,-8.672975e-17,-0.0377367,-0.01006946,5.181644e-15,0.055842
day,0.069498,0.027107,0.100221,0.118414,-0.1333186,1.0,2.2105570000000003e-17,0.02227065,0.03079794,-5.90593e-16,0.009233
hour,0.291429,0.013729,0.177095,-0.280591,-8.672975e-17,2.2105570000000003e-17,1.0,-2.543505e-18,-2.417248e-19,-3.588211e-16,0.099594
weekday,0.042881,-0.055964,-0.106586,-0.013387,-0.0377367,0.02227065,-2.543505e-18,1.0,0.7912646,1.722227e-15,-0.0436
weekend,0.090654,-0.059486,-0.091371,-0.033682,-0.01006946,0.03079794,-2.417248e-19,0.7912646,1.0,1.272109e-16,-0.057888
냉방면적(m2)_scaled,-0.003055,-0.002753,-0.05379,-0.020518,5.181644e-15,-5.90593e-16,-3.588211e-16,1.722227e-15,1.272109e-16,1.0,0.016444


In [None]:
def create_sequences(data, target, window_size=24):
    """Build sliding feature windows and the target that follows each window.

    Window ``i`` covers rows ``i .. i + window_size - 1`` of ``data`` and is
    paired with ``target`` at row ``i + window_size``, i.e. the step right
    after the window.

    Args:
        data: 2-D features (DataFrame or array-like), one row per time step.
        target: 1-D targets (Series or array-like) aligned row-by-row with ``data``.
        window_size: Number of consecutive rows per window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays with ``len(data) - window_size``
        entries each; both are empty when ``data`` is not longer than the window.
    """
    # Convert once to plain arrays: indexing is then strictly positional
    # (independent of the pandas index) and slicing is cheap.
    features = np.asarray(data)
    labels = np.asarray(target)

    n_windows = max(len(features) - window_size, 0)

    X = [features[i:i + window_size] for i in range(n_windows)]
    # The label is the row immediately after the window. The previous
    # `i + window_size + 1` skipped one step and ran past the end of
    # `target` on the last window.
    y = [labels[i + window_size] for i in range(n_windows)]

    # NOTE(review): windows are taken over the rows as given; if `data` stacks
    # several buildings, windows near a boundary mix two buildings - confirm
    # whether per-building windowing is wanted.
    return np.array(X), np.array(y)

X, y = create_sequences(X_train, y_train)  # y_train은 전력소비량(kWh)의 별도의 배열이어야 합니다.


In [None]:
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, BatchNormalization

model = Sequential()

# CNN layers
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(24, X_train.shape[1])))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))  # Dropout layer 추가
model.add(BatchNormalization())  # BatchNormalization layer 추가

# LSTM layer
model.add(LSTM(50, activation='relu', return_sequences=True))
model.add(Dropout(0.2))  # Dropout layer 추가
model.add(BatchNormalization())  # BatchNormalization layer 추가

model.add(LSTM(50))
model.add(Dropout(0.2))  # Dropout layer 추가
model.add(BatchNormalization())  # BatchNormalization layer 추가

# Dense layer
model.add(Dense(1))

model.compile(optimizer='adam', loss='mse')



In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Early stopping 설정
early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min', restore_best_weights=True)

# Model checkpoint 설정
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)

callbacks = [early_stopping, model_checkpoint]

# fit 함수에 callbacks 인자 추가
model.fit(X, y, epochs=20, batch_size=64, validation_split=0.2, callbacks=callbacks)

Epoch 1/20
Epoch 1: val_loss improved from inf to 2488357.50000, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_loss improved from 2488357.50000 to 1794481.12500, saving model to best_model.h5
Epoch 3/20
Epoch 3: val_loss did not improve from 1794481.12500
Epoch 4/20
Epoch 4: val_loss did not improve from 1794481.12500
Epoch 5/20
Epoch 5: val_loss did not improve from 1794481.12500
Epoch 6/20
Epoch 6: val_loss did not improve from 1794481.12500
Epoch 7/20
Epoch 7: val_loss did not improve from 1794481.12500
Epoch 8/20
Epoch 8: val_loss improved from 1794481.12500 to 837064.68750, saving model to best_model.h5
Epoch 9/20
Epoch 9: val_loss did not improve from 837064.68750
Epoch 10/20
Epoch 10: val_loss did not improve from 837064.68750
Epoch 11/20
Epoch 11: val_loss improved from 837064.68750 to 724988.18750, saving model to best_model.h5
Epoch 12/20
Epoch 12: val_loss did not improve from 724988.18750
Epoch 13/20
Epoch 13: val_loss did not improve from 724988.18750
Epoch 14/20
E

<keras.callbacks.History at 0x7e70dc6d98d0>

In [None]:
model.fit(X, y, epochs=20, batch_size=64, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

KeyboardInterrupt: ignored

In [None]:
from tensorflow.keras.models import load_model

# 저장된 모델 파일 로드
loaded_model = load_model('best_model.h5')

loaded_model.predict(X_test_sequecne)

In [None]:
from tensorflow.keras.models import load_model

def create_test_sequences(data, window_size=24):
    """Return every full sliding window of ``data`` as one 3-D array.

    Args:
        data: DataFrame with one row per time step.
        window_size: Number of consecutive rows per window.

    Returns:
        NumPy array holding ``len(data) - window_size + 1`` windows, each the
        ``.values`` of ``window_size`` consecutive rows.
    """
    n_windows = len(data) - window_size + 1
    return np.array(
        [data.iloc[start:start + window_size].values for start in range(n_windows)]
    )


last_train = X_train.tail(24)
X_test_seq = pd.concat([last_train, X_test], axis=0).reset_index(drop=True)
X_test_sequences = create_test_sequences(X_test_seq)


# 저장된 모델 파일 로드
loaded_model = load_model('best_model.h5')

predictions = loaded_model.predict(X_test_sequences)






In [None]:
# 제출 파일 생성
sample_submission['answer'] = predictions[1:].flatten()  # 예측 값이 2차원 배열 형태로 나오기 때문에 1차원으로 변환
sample_submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1318.827881
1,1_20220825 01,1371.443726
2,1_20220825 02,1318.039795
3,1_20220825 03,1231.755249
4,1_20220825 04,1203.043579
...,...,...
16795,100_20220831 19,1298.837646
16796,100_20220831 20,1204.521240
16797,100_20220831 21,958.544861
16798,100_20220831 22,875.678772


In [None]:
sample_submission

In [None]:
TRAIN_SPLIT = int(len(X_train) * 0.8)  # X_train의 길이의 80%를 기준으로 TRAIN_SPLIT 설정

x_train_multi, y_train_multi = X_train[:TRAIN_SPLIT], y_train[:TRAIN_SPLIT]
x_val_multi, y_val_multi = X_train[TRAIN_SPLIT:], y_train[TRAIN_SPLIT:]

## 전처리 2.
1) month, day, hour, weekend 범주형으로 처리하기  
2) 나머지 값들은 scaling  
3)

---



In [None]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 204000 entries, 0 to 203999
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   기온(C)            204000 non-null  float64
 1   강수량(mm)          204000 non-null  float64
 2   풍속(m/s)          204000 non-null  float64
 3   습도(%)            204000 non-null  float64
 4   month            204000 non-null  int64  
 5   day              204000 non-null  int64  
 6   hour             204000 non-null  int64  
 7   weekday          204000 non-null  int64  
 8   weekend          204000 non-null  int64  
 9   냉방면적(m2)_scaled  204000 non-null  float64
dtypes: float64(5), int64(5)
memory usage: 17.1 MB


In [None]:
y_train_multi.shape

(163200,)

In [None]:
x_train_multi.shape

(163200, 10)

In [None]:
# 이전에 지정한 값들:
n_seq = 1      # 시퀀스 내부를 더 작은 시퀀스로 나눌 수 있는 경우를 위해 설정, 현재는 1로 설정
n_steps = 24   # 윈도우 크기
n_features = 10  # 특성 수

# DataFrame을 numpy 배열로 변환:
x_train_multi_np = x_train_multi.to_numpy()
x_val_multi_np = x_val_multi.to_numpy()

# 데이터의 형태를 변경합니다.
x_train_multi_np = x_train_multi_np.reshape((x_train_multi_np.shape[0] // n_steps, n_steps, n_features))
x_val_multi_np = x_val_multi_np.reshape((x_val_multi_np.shape[0] // n_steps, n_steps, n_features))

In [None]:
BUFFER_SIZE = len(x_train_multi_np)
BATCH_SIZE = 256
EVALUATION_INTERVAL = len(x_train_multi_np) // BATCH_SIZE
EPOCHS = 40
past_history = 24
future_target = 1
STEP = 1

# x_*_multi_np were built by reshaping the rows into non-overlapping blocks,
# so block k covers rows k*past_history .. (k+1)*past_history - 1
# (assumes n_steps used for the reshape equals past_history - TODO confirm).
# Pair each block with the target of its last row. The previous
# `y[:n_blocks]` took the first n_blocks hourly targets instead, which do not
# line up with the blocks at all.
y_train_windows = y_train_multi.to_numpy()[past_history - 1::past_history][:len(x_train_multi_np)]
y_val_windows = y_val_multi.to_numpy()[past_history - 1::past_history][:len(x_val_multi_np)]

# Convert to TensorFlow datasets.
train_data_multi = tf.data.Dataset.from_tensor_slices((x_train_multi_np, y_train_windows))
train_data_multi = train_data_multi.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat()

val_data_multi = tf.data.Dataset.from_tensor_slices((x_val_multi_np, y_val_windows))
val_data_multi = val_data_multi.batch(BATCH_SIZE).repeat()

In [None]:
train_data_multi

<_RepeatDataset element_spec=(TensorSpec(shape=(None, 24, 10), dtype=tf.float64, name=None), TensorSpec(shape=(None,), dtype=tf.float64, name=None))>

In [None]:
n_features = X_train.shape[1]  # X_train의 컬럼 수

model = tf.keras.models.Sequential([
    # 입력 데이터의 형태를 지정합니다.
    tf.keras.layers.Input(shape=(past_history, n_features)),

    # TimeDistributed 레이어를 사용하여 각 시퀀스에 Conv1D를 적용합니다.
    tf.keras.layers.TimeDistributed(Conv1D(filters=64, kernel_size=1, activation='relu')),
    tf.keras.layers.TimeDistributed(MaxPooling1D(pool_size=2)),
    tf.keras.layers.TimeDistributed(Flatten()),

    # LSTM 레이어를 사용하여 시퀀스 데이터를 처리합니다.
    tf.keras.layers.LSTM(50, return_sequences=True, activation='relu'),
    tf.keras.layers.LSTM(50, activation='relu'),

    # 예측을 수행하는 Dense 레이어
    tf.keras.layers.Dense(future_target)
])

model.compile(optimizer=tf.keras.optimizers.RMSprop(clipvalue=1.0), loss='mae')

NameError: ignored

In [None]:
checkpoint_path = "training_2_"  # cols[0]이 무엇인지 모르므로 적절한 이름을 지정하십시오.
end_point = "cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.join(path, checkpoint_path, end_point)  # main_path도 무엇인지 모르므로 적절한 경로를 지정하십시오.

cp_callback = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=7),
    tf.keras.callbacks.ModelCheckpoint(checkpoint_dir, save_weights_only=True, verbose=1, monitor='val_loss', save_best_only=True)
]

history = model.fit(
    train_data_multi,
    epochs=EPOCHS,
    steps_per_epoch=EVALUATION_INTERVAL,
    validation_data=val_data_multi,
    validation_steps=50,
    callbacks=cp_callback
)

In [None]:
# 체크포인트와 조기 종료 콜백 설정
checkpoint_path = "training_checkpoints.ckpt"
cp_callback = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10),
    tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True, monitor='val_loss', save_best_only=True, verbose=1)
]

# 모델 훈련
history = model.fit(
    train_data_multi,
    epochs=EPOCHS,
    steps_per_epoch=EVALUATION_INTERVAL,
    validation_data=val_data_multi,
    validation_steps=50,
    callbacks=cp_callback
)

In [None]:
model.load_weights(checkpoint_path)

# 예측 수행
for x, y in val_data_multi.take(3):
    multi_step_plot(x[0], y[0], model.predict(x)[0])

In [None]:
# 모델 구성

multi_step_model = tf.keras.models.Sequential()
multi_step_model.add(TimeDistributed(Conv1D(filters=64, kernel_size=1, activation='relu'), input_shape=(None, n_steps, n_features)))
multi_step_model.add(TimeDistributed(MaxPooling1D(pool_size=2)))
multi_step_model.add(TimeDistributed(Flatten()))

multi_step_model.add(tf.keras.layers.LSTM(16, return_sequences=True, input_shape=x_train_multi.shape[-2:], activation='softsign'))
multi_step_model.add(tf.keras.layers.LSTM(8, activation='relu'))
multi_step_model.add(tf.keras.layers.Dense(24))

### 콜백 함수 정의

학습 중에 사용할 콜백을 정의합니다. 이 콜백에는 조기 중단(EarlyStopping)과 체크포인트 저장(ModelCheckpoint)이 포함되어 있습니다.

In [None]:
# 콜백
cp_callback = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=7),
    tf.keras.callbacks.ModelCheckpoint(checkpoint_dir, save_weights_only=True, verbose=1, monitor='val_loss', save_best_only=True)
]



In [None]:
multi_step_model.compile(optimizer=tf.keras.optimizers.RMSprop(clipvalue=1.0), loss='mae')
EVALUATION_INTERVAL = 200
EPOCHS = 40

multi_step_history = multi_step_model.fit(train_data_multi, epochs=EPOCHS, steps_per_epoch=EVALUATION_INTERVAL, validation_data=val_data_multi, validation_steps=50)

In [None]:
def create_sequences(X_data, y_data, window_size=24):
    """Build sliding feature windows and the target that follows each window.

    Window ``i`` covers rows ``i .. i + window_size - 1`` of ``X_data`` and is
    paired with ``y_data`` at row ``i + window_size``.

    Args:
        X_data: 2-D features (DataFrame or array-like), one row per time step.
        y_data: Targets (Series, DataFrame or array-like) aligned row-by-row
            with ``X_data``.
        window_size: Number of consecutive rows per window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays with ``len(X_data) - window_size``
        entries each.
    """
    # Work on plain arrays so that every lookup is positional. Indexing a
    # pandas Series with an integer is label-based and fails (or picks the
    # wrong row) whenever the index is not 0..n-1, e.g. after slicing.
    features = np.asarray(X_data)
    labels = np.asarray(y_data)

    n_windows = len(features) - window_size
    X = [features[i:i + window_size] for i in range(n_windows)]
    y = [labels[i + window_size] for i in range(n_windows)]
    return np.array(X), np.array(y)


In [None]:
X_train_seq, y_train_seq = create_sequences(X_train, y_train)

In [None]:
X_train_seq.shape

(203976, 24, 10)

In [None]:
y_train_seq.shape

(203976, 1)

In [None]:
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

model = Sequential()
model.add(LSTM(60, activation='relu', input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])))
model.add(Dense(1))

optimizer = Adam(learning_rate=0.0005)
model.compile(loss='mse', optimizer=optimizer)



In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# File the best model (by validation loss) is written to.
checkpoint_filepath = 'best_model_weights.h5'

# ModelCheckpoint callback
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_best_only=True, # keep only the best model seen so far
    monitor='val_loss', # judged by validation loss
    mode='min', # lower is better
    verbose=1
)

# EarlyStopping callback
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=20, # stop training when val_loss has not improved for 20 epochs
    verbose=1
)

# Training parameters chosen for faster training
batch_size = 256  # batch size used for fit
epochs = 50  # upper bound on epochs; early stopping may end training sooner

# Train with both callbacks; the last 10% of the samples is held out for validation.
model.fit(X_train_seq, y_train_seq, epochs=epochs, batch_size=batch_size, validation_split=0.1, callbacks=[checkpoint_callback, early_stopping_callback])



KeyboardInterrupt: ignored

In [None]:
def create_test_sequences(data, feature_columns, window_size=24):
    """Return sliding windows over the selected feature columns.

    Args:
        data: DataFrame with one row per time step.
        feature_columns: Column labels to keep in each window.
        window_size: Number of consecutive rows per window.

    Returns:
        NumPy array holding ``len(data) - window_size`` windows; the window
        made of the final ``window_size`` rows is not included.
    """
    starts = range(len(data) - window_size)
    windows = [
        data.iloc[start:start + window_size][feature_columns].values
        for start in starts
    ]
    return np.array(windows)

In [None]:
# Test 데이터를 이용한 시퀀스 생성
# train의 마지막 24시간 데이터와 test 데이터를 합쳐서 시퀀스 생성
combined_data = np.vstack((train_df[feature_columns].iloc[-24:].values, test_df[feature_columns].values))

# combined_data를 데이터프레임 형태로 변환
combined_df = pd.DataFrame(combined_data, columns=feature_columns)

# 테스트 시퀀스 생성
X_test = create_test_sequences(combined_df, feature_columns, window_size=24)

In [None]:
# 최적의 모델 가중치 불러오기
model.load_weights(checkpoint_filepath)

predictions = model.predict(X_test)
# # 스케일링된 X_test를 사용하여 예측
# y_pred_scaled = model.predict(X_test)

# # 스케일링 역변환
# y_pred_original = y_scaler.inverse_transform(y_pred_scaled)



In [None]:
predictions[:20]

array([[1569.0565],
       [1594.8502],
       [1625.6116],
       [1661.8611],
       [1710.1882],
       [1771.1085],
       [1854.3898],
       [2009.4423],
       [2236.3193],
       [2447.8557],
       [2549.8184],
       [2632.1704],
       [2647.1824],
       [2648.614 ],
       [2622.165 ],
       [2589.929 ],
       [2639.3477],
       [2672.2825],
       [2478.517 ],
       [2342.985 ]], dtype=float32)

In [None]:

# 제출 파일 생성
sample_submission['answer'] = predictions.flatten()  # 예측 값이 2차원 배열 형태로 나오기 때문에 1차원으로 변환
sample_submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1569.056519
1,1_20220825 01,1594.850220
2,1_20220825 02,1625.611572
3,1_20220825 03,1661.861084
4,1_20220825 04,1710.188232
...,...,...
16795,100_20220831 19,2409.061523
16796,100_20220831 20,2247.255859
16797,100_20220831 21,2075.650391
16798,100_20220831 22,1894.940186


In [None]:
# `path` already ends with a separator, so join instead of concatenating
# '/...' (which produced a doubled slash), and give the file a .csv extension
# so it is recognised as a CSV submission.
sample_submission.to_csv(os.path.join(path, 'sample_submission4.csv'), index=False)

CNN + LSTM

In [None]:
from sklearn.preprocessing import MinMaxScaler
from keras.layers import LSTM, Dense, Flatten, TimeDistributed, Conv1D, MaxPooling1D, Dropout

def create_sequences(data, feature_columns, target_column, window_size=24):
    """Build sliding feature windows and next-step targets from one frame.

    Window ``i`` holds ``feature_columns`` for rows ``i .. i + window_size - 1``
    and is paired with ``target_column`` at row ``i + window_size``.

    Args:
        data: DataFrame containing both the feature and the target columns.
        feature_columns: Column labels used as model inputs.
        target_column: Column label of the value to predict.
        window_size: Number of consecutive rows per window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays with ``len(data) - window_size``
        entries each.
    """
    starts = range(len(data) - window_size)
    windows = [
        data.iloc[start:start + window_size][feature_columns].values
        for start in starts
    ]
    labels = [data.iloc[start + window_size][target_column] for start in starts]
    return np.array(windows), np.array(labels)

X_train, y_train = create_sequences(train_df, feature_columns, target_column, window_size=24)

In [None]:
X_train.shape

(203976, 24, 10)

In [None]:
y_train.shape

(203976,)

In [None]:

# # 모델 학습
# model2.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.2)

from keras.layers import RepeatVector

# Build the CNN-LSTM model.
model2 = Sequential()

# Convolution + pooling over the time axis of each input window, then flatten.
# (The Conv1D layer is added directly; it is not wrapped in TimeDistributed.)
model2.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
model2.add(MaxPooling1D(pool_size=3))
model2.add((Flatten()))

# Open question from the author: is dropout the problem?
# RepeatVector's argument evaluates to 1 when y_train is 1-D, so the LSTMs
# below receive a sequence of length 1.
# NOTE(review): that may be related to the constant predictions shown further
# down - confirm.
model2.add(RepeatVector(y_train.shape[1] if len(y_train.shape) > 1 else 1))
model2.add(LSTM(50, activation='relu', return_sequences=True))
model2.add(Dropout(0.2))
model2.add(LSTM(30))
model2.add(Dense(1))

model2.compile(optimizer='adam', loss='mse')

# Train the model; the last 20% of the samples is used for validation.
model2.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.2)



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7c4133ff5fc0>

In [None]:
# CNN + LSTM prediction
predictions2 = model2.predict(X_test)



In [None]:
predictions2[:29]

array([[1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971],
       [1117.7971]], dtype=float32)

In [None]:
s

In [None]:
train_df.columns

Index(['num_date_time', '건물번호', '일시', '기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)',
       '전력소비량(kWh)', 'month', 'day', 'hour', 'weekday', 'weekend', '건물유형',
       '연면적(m2)', '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)', '냉방면적(m2)_scaled'],
      dtype='object')

In [None]:
# 1. 트레이닝 데이터의 마지막 24 타임스탬프와 테스트 데이터를 결합
combined = np.vstack((train_df.iloc[-24:, 1:-1].values, test_df.iloc[:, 2:].values))

# 2. 테스트 시퀀스 생성
X_test = []
for i in range(24, combined.shape[0]):
    X_test.append(combined[i-24:i, :])

X_test = np.array(X_test)

# 3. 모델로부터 예측 받기
predictions = model.predict(X_test)

# 4. 처음 24개 예측 값 무시
final_predictions = predictions[24:]

In [None]:

# 시퀀스 데이터 생성
X_test = create_test_sequences(test_df, feature_columns, window_size=24)

In [None]:
predictions = model.predict(X_test)



In [None]:

predictions

array([[1549.4161],
       [1579.1659],
       [1821.3915],
       ...,
       [2006.3325],
       [1775.1696],
       [1647.6215]], dtype=float32)

In [None]:
predicted_df = pd.DataFrame(predictions, columns=['전력소비량(kWh)'])

제출파일 생성

In [None]:
sample_submission['answer'] = predictions
sample_submission.head()

ValueError: ignored

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# sMAPE 손실 함수 정의
def smape_loss(true, predicted):
    """Return the symmetric mean absolute percentage error as a fraction.

    NumPy implementation of
    ``mean(2 * |predicted - true| / (|true| + |predicted| + eps))``; the
    result is not scaled by 100.

    Args:
        true: Array-like of observed values.
        predicted: Array-like of predicted values, broadcastable to ``true``.

    Returns:
        The mean sMAPE over all elements.
    """
    tiny = 1e-10  # avoids a zero denominator when both values are 0
    magnitude = np.abs(true) + np.abs(predicted) + tiny
    return np.mean(np.abs(predicted - true) / magnitude * 2.0)

In [None]:
feature_columns = ['기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)',
                   'month', 'day', 'hour', 'weekday', 'weekend', '냉방면적(m2)']


# 시퀀스 길이 설정 (예: 하루 24시간)
sequence_length = 24

# 데이터 분할 및 재구성
X = merged_train[feature_columns].values
y = merged_train['전력소비량(kWh)'].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# # 입력 데이터 재구성
# X_train = X_train.reshape(-1, sequence_length, len(feature_columns))
# X_val = X_val.reshape(-1, sequence_length, len(feature_columns))

In [None]:
sequence_length = 24  # 시퀀스 길이 (Window Size)
X_train_reshaped = []
y_train_reshaped = []

for i in range(len(X) - sequence_length + 1):
    X_train_reshaped.append(X[i:i+sequence_length])
    y_train_reshaped.append(y[i+sequence_length-1])

X_train_reshaped = np.array(X_train_reshaped)
y_train_reshaped = np.array(y_train_reshaped)

# 모델 생성 및 컴파일
model = Sequential()
model.add(LSTM(128, input_shape=(sequence_length, len(feature_columns)), activation='relu', return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64, activation='relu', return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')

# 모델 학습
model.fit(X_train_reshaped, y_train_reshaped, epochs=5, batch_size=64, validation_split=0.1)

# 테스트 데이터 예측
X_test = merged_test[feature_columns].values
X_test_reshaped = []

for i in range(len(X_test) - sequence_length + 1):
    X_test_reshaped.append(X_test[i:i+sequence_length])

X_test_reshaped = np.array(X_test_reshaped)

# 예측
predicted_power = model.predict(X_test_reshaped).squeeze()

# 예측 결과 출력
print("Predicted Power Consumption (kWh):", predicted_power)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
 582/2869 [=====>........................] - ETA: 3:52 - loss: nan

KeyboardInterrupt: ignored

In [None]:
X_train.shape

(183600, 10)

In [None]:
# 입력 데이터 재구성
X_train_reshaped = []
y_train_reshaped = []

for i in range(len(X_train) - sequence_length + 1):
    X_train_reshaped.append(X_train[i:i+sequence_length])
    y_train_reshaped.append(y_train[i+sequence_length-1])

X_train_reshaped = np.array(X_train_reshaped)
y_train_reshaped = np.array(y_train_reshaped)

# 모델 학습
model.fit(X_train_reshaped, y_train_reshaped, epochs=5, batch_size=64)

In [None]:
# LSTM 모델 생성
model = Sequential()
model.add(LSTM(128, input_shape=(sequence_length, len(feature_columns)), activation='relu', return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64, activation='relu', return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1))



In [None]:
# SMAPE 손실 함수 정의
def smape_loss(y_true, y_pred):
    """sMAPE in percent, written with TensorFlow ops for use as a Keras loss.

    Args:
        y_true: Tensor of observed values.
        y_pred: Tensor of predicted values.

    Returns:
        Scalar tensor: ``100 * mean(2 * |y_pred - y_true| / (|y_true| + |y_pred| + 1e-8))``.
    """
    doubled_error = 2 * tf.abs(y_pred - y_true)
    # 1e-8 keeps the ratio finite when truth and prediction are both 0.
    scale = tf.abs(y_true) + tf.abs(y_pred) + 1e-8
    return 100 * tf.reduce_mean(doubled_error / scale)

# 모델 컴파일
model.compile(optimizer='adam', loss=smape_loss)

## 2. 건물에 대한 특징을 다음과 같이 나눌 수 있다.
- 공장 : 항상 잘 돌아감
- 사무실 : 평일 9 to 6
- 음식점 : 밤에 더 전력 수요가 많음
- 쉼터 : 주말에 전력을 더 많이 쓰는 곳

### 클러스터링 방법.
- K-means (silhouette, elbow method)  
- DBSCAN
- Hierarchical clustering

### scaling할 때 building info를 사용해서 넓은 면적일수록 값을 낮출 수 있게 해줌.




In [None]:
import numpy as np

def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error in percent.

    Elements where truth and prediction are both zero contribute 0 instead of
    the NaN that a 0/0 division would produce.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable to ``y_true``.

    Returns:
        ``100 * mean(2 * |y_pred - y_true| / (|y_pred| + |y_true|))`` as a float.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = 2.0 * np.abs(y_pred - y_true)
    denominator = np.abs(y_pred) + np.abs(y_true)
    # Divide only where the denominator is non-zero; the remaining entries keep
    # the 0 they were initialised with (perfect prediction of a zero target).
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return 100 * np.mean(ratio)

def evaluate_model(model, X, y_true):
    """Predict with ``model`` on ``X`` and score the result with sMAPE.

    Args:
        model: Object exposing ``predict(X)``.
        X: Inputs passed to ``model.predict`` unchanged.
        y_true: Observed values matching the predictions element for element.

    Returns:
        sMAPE in percent.
    """
    y_pred = np.asarray(model.predict(X))
    y_true = np.asarray(y_true)
    # A single-output model may return shape (n, 1) while y_true is (n,).
    # Without flattening, NumPy broadcasts the pair to (n, n) and the score
    # compares every prediction with every target.
    return smape(y_true.reshape(-1), y_pred.reshape(-1))

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Load the data
# train_df = ... (assumes the DataFrame has already been loaded)

# Select the columns to use; the target '전력소비량(kWh)' is deliberately last so that
# later code can address it as column -1.
selected_columns = ['기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)', '일조(hr)', '일사(MJ/m2)', 'month', 'day', 'hour', 'weekday', 'weekend', '전력소비량(kWh)']
data = train_df[selected_columns]

# Scale the data: one MinMaxScaler fitted jointly on the features and the target
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)

# Build the input sequences.
sequence_length = 24  # number of past time steps used for one prediction
X = []
y = []

# train_df stacks every building in one frame, so windows are built one building at a
# time; a single sliding pass over the whole array would create samples that mix the
# rows of two different buildings at each boundary.
# Assumes each building's rows are in chronological order, as the original loop did.
building_ids = train_df['건물번호'].to_numpy()
for building in pd.unique(building_ids):
    building_rows = scaled_data[building_ids == building]
    for i in range(len(building_rows) - sequence_length):
        X.append(building_rows[i:i+sequence_length, :-1])  # every column except the last is an input
        y.append(building_rows[i+sequence_length, -1])  # last column, '전력소비량(kWh)', is the target

X = np.array(X)
y = np.array(y)

# Split into training and validation sets.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# Build the LSTM model.
# The LSTM layers use Keras' default tanh activation instead of activation='relu':
# ReLU is unbounded inside the recurrence, and a run of this same architecture recorded
# elsewhere in this notebook ended in `loss: nan`.
model = Sequential()
model.add(LSTM(128, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=False))  # only the last step feeds the output layer
model.add(Dropout(0.2))
model.add(Dense(1))  # single regression output

# Compile with mean squared error (computed on the scaled target).
model.compile(optimizer='adam', loss='mean_squared_error')

# Train, evaluating on the held-out validation split after every epoch.
model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_val, y_val))

# Predict the step that follows the last `sequence_length` rows of the scaled data.
test_samples = scaled_data[-sequence_length:, :-1]
test_samples = np.expand_dims(test_samples, axis=0)  # add the batch axis expected by the model
predicted_power_consumption = model.predict(test_samples)  # one row, one column, still scaled

# Undo the scaling. The scaler was fitted on features + target, so it needs a full-width
# row: pair the prediction with ONE row of features (the last time step). Stacking it
# next to all `sequence_length` feature rows raises ValueError because the row counts differ.
last_step_features = test_samples[0, -1:, :]
predicted_power_consumption = scaler.inverse_transform(
    np.hstack((last_step_features, predicted_power_consumption)))
print("Predicted Power Consumption (kWh):", predicted_power_consumption[-1, -1])

Epoch 1/50
Epoch 2/50

KeyboardInterrupt: ignored

In [None]:
# Each building contributes 2040 hourly rows (24 hours * 85 days).
train_df.건물번호.value_counts()

1      2040
64     2040
74     2040
73     2040
72     2040
       ... 
31     2040
30     2040
29     2040
28     2040
100    2040
Name: 건물번호, Length: 100, dtype: int64

In [None]:
# X_train = train_df[['건물번호', '기온(C)', '풍속(m/s)', '습도(%)','month','day','hour']]
#  '전력소비량(kWh)'
# Drop the identifier and timestamp columns plus the sunshine/solar-radiation columns
# (test_df has no sunshine/solar columns); the target stays in X_train and is also
# exposed separately as y_train.
unused_columns = ['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)']
X_train = train_df.drop(columns=unused_columns)
y_train = train_df['전력소비량(kWh)']

In [None]:
X_train

Unnamed: 0,건물번호,기온(C),강수량(mm),풍속(m/s),습도(%),전력소비량(kWh),month,day,hour,weekday,weekend
0,1,18.6,0.0,0.9,42.0,1085.28,6,1,0,2,0
1,1,18.0,0.0,1.1,45.0,1047.36,6,1,1,2,0
2,1,17.7,0.0,1.5,45.0,974.88,6,1,2,2,0
3,1,16.7,0.0,1.4,48.0,953.76,6,1,3,2,0
4,1,18.4,0.0,2.8,43.0,986.40,6,1,4,2,0
...,...,...,...,...,...,...,...,...,...,...,...
203995,100,23.1,0.0,0.9,86.0,881.04,8,24,19,2,0
203996,100,22.4,0.0,1.3,86.0,798.96,8,24,20,2,0
203997,100,21.3,0.0,1.0,92.0,825.12,8,24,21,2,0
203998,100,21.0,0.0,0.3,94.0,640.08,8,24,22,2,0


In [None]:
# Move the target column to the last position. It is selected by name so that re-running
# the cell is a no-op; the positional version (columns[5] -> end) moved a different
# column on every additional run.
target_column = '전력소비량(kWh)'
other_columns = [col for col in X_train.columns if col != target_column]
X_train = X_train[other_columns + [target_column]]
X_train.head()

Unnamed: 0,건물번호,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,hour,weekday,weekend,전력소비량(kWh)
0,1,18.6,0.0,0.9,42.0,6,1,0,2,0,1085.28
1,1,18.0,0.0,1.1,45.0,6,1,1,2,0,1047.36
2,1,17.7,0.0,1.5,45.0,6,1,2,2,0,974.88
3,1,16.7,0.0,1.4,48.0,6,1,3,2,0,953.76
4,1,18.4,0.0,2.8,43.0,6,1,4,2,0,986.4


In [None]:
# # loss function : SMAPE 정의
# from sklearn.metrics import mean_absolute_error, make_scorer

# def smape(true, pred):
#     true = np.array(true)  # np.array로 바꿔야 에러 없음
#     pred = np.array(pred)
#     return np.mean((np.abs(true-pred))/(np.abs(true) + np.abs(pred)))  # *2 , *100은 상수이므로 생략

# SMAPE = make_scorer(smape, greater_is_better=False)  # smape 값이 작아져야하므로 False

# LSTM

In [None]:
# # 강수량 결측치 0.0으로 채우기
# # train_df['강수량(mm)'].fillna(0.0, inplace=True)

# # 풍속, 습도 결측치 평균으로 채우고 반올림하기
# X_train['일조(hr)'].fillna(round(X_train['일조(hr)'].mean(),2), inplace=True)
# X_train['일사(MJ/m2)'].fillna(round(X_train['일사(MJ/m2)'].mean(),2), inplace=True)

In [None]:
X_train.columns

Index(['건물번호', '기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)', 'month', 'day', 'hour',
       'weekday', 'weekend'],
      dtype='object')

In [None]:
X_train.shape

(204000, 11)

In [None]:
# Hyperparameters
input_size = 11  # number of features (columns per time step; X_train has 11 columns, target included)
hidden_size = 64  # NOTE(review): the model is later built with the literal hidden_size=20, so this value is not used there
num_layers = 2
output_size = 1
num_epochs = 5
window_size = 24  # size of the time window used for each prediction
batch_size = 64
learning_rate = 0.001  # reassigned to 0.002 in the cell that creates the optimizer

In [None]:
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset over a 2-D table of time-ordered rows.

    Sample ``idx`` is the pair ``(x, y)``: ``x`` holds the ``window_size`` rows
    starting at ``idx`` (all columns) and ``y`` is the last column of the row
    that immediately follows the window.

    Args:
        df: 2-D array-like of shape (n_rows, n_columns). A NumPy array is kept
            as-is (no copy); anything else, e.g. a DataFrame, is converted
            with ``np.asarray`` so positional slicing works.
        window_size: number of consecutive rows in each input window.
    """

    def __init__(self, df, window_size):
        self.df = np.asarray(df)
        self.window_size = window_size

    def __len__(self):
        # Clamp at 0: len() raises ValueError when __len__ returns a negative number,
        # which happened whenever the data had fewer rows than one window.
        return max(0, len(self.df) - self.window_size)

    def __getitem__(self, idx):
        window_end = idx + self.window_size
        x = torch.tensor(self.df[idx:window_end, :], dtype=torch.float)
        if self.df.shape[1] > 1:
            # Target: last column of the first row after the window.
            y = torch.tensor(self.df[window_end, -1], dtype=torch.float)
        else:
            # Single-column data: no label is produced.
            y = None
        return x, y

def create_data_loader(df, window_size, batch_size):
    """Wrap ``df`` in a TimeSeriesDataset and return a DataLoader over it.

    Batches are served in their original order (``shuffle=False``).
    """
    return DataLoader(
        TimeSeriesDataset(df, window_size),
        batch_size=batch_size,
        shuffle=False,
    )

In [None]:
# Normalization: fit one MinMaxScaler on every column of X_train (features and target).
scaler = MinMaxScaler()
train_data = scaler.fit_transform(X_train.values)
# Use the shared hyperparameters instead of repeating the literals 24 / 64.
train_loader = create_data_loader(train_data, window_size=window_size, batch_size=batch_size)

In [None]:
train_data.shape

(204000, 11)

In [None]:
test_df.columns

Index(['num_date_time', '건물번호', '일시', '기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)',
       'month', 'day', 'hour', 'weekday', 'weekend'],
      dtype='object')

In [None]:
# Drop the identifier and timestamp columns. Reassigning (instead of inplace=True) with
# errors='ignore' keeps the cell safe to re-run after the columns are already gone.
test_df = test_df.drop(columns=['num_date_time', '일시'], errors='ignore')

In [None]:
test_df.shape

(16800, 10)

In [None]:
class LSTM(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: width of each LSTM layer.
        num_layers: number of stacked LSTM layers.
        output_size: number of values produced per sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # batch_first=True: inputs are laid out as (batch, sequence, feature).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one ``output_size``-wide prediction per sequence in the batch."""
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        # Zero initial hidden and cell states, placed on the same device as the input.
        hidden_init = torch.zeros(state_shape).to(x.device)
        cell_init = torch.zeros(state_shape).to(x.device)

        sequence_out, _ = self.lstm(x, (hidden_init, cell_init))
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

In [None]:
learning_rate = 0.002  # overrides the 0.001 set with the other hyperparameters
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"current device: {device}")

# NOTE(review): hidden_size / num_layers / output_size are passed as literals here, so the
# `hidden_size = 64` hyperparameter defined earlier does not apply to this model.
model = LSTM(input_size=input_size, hidden_size=20, num_layers=2, output_size=1).to(device)

# Mean squared error on the scaled target; the SMAPE alternative is left disabled.
criterion = nn.MSELoss()
# criterion = SMAPE
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

current device: cpu


In [None]:
num_epochs = 5

for epoch in range(num_epochs):
    # `step` is 1-based so it can be printed directly as the batch counter.
    for step, (inputs, labels) in enumerate(train_loader, start=1):
        inputs = inputs.to(device)
        # Labels arrive as (batch,); add a trailing axis to match the (batch, 1) outputs.
        labels = labels.unsqueeze(1).to(device)

        # Clear the gradients left over from the previous batch.
        optimizer.zero_grad()

        # Forward pass and loss.
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Report the current batch loss every 300 batches.
        if step % 300 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, step, len(train_loader), loss.item()))

Epoch [1/5], Step [300/3188], Loss: 0.0005
Epoch [1/5], Step [600/3188], Loss: 0.0004
Epoch [1/5], Step [900/3188], Loss: 0.0009
Epoch [1/5], Step [1200/3188], Loss: 0.0021
Epoch [1/5], Step [1500/3188], Loss: 0.0213
Epoch [1/5], Step [1800/3188], Loss: 0.0037
Epoch [1/5], Step [2100/3188], Loss: 0.0000
Epoch [1/5], Step [2400/3188], Loss: 0.0001
Epoch [1/5], Step [2700/3188], Loss: 0.0004
Epoch [1/5], Step [3000/3188], Loss: 0.0035
Epoch [2/5], Step [300/3188], Loss: 0.0008
Epoch [2/5], Step [600/3188], Loss: 0.0002
Epoch [2/5], Step [900/3188], Loss: 0.0008
Epoch [2/5], Step [1200/3188], Loss: 0.0031
Epoch [2/5], Step [1500/3188], Loss: 0.0187
Epoch [2/5], Step [1800/3188], Loss: 0.0032
Epoch [2/5], Step [2100/3188], Loss: 0.0001
Epoch [2/5], Step [2400/3188], Loss: 0.0000
Epoch [2/5], Step [2700/3188], Loss: 0.0003
Epoch [2/5], Step [3000/3188], Loss: 0.0027
Epoch [3/5], Step [300/3188], Loss: 0.0008
Epoch [3/5], Step [600/3188], Loss: 0.0001
Epoch [3/5], Step [900/3188], Loss: 0.00

In [None]:
# Keep the last `window_size` training rows; they are prepended to the test frame later
# so that the first test step has a full input window.
# (The stray `test_df.shape` expression was removed: it was not the cell's last line,
# so it displayed nothing.)
# NOTE(review): in the displayed output these rows all belong to building 100.
last_train_data = X_train.iloc[-window_size:]
last_train_data

Unnamed: 0,건물번호,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,hour,weekday,weekend,전력소비량(kWh)
203976,100,22.1,0.0,0.6,93.0,8,24,0,2,0,457.68
203977,100,21.2,0.0,0.0,99.0,8,24,1,2,0,406.56
203978,100,20.9,0.0,0.2,98.0,8,24,2,2,0,392.64
203979,100,21.5,0.0,1.2,92.0,8,24,3,2,0,378.48
203980,100,21.5,0.0,1.8,92.0,8,24,4,2,0,379.2
203981,100,21.7,0.0,2.4,91.0,8,24,5,2,0,418.32
203982,100,22.1,0.0,1.9,95.0,8,24,6,2,0,588.96
203983,100,22.2,0.0,1.3,95.0,8,24,7,2,0,805.44
203984,100,23.0,0.0,1.5,90.0,8,24,8,2,0,917.04
203985,100,23.4,0.0,1.1,87.0,8,24,9,2,0,1014.0


In [None]:
test_df.columns

Index(['건물번호', '기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)', 'month', 'day', 'hour',
       'weekday', 'weekend'],
      dtype='object')

In [None]:
last_train_data = X_train.iloc[-24:]


In [None]:
# Add a zero-filled placeholder for the target as the last column, already under its final
# name. Concatenating an unnamed zeros frame appended a new column labelled 0 on every
# run, and the following rename was only displayed, never stored.
test_df = test_df.assign(**{"전력소비량(kWh)": 0.0})
test_df

Unnamed: 0,건물번호,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,hour,weekday,weekend,전력소비량(kWh)
0,1,23.5,0.0,2.2,72,8,25,0,3,0,0.0
1,1,23.0,0.0,0.9,72,8,25,1,3,0,0.0
2,1,22.7,0.0,1.5,75,8,25,2,3,0,0.0
3,1,22.1,0.0,1.3,78,8,25,3,3,0,0.0
4,1,21.8,0.0,1.0,77,8,25,4,3,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
16795,100,22.5,0.0,0.9,84,8,31,19,2,0,0.0
16796,100,20.7,0.0,0.4,95,8,31,20,2,0,0.0
16797,100,20.2,0.0,0.4,98,8,31,21,2,0,0.0
16798,100,20.1,0.0,1.1,97,8,31,22,2,0,0.0


In [None]:
test_df = test_df.rename({0:"전력소비량(kWh)"}, axis=1)

array([[1.        , 0.44444444, 0.        , ..., 0.        , 0.33333333,
        0.        ],
       [1.        , 0.41111111, 0.        , ..., 0.04347826, 0.33333333,
        0.        ],
       [1.        , 0.4       , 0.        , ..., 0.08695652, 0.33333333,
        0.        ],
       ...,
       [1.        , 0.37407407, 0.        , ..., 0.91304348, 0.33333333,
        0.        ],
       [1.        , 0.37037037, 0.        , ..., 0.95652174, 0.33333333,
        0.        ],
       [1.        , 0.33333333, 0.        , ..., 1.        , 0.33333333,
        0.        ]])

In [None]:
# Prepend the saved training rows so the first test step has a full window of history,
# then scale with the scaler that was fitted on the training data.
# NOTE(review): test_df is reassigned here, so re-running this cell prepends the rows again.
test_df = pd.concat((last_train_data, test_df)).reset_index(drop=True)
test_data = scaler.transform(test_df.values)
test_data.shape
# final_df = final_df.rename({})

(16824, 11)

In [None]:
# Windowed view of the scaled test data, served one sample at a time in order.
# NOTE(review): the prediction loop that follows indexes test_data directly and does not
# read test_loader — confirm whether these two objects are still needed.
test_dataset = TimeSeriesDataset(test_data, window_size)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
model.eval()
test_predictions = []

# Forecast every building separately, seeding its first window with that building's own
# last `window_size` scaled training rows. A single window rolled over the whole frame
# starts the first building from the last building's history and lets every later building
# start from the previous building's predictions.
train_building_ids = X_train['건물번호'].to_numpy()
# test_df begins with `window_size` prepended training rows; only the rows after them
# are forecast targets.
test_building_ids = test_df['건물번호'].to_numpy()[window_size:]
test_row_index = np.arange(window_size, test_data.shape[0])

with torch.no_grad():
    for building in pd.unique(test_building_ids):
        rows = test_row_index[test_building_ids == building]
        history = train_data[train_building_ids == building][-window_size:]
        if len(history) < window_size:
            raise ValueError(f"building {building}: fewer than {window_size} training rows")

        # This building's history followed by its test rows (target column still a placeholder).
        rolling = np.vstack((history, test_data[rows]))
        for step, row in enumerate(rows):
            x = torch.tensor(rolling[step:step+window_size, :], dtype=torch.float, device=device)
            new_x = model(x.view(1, window_size, -1)).item()

            rolling[step+window_size, -1] = new_x  # feed the prediction into the following windows
            test_data[row, -1] = new_x  # keep test_data filled in for the inverse scaling step
            test_predictions.append(new_x)

In [None]:
test_predictions[:5]

[0.02060820162296295,
 0.07395455986261368,
 0.10649524629116058,
 0.1397680640220642,
 0.17641153931617737]

In [None]:
# Undo the scaling and keep only the target column, skipping the `window_size` history
# rows that were prepended to the test frame (previously the literal 24).
predictions = scaler.inverse_transform(test_data)[window_size:, -1]
predictions[25:45]

array([4891.48488663, 5187.95558441, 5604.23366503, 6128.64766685,
       6673.29815248, 7184.59021065, 7577.83984798, 7859.56454842,
       8079.16121917, 8256.86748346, 8341.92900475, 8392.65599521,
       8359.97590262, 8279.31710125, 8113.13721242, 7909.275176  ,
       7655.96989596, 7332.40340433, 6951.40101569, 6551.04063625])

In [None]:
sample_submission['answer'] = predictions
sample_submission.head()

Unnamed: 0,num_date_time,answer
0,1_20220825 00,525.270086
1,1_20220825 01,1884.983404
2,1_20220825 02,2714.393436
3,1_20220825 03,3562.464323
4,1_20220825 04,4496.447879


In [None]:
# Save next to the input data. The file name previously ended in "_csv" (no extension),
# and `path` already ends with "/", so the explicit "/" produced a doubled separator.
sample_submission.to_csv(os.path.join(path, 'lstm_baseline_submission.csv'), index=False)

In [None]:
# Take the last rows of the training data (history for the first test window).
last_train_data = X_train.iloc[-24:]

# Convert to float (left disabled)
# test_df['습도(%)'] = test_df['습도(%)'].astype('float64')

# Create the power-consumption column: a copy of the test rows plus a zero-filled
# placeholder for the target. pd.concat(test_df) raised TypeError because concat expects
# a sequence of frames, not a single frame.
final_df = test_df.copy()
if '전력소비량(kWh)' not in final_df.columns:
    final_df['전력소비량(kWh)'] = 0.0

TypeError: ignored

In [None]:
last_train_data = X_train.iloc[-24:]
final_df = pd.concat((last_train_data, final_df)).reset_index(drop=True)

# Scale the same way as the training data, using the frame that actually contains the
# prepended history (final_df); scaling test_df left final_df unused.
test_data = scaler.transform(final_df.values)
test_data.shape

In [None]:
# Normalization: reuse the scaler that was fitted on the training data. Fitting a fresh
# MinMaxScaler on the test frame rescales every column by the test ranges, so the later
# scaler.inverse_transform no longer maps predictions back to the training units.
test_data = scaler.transform(test_df.values)
test_loader = create_data_loader(test_data, window_size, batch_size)

In [None]:
test_data.shape[0]

NameError: ignored

In [None]:
test_data.shape

In [None]:
model.eval()

test_predictions = []
n_windows = test_data.shape[0] - window_size

# Roll one window over test_data, writing each prediction into the target column of the
# row that follows the window so that later windows see it.
# NOTE(review): the window runs across the whole array without regard to building
# boundaries — confirm this is intended.
with torch.no_grad():
    for start in range(n_windows):
        stop = start + window_size
        window = torch.Tensor(test_data[start:stop, :]).to(device)
        new_x = model(window.view(1, window_size, -1))

        test_data[stop, -1] = new_x  # update the input with the prediction
        test_predictions.append(new_x.detach().cpu().numpy().item())  # store the prediction

In [None]:
# NOTE(review): the other version of this step in the notebook skips the first 24 rows
# ([24:, -1]); confirm whether test_data still contains prepended history rows here.
predictions = scaler.inverse_transform(test_data)[:,-1] # restore the original scale

In [None]:
predictions.shape

In [None]:
predictions.sum()

In [None]:
sample_submission['answer'] = predictions
sample_submission.head()

In [None]:
sample_submission.to_csv('./baseline_submission.csv', index=False)

# Random Forest

In [None]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
def seed_everything(seed):
    """Seed every random number generator the notebook relies on.

    This definition replaces the earlier ``seed_everything`` of the same name,
    so it also seeds PyTorch; otherwise re-running the notebook from this cell
    onwards would silently stop seeding the torch generators.

    Args:
        seed: integer seed applied to ``random``, NumPy, PyTorch and
            ``PYTHONHASHSEED``.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # safe to call without a GPU
    # Prefer reproducible cuDNN kernels over auto-tuned ones.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42) # Seed 고정

In [None]:
# Random Forest baseline with default settings; this rebinds `model`, replacing the LSTM.
# NOTE(review): train_x / train_y are not defined in the visible part of the notebook —
# presumably they come from an earlier cell; confirm before a Restart & Run All.
model = RandomForestRegressor()
model.fit(train_x, train_y)

In [None]:
preds = model.predict(test_x)

In [None]:
# Load the submission template.
# NOTE(review): this reads from the current working directory, whereas the cell at the
# top of the notebook loads sample_submission.csv from `path` — confirm the file exists here.
submission = pd.read_csv('./sample_submission.csv')
submission

In [None]:
submission['answer'] = preds
submission

In [None]:
submission.to_csv('./baseline_submission.csv', index=False)