<a href="https://colab.research.google.com/github/JHyuk2/DACON/blob/main/DACON_%EC%A0%84%EB%A0%A5%EC%98%88%EC%B8%A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from google.colab import drive
drive.mount('/content/myDrive')

Mounted at /content/myDrive


In [6]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm
import warnings
warnings.filterwarnings(action='ignore')

In [105]:
path = '/content/myDrive/MyDrive/data/datasets/전력예측/'
train_df = pd.read_csv(path + 'train.csv')
test_df = pd.read_csv(path + 'test.csv')
building_info_df = pd.read_csv(path +'building_info.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204000 entries, 0 to 203999
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   num_date_time  204000 non-null  object 
 1   건물번호           204000 non-null  int64  
 2   일시             204000 non-null  object 
 3   기온(C)          204000 non-null  float64
 4   강수량(mm)        43931 non-null   float64
 5   풍속(m/s)        203981 non-null  float64
 6   습도(%)          203991 non-null  float64
 7   일조(hr)         128818 non-null  float64
 8   일사(MJ/m2)      116087 non-null  float64
 9   전력소비량(kWh)     204000 non-null  float64
dtypes: float64(7), int64(1), object(2)
memory usage: 15.6+ MB


In [8]:
# seed 고정하기.
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42) # Seed 고정

In [9]:
sample_submission = pd.read_csv(path + 'sample_submission.csv')

In [10]:
# 제출 데이터 : num_date_time(건물+일시) 을 입력받고 예측하기.
sample_submission.head()

Unnamed: 0,num_date_time,answer
0,1_20220825 00,0
1,1_20220825 01,0
2,1_20220825 02,0
3,1_20220825 03,0
4,1_20220825 04,0


In [11]:
building_info_df.head()

Unnamed: 0,건물번호,건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW)
0,1,건물기타,110634.0,39570.0,-,-,-
1,2,건물기타,122233.47,99000.0,-,-,-
2,3,건물기타,171243.0,113950.0,40,-,-
3,4,건물기타,74312.98,34419.62,60,-,-
4,5,건물기타,205884.0,150000.0,-,2557,1000


In [13]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204000 entries, 0 to 203999
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   num_date_time  204000 non-null  object 
 1   건물번호           204000 non-null  int64  
 2   일시             204000 non-null  object 
 3   기온(C)          204000 non-null  float64
 4   강수량(mm)        43931 non-null   float64
 5   풍속(m/s)        203981 non-null  float64
 6   습도(%)          203991 non-null  float64
 7   일조(hr)         128818 non-null  float64
 8   일사(MJ/m2)      116087 non-null  float64
 9   전력소비량(kWh)     204000 non-null  float64
dtypes: float64(7), int64(1), object(2)
memory usage: 15.6+ MB


## 1. 결측값 처리

In [113]:
# 결측값 처리 및 시간 columns 추가
def preprocessing(df):
  import datetime
  # 강수량에 대해서는 0값으로 대치
  df['강수량(mm)'].fillna(0,inplace=True)
  # 나머지에 대해서는 보간법을 적용하여 처리해버림.
  df = df.interpolate(method='values')
  # 그 외 결측값을 이전값으로 대치한다.
  df.fillna(method='pad', inplace=True)

  # 월, 일, 시간을 나누어줌
  df['month'] = df['일시'].apply(lambda x: int(x[4:6]))
  df['day'] = df['일시'].apply(lambda x: int(x[6:8]))
  df['hour'] = df['일시'].apply(lambda x: int(x[9:11]))

  # 평일과 주말을 구분하기 위해 weekday, weekend 설정.
  df['weekday'] = df['일시'].apply(lambda x: datetime.date(2022, int(x[4:6]), int(x[6:8])).weekday())
  df['weekend'] = df['weekday'].apply(lambda x: 0 if x <= 4 else 1)

  return df

In [114]:
test_df = preprocessing(test_df)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16800 entries, 0 to 16799
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   num_date_time  16800 non-null  object 
 1   건물번호           16800 non-null  int64  
 2   일시             16800 non-null  object 
 3   기온(C)          16800 non-null  float64
 4   강수량(mm)        16800 non-null  float64
 5   풍속(m/s)        16800 non-null  float64
 6   습도(%)          16800 non-null  int64  
 7   month          16800 non-null  int64  
 8   day            16800 non-null  int64  
 9   hour           16800 non-null  int64  
 10  weekday        16800 non-null  int64  
 11  weekend        16800 non-null  int64  
dtypes: float64(3), int64(7), object(2)
memory usage: 1.5+ MB


In [14]:
# 기상데이터는 전력수요랑 연관이 있다.
# 일조량 강수량 일사량 결측값이 너무 많다.

train_df.isna().sum()

num_date_time         0
건물번호                  0
일시                    0
기온(C)                 0
강수량(mm)          160069
풍속(m/s)              19
습도(%)                 9
일조(hr)            75182
일사(MJ/m2)         87913
전력소비량(kWh)            0
dtype: int64

In [16]:
# 강수량에 대해서는 0값으로 대치
train_df['강수량(mm)'].fillna(0,inplace=True)
# 나머지에 대해서는 보간법을 적용하여 처리해버림.
train_df = train_df.interpolate(method='values')
# 그 외 결측값을 이전값으로 대치한다.
train_df.fillna(method='pad', inplace=True)

train_df.isna().sum()

num_date_time        0
건물번호                 0
일시                   0
기온(C)                0
강수량(mm)              0
풍속(m/s)             19
습도(%)                9
일조(hr)           75182
일사(MJ/m2)        87913
전력소비량(kWh)           0
dtype: int64

## 2. 클러스터를 다음과 같이 나눌 수 있다.
- 공장 : 항상 잘 돌아감
- 사무실 : 평일 9 to 6
- 음식점 : 밤에 더 전력 수요가 많음
- 쉼터 : 주말에 더 전력을 많이 키는 곳

### 방법.
- K-means (silhouette, elbow method)  
- DBSCAN
- Hierarchical clustering



In [30]:
# 건물 한 개당 2040개의 시간 정보를 담고있음. (24 * 85일간의 정보)
train_df.건물번호.value_counts()

1      2040
64     2040
74     2040
73     2040
72     2040
       ... 
31     2040
30     2040
29     2040
28     2040
100    2040
Name: 건물번호, Length: 100, dtype: int64

In [31]:
# 월, 일, 시간을 나누어줌
import datetime
train_df['month'] = train_df['일시'].apply(lambda x: int(x[4:6]))
train_df['day'] = train_df['일시'].apply(lambda x: int(x[6:8]))
train_df['hour'] = train_df['일시'].apply(lambda x: int(x[9:11]))

# 평일과 주말을 구분하기 위해 weekday, weekend 설정.
train_df['weekday'] = train_df['일시'].apply(lambda x: datetime.date(2022, int(x[4:6]), int(x[6:8])).weekday())
train_df['weekend'] = train_df['weekday'].apply(lambda x: 0 if x <= 4 else 1)

In [32]:
train_df.columns

Index(['num_date_time', '건물번호', '일시', '기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)',
       '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)', 'month', 'day', 'hour', 'weekday',
       'weekend'],
      dtype='object')

In [36]:
# X_train = train_df[['건물번호', '기온(C)', '풍속(m/s)', '습도(%)','month','day','hour']]
X_train = train_df.drop(['num_date_time', '전력소비량(kWh)'], axis=1)
y_train = train_df['전력소비량(kWh)']
# X_test = test_df.drop(['num_date_time', '전력소비량(kWh)'], axis=1)
# y_test = test_df['전력소비량(kWh)']

In [99]:
test_df

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,hour,weekday,weekend
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72,8,25,0,3,0
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72,8,25,1,3,0
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75,8,25,2,3,0
3,1_20220825 03,1,20220825 03,22.1,0.0,1.3,78,8,25,3,3,0
4,1_20220825 04,1,20220825 04,21.8,0.0,1.0,77,8,25,4,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...
16795,100_20220831 19,100,20220831 19,22.5,0.0,0.9,84,8,31,19,2,0
16796,100_20220831 20,100,20220831 20,20.7,0.0,0.4,95,8,31,20,2,0
16797,100_20220831 21,100,20220831 21,20.2,0.0,0.4,98,8,31,21,2,0
16798,100_20220831 22,100,20220831 22,20.1,0.0,1.1,97,8,31,22,2,0


In [37]:
X_train.head(1)

Unnamed: 0,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),month,day,hour,weekday,weekend
0,1,20220601 00,18.6,0.0,0.9,42.0,0.0,0.0,6,1,0,2,0


In [38]:
y_train.head(1)

0    1085.28
Name: 전력소비량(kWh), dtype: float64

In [59]:
# loss function : SMAPE 정의
from sklearn.metrics import mean_absolute_error, make_scorer

def smape(true, pred):
    true = np.array(true)  # np.array로 바꿔야 에러 없음
    pred = np.array(pred)
    return np.mean((np.abs(true-pred))/(np.abs(true) + np.abs(pred)))  # *2 , *100은 상수이므로 생략

SMAPE = make_scorer(smape, greater_is_better=False)  # smape 값이 작아져야하므로 False

# LSTM

In [None]:
# 강수량 결측치 0.0으로 채우기
train_df['강수량(mm)'].fillna(0.0, inplace=True)

# 풍속, 습도 결측치 평균으로 채우고 반올림하기
train_df['풍속(m/s)'].fillna(round(train_df['풍속(m/s)'].mean(),2), inplace=True)
train_df['습도(%)'].fillna(round(train_df['습도(%)'].mean(),2), inplace=True)

In [None]:
# 순서 재배치
# train_df = train_df[train_df.columns[:7].to_list() + train_df.columns[8:].to_list() + train_df.columns[7:8].to_list()]
# train_df.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,hour,time,전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,0.0,0.9,42.0,6.0,1.0,0,0.0,1085.28
1,1_20220601 01,1,20220601 01,18.0,0.0,1.1,45.0,6.0,1.0,1,1.0,1047.36
2,1_20220601 02,1,20220601 02,17.7,0.0,1.5,45.0,6.0,1.0,2,2.0,974.88
3,1_20220601 03,1,20220601 03,16.7,0.0,1.4,48.0,6.0,1.0,3,3.0,953.76
4,1_20220601 04,1,20220601 04,18.4,0.0,2.8,43.0,6.0,1.0,4,4.0,986.4


In [90]:
# 하이퍼파라미터
input_size = 12  # feature의 개수
hidden_size = 64
num_layers = 2
output_size = 1
num_epochs = 5
window_size = 24  # 예측에 사용될 시간 윈도우 크기
batch_size = 64
learning_rate = 0.001

In [91]:
class TimeSeriesDataset(Dataset):
    def __init__(self, df, window_size):
        self.df = df
        self.window_size = window_size

    def __len__(self):
        return len(self.df) - self.window_size

    def __getitem__(self, idx):
        x = torch.tensor(self.df[idx:idx+self.window_size, :], dtype=torch.float)
        if self.df.shape[1] > 1:
            y = torch.tensor(self.df[idx+self.window_size, -1], dtype=torch.float)
        else:
            y = None
        return x, y

def create_data_loader(df, window_size, batch_size):
    dataset = TimeSeriesDataset(df, window_size)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    return data_loader

In [87]:
# normalization
scaler = MinMaxScaler()
train_data = scaler.fit_transform(X_train.drop(['일시'], axis=1).values)
train_loader = create_data_loader(train_data, window_size, batch_size)

In [118]:
X_train.columns

Index(['건물번호', '일시', '기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)', '일조(hr)',
       '일사(MJ/m2)', 'month', 'day', 'hour', 'weekday', 'weekend'],
      dtype='object')

In [126]:
test_df['일사(MJ/m2)'] = X_train['일사(MJ/m2)']
test_df['일조(hr)'] = X_train['일조(hr)'][test_df.일시 == X_train.일시]

ValueError: ignored

In [123]:
test_df.shape

(16800, 14)

In [125]:
test_df.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,hour,weekday,weekend,일사(MJ/m2),일조(hr)
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72,8,25,0,3,0,0.0,0.0
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72,8,25,1,3,0,0.0,0.0
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75,8,25,2,3,0,0.0,0.0
3,1_20220825 03,1,20220825 03,22.1,0.0,1.3,78,8,25,3,3,0,0.0,0.0
4,1_20220825 04,1,20220825 04,21.8,0.0,1.0,77,8,25,4,3,0,0.0,0.0


In [93]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])

        return out

In [94]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"current device: {device}")

model = LSTM(input_size, hidden_size, num_layers, output_size).to(device)

criterion = nn.MSELoss()
# criterion = SMAPE
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

current device: cpu


In [95]:
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.unsqueeze(1).to(device)

        # Forward
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        # loss = SMAPE(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 300 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, i+1, len(train_loader), loss.item()))

Epoch [1/5], Step [300/3188], Loss: 0.0023
Epoch [1/5], Step [600/3188], Loss: 0.0013
Epoch [1/5], Step [900/3188], Loss: 0.0119
Epoch [1/5], Step [1200/3188], Loss: 0.0144
Epoch [1/5], Step [1500/3188], Loss: 0.0136
Epoch [1/5], Step [1800/3188], Loss: 0.0048
Epoch [1/5], Step [2100/3188], Loss: 0.0023
Epoch [1/5], Step [2400/3188], Loss: 0.0112
Epoch [1/5], Step [2700/3188], Loss: 0.0002
Epoch [1/5], Step [3000/3188], Loss: 0.0001
Epoch [2/5], Step [300/3188], Loss: 0.0000
Epoch [2/5], Step [600/3188], Loss: 0.0001
Epoch [2/5], Step [900/3188], Loss: 0.0000
Epoch [2/5], Step [1200/3188], Loss: 0.0000
Epoch [2/5], Step [1500/3188], Loss: 0.0001
Epoch [2/5], Step [1800/3188], Loss: 0.0001
Epoch [2/5], Step [2100/3188], Loss: 0.0001
Epoch [2/5], Step [2400/3188], Loss: 0.0002
Epoch [2/5], Step [2700/3188], Loss: 0.0002
Epoch [2/5], Step [3000/3188], Loss: 0.0000
Epoch [3/5], Step [300/3188], Loss: 0.0005
Epoch [3/5], Step [600/3188], Loss: 0.0002
Epoch [3/5], Step [900/3188], Loss: 0.00

In [115]:
# normalization
scaler = MinMaxScaler()
test_data = scaler.fit_transform(test_df.drop(['num_date_time', '일시'], axis=1).values)
test_loader = create_data_loader(test_data, window_size, batch_size)

# test_data = scaler.transform(X_test.values)
# test_data.shape

In [116]:
test_data.shape

(16800, 10)

In [None]:
model.eval()

test_predictions = []

with torch.no_grad():
    for i in range(test_data.shape[0] - window_size):
        x = torch.Tensor(test_data[i:i+window_size,:]).to(device)
        new_x = model(x.view(1,window_size,-1))

        test_data[i+window_size,-1] = new_x # 입력 업데이트
        test_predictions.append(new_x.detach().cpu().numpy().item()) # 예측 결과 저장

# Random Forest

In [None]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [None]:
model = RandomForestRegressor()
model.fit(train_x, train_y)

In [None]:
preds = model.predict(test_x)

In [None]:
submission = pd.read_csv('./sample_submission.csv')
submission

In [None]:
submission['answer'] = preds
submission

In [None]:
submission.to_csv('./baseline_submission.csv', index=False)