## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings(action='ignore') 

## Fixed Random-Seed

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Load Data

In [3]:
train_df = pd.read_csv('./train.csv')
building_info = pd.read_csv('./building_info.csv')
test_df = pd.read_csv('./test.csv')

## Train Data Pre-Processing

* train_df에 building_info 데이터 병합

### column명 변경

In [4]:
train_df = train_df.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})

In [5]:
test_df = test_df.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})

In [6]:
building_info = building_info.rename(columns={
    '건물번호': 'building_number',
    '건물유형': 'building_type',
    '연면적(m2)': 'total_area',
    '냉방면적(m2)': 'cooling_area',
    '태양광용량(kW)': 'solar_power_capacity',
    'ESS저장용량(kWh)': 'ess_capacity',
    'PCS용량(kW)': 'pcs_capacity'
})

In [7]:
translation_dict = {
    '건물기타': 'Other Buildings',
    '공공': 'Public',
    '대학교': 'University',
    '데이터센터': 'Data Center',
    '백화점및아울렛': 'Department Store and Outlet',
    '병원': 'Hospital',
    '상용': 'Commercial',
    '아파트': 'Apartment',
    '연구소': 'Research Institute',
    '지식산업센터': 'Knowledge Industry Center',
    '할인마트': 'Discount Mart',
    '호텔및리조트': 'Hotel and Resort'
}

building_info['building_type'] = building_info['building_type'].replace(translation_dict)

### CSV 파일병합

In [8]:
train_df = pd.merge(train_df, building_info, on='building_number', how='left')
test_df = pd.merge(test_df, building_info, on='building_number', how='left')

### 결측치 확인

In [9]:
# 전체 data 갯수
len(train_df)

204000

In [10]:
# train_df 열별 결측치
train_df.isna().sum()

num_date_time                0
building_number              0
date_time                    0
temperature                  0
rainfall                160069
windspeed                   19
humidity                     9
sunshine                 75182
solar_radiation          87913
power_consumption            0
building_type                0
total_area                   0
cooling_area                 0
solar_power_capacity         0
ess_capacity                 0
pcs_capacity                 0
dtype: int64

In [11]:
# test_df 열별 결측치
test_df.isna().sum()

num_date_time           0
building_number         0
date_time               0
temperature             0
rainfall                0
windspeed               0
humidity                0
building_type           0
total_area              0
cooling_area            0
solar_power_capacity    0
ess_capacity            0
pcs_capacity            0
dtype: int64

In [12]:
# solar_power_capacity, ess_capacity, pcs_capacity은 결측치가 -로 되어있음
print('태양광용량(kW)\t', len(train_df[train_df['solar_power_capacity'] == '-']))
print('ESS저장용량(kWh)\t', len(train_df[train_df['ess_capacity'] == '-']))
print('PCS용량(kW)\t', len(train_df[train_df['pcs_capacity'] == '-']))

태양광용량(kW)	 130560
ESS저장용량(kWh)	 193800
PCS용량(kW)	 193800


In [13]:
train_df

Unnamed: 0,num_date_time,building_number,date_time,temperature,rainfall,windspeed,humidity,sunshine,solar_radiation,power_consumption,building_type,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28,Other Buildings,110634.00,39570.00,-,-,-
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,,,1047.36,Other Buildings,110634.00,39570.00,-,-,-
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,,,974.88,Other Buildings,110634.00,39570.00,-,-,-
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,,,953.76,Other Buildings,110634.00,39570.00,-,-,-
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,,,986.40,Other Buildings,110634.00,39570.00,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100_20220824 19,100,20220824 19,23.1,,0.9,86.0,0.5,,881.04,Hotel and Resort,57497.84,40035.23,-,-,-
203996,100_20220824 20,100,20220824 20,22.4,,1.3,86.0,0.0,,798.96,Hotel and Resort,57497.84,40035.23,-,-,-
203997,100_20220824 21,100,20220824 21,21.3,,1.0,92.0,,,825.12,Hotel and Resort,57497.84,40035.23,-,-,-
203998,100_20220824 22,100,20220824 22,21.0,,0.3,94.0,,,640.08,Hotel and Resort,57497.84,40035.23,-,-,-


In [15]:
from dataprep.eda import plot, plot_missing
plot_missing(train_df['sunshine'])

AttributeError: module 'numpy' has no attribute 'bool'.
`np.bool` was a deprecated alias for the builtin `bool`. To avoid this error in existing code, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [None]:
#결측값을 0으로 채웁니다
train_df = train_df.fillna(0)

### datetime을 (년,월,일,시)로 쪼갬

In [None]:
train_df['date_time'] = pd.to_datetime(train_df['date_time'], format='%Y%m%d %H')
test_df['date_time'] = pd.to_datetime(test_df['date_time'], format='%Y%m%d %H')

# date time feature 생성
train_df['hour'] = train_df['date_time'].dt.hour
train_df['day'] = train_df['date_time'].dt.day
train_df['month'] = train_df['date_time'].dt.month
train_df['year'] = train_df['date_time'].dt.year

test_df['hour'] = test_df['date_time'].dt.hour
test_df['day'] = test_df['date_time'].dt.day
test_df['month'] = test_df['date_time'].dt.month
test_df['year'] = test_df['date_time'].dt.year

In [None]:
#시계열 특성을 학습에 반영하기 위해 일시를 월, 일, 시간으로 나눕니다
#train_df['month'] = train_df['일시'].apply(lambda x : int(x[4:6]))
#train_df['day'] = train_df['일시'].apply(lambda x : int(x[6:8]))
#train_df['time'] = train_df['일시'].apply(lambda x : int(x[9:11]))

In [None]:
train_df.head()

In [None]:
test_df.head()

## Regression Model Fit

In [None]:
! pip install xgboost

In [None]:
train_df = train_df.replace({'solar_power_capacity':'-'}, 0)
train_df = train_df.replace({'ess_capacity':'-'}, 0)
train_df = train_df.replace({'pcs_capacity':'-'}, 0)
train_df.iloc[:,8:11]

In [None]:
test_df = test_df.replace({'solar_power_capacity':'-'}, 0)
test_df = test_df.replace({'ess_capacity':'-'}, 0)
test_df = test_df.replace({'pcs_capacity':'-'}, 0)
test_df.iloc[:,8:11]

In [None]:
#train_df['building_type'] = train_df['building_type'].astype(int)
train_df['solar_power_capacity'] = train_df['solar_power_capacity'].astype(float)
train_df['ess_capacity'] = train_df['ess_capacity'].astype(float)
train_df['pcs_capacity'] = train_df['pcs_capacity'].astype(float)

In [None]:
test_df['solar_power_capacity'] = test_df['solar_power_capacity'].astype(float)
test_df['ess_capacity'] = test_df['ess_capacity'].astype(float)
test_df['pcs_capacity'] = test_df['pcs_capacity'].astype(float)

In [None]:
#train_df.drop('building_type', axis = 1, inplace=True)
#test_df.drop('building_type', axis = 1, inplace=True)

In [None]:
train_df.info()

In [None]:
test_df.info()

In [None]:
#train_x = train_df.drop(columns=['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)'])
#train_y = train_df['전력소비량(kWh)']
train_x = train_df.drop(columns=['num_date_time', 'date_time', 'sunshine', 'solar_radiation', 'power_consumption', 'building_type'])
#train_df.drop('num_date_time', axis = 1, inplace=True)
#train_df.drop('date_time', axis = 1, inplace=True)
#train_df.drop('sunshine', axis = 1, inplace=True)
#train_df.drop('solar_radiation', axis = 1, inplace=True)
#train_df.drop('building_type', axis = 1, inplace=True)
train_y = train_df['power_consumption']
#train_df.drop('power_consumption', axis = 1, inplace=True)

test_df.drop('num_date_time', axis = 1, inplace=True)
test_df.drop('date_time', axis = 1, inplace=True)
test_df.drop('building_type', axis = 1, inplace=True)

In [None]:
train_x

In [None]:
#from xgboost import XGBClassifier
from xgboost import XGBRegressor
model = XGBRegressor()
model.fit(train_x, train_y, eval_metric='rmse')
#model.fit(train_x, y_train, eval_metric='rmse')

In [None]:
test_df

In [None]:
train_df

In [None]:
model = RandomForestRegressor()
model.fit(train_x, train_y)

## Test Data Pre-Processing

In [None]:
test_df['month'] = test_df['일시'].apply(lambda x : int(x[4:6]))
test_df['day'] = test_df['일시'].apply(lambda x : int(x[6:8]))
test_df['time'] = test_df['일시'].apply(lambda x : int(x[9:11]))

In [None]:
test_x = test_df.drop(columns=['num_date_time', '일시'])

In [None]:
test_x = test_df

## Inference

In [None]:
preds = model.predict(test_x)

## Submission

In [None]:
submission = pd.read_csv('./sample_submission.csv')
submission

In [None]:
submission['answer'] = preds
submission

In [None]:
submission.to_csv('./xgboost_v_1_1_submission.csv', index=False)