## Import

In [2]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings(action='ignore') 

## Fixed Random-Seed

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Load Data

In [4]:
train_df = pd.read_csv('./train.csv')
building_info = pd.read_csv('./building_info.csv')
test_df = pd.read_csv('./test.csv')

## Train Data Pre-Processing

* train_df에 building_info 데이터 병합
* 일조 / 일사 drop 안함 => test에는 원래 일조 / 일사 없어서 실패

### column명 변경

In [5]:
train_df = train_df.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})

In [6]:
test_df = test_df.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})

In [7]:
building_info = building_info.rename(columns={
    '건물번호': 'building_number',
    '건물유형': 'building_type',
    '연면적(m2)': 'total_area',
    '냉방면적(m2)': 'cooling_area',
    '태양광용량(kW)': 'solar_power_capacity',
    'ESS저장용량(kWh)': 'ess_capacity',
    'PCS용량(kW)': 'pcs_capacity'
})

In [8]:
translation_dict = {
    '건물기타': 'Other Buildings',
    '공공': 'Public',
    '대학교': 'University',
    '데이터센터': 'Data Center',
    '백화점및아울렛': 'Department Store and Outlet',
    '병원': 'Hospital',
    '상용': 'Commercial',
    '아파트': 'Apartment',
    '연구소': 'Research Institute',
    '지식산업센터': 'Knowledge Industry Center',
    '할인마트': 'Discount Mart',
    '호텔및리조트': 'Hotel and Resort'
}

building_info['building_type'] = building_info['building_type'].replace(translation_dict)

### CSV 파일병합

In [9]:
train_df = pd.merge(train_df, building_info, on='building_number', how='left')
test_df = pd.merge(test_df, building_info, on='building_number', how='left')

### 결측치 확인

In [10]:
# 전체 data 갯수
len(train_df)

204000

In [11]:
# train_df 열별 결측치
train_df.isna().sum()

num_date_time                0
building_number              0
date_time                    0
temperature                  0
rainfall                160069
windspeed                   19
humidity                     9
sunshine                 75182
solar_radiation          87913
power_consumption            0
building_type                0
total_area                   0
cooling_area                 0
solar_power_capacity         0
ess_capacity                 0
pcs_capacity                 0
dtype: int64

In [12]:
# test_df 열별 결측치
test_df.isna().sum()

num_date_time           0
building_number         0
date_time               0
temperature             0
rainfall                0
windspeed               0
humidity                0
building_type           0
total_area              0
cooling_area            0
solar_power_capacity    0
ess_capacity            0
pcs_capacity            0
dtype: int64

In [13]:
# solar_power_capacity, ess_capacity, pcs_capacity은 결측치가 -로 되어있음
print('태양광용량(kW)\t', len(train_df[train_df['solar_power_capacity'] == '-']))
print('ESS저장용량(kWh)\t', len(train_df[train_df['ess_capacity'] == '-']))
print('PCS용량(kW)\t', len(train_df[train_df['pcs_capacity'] == '-']))

태양광용량(kW)	 130560
ESS저장용량(kWh)	 193800
PCS용량(kW)	 193800


In [14]:
train_df = train_df.replace({'solar_power_capacity':'-'}, 0)
train_df = train_df.replace({'ess_capacity':'-'}, 0)
train_df = train_df.replace({'pcs_capacity':'-'}, 0)
train_df.iloc[:,8:11]

Unnamed: 0,solar_radiation,power_consumption,building_type
0,,1085.28,Other Buildings
1,,1047.36,Other Buildings
2,,974.88,Other Buildings
3,,953.76,Other Buildings
4,,986.40,Other Buildings
...,...,...,...
203995,,881.04,Hotel and Resort
203996,,798.96,Hotel and Resort
203997,,825.12,Hotel and Resort
203998,,640.08,Hotel and Resort


In [17]:
test_df = test_df.replace({'solar_power_capacity':'-'}, 0)
test_df = test_df.replace({'ess_capacity':'-'}, 0)
test_df = test_df.replace({'pcs_capacity':'-'}, 0)
test_df.iloc[:,10:13]

Unnamed: 0,solar_power_capacity,ess_capacity,pcs_capacity
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
...,...,...,...
16795,0,0,0
16796,0,0,0
16797,0,0,0
16798,0,0,0


In [18]:
#결측값을 0으로 채웁니다
train_df = train_df.fillna(0)

### datetime을 (년,월,일,시)로 쪼갬

In [19]:
#시계열 특성을 학습에 반영하기 위해 일시를 월, 일, 시간으로 나눕니다
train_df['date_time'] = pd.to_datetime(train_df['date_time'], format='%Y%m%d %H')
test_df['date_time'] = pd.to_datetime(test_df['date_time'], format='%Y%m%d %H')

# date time feature 생성
train_df['hour'] = train_df['date_time'].dt.hour
train_df['day'] = train_df['date_time'].dt.day
train_df['month'] = train_df['date_time'].dt.month
train_df['year'] = train_df['date_time'].dt.year

test_df['hour'] = test_df['date_time'].dt.hour
test_df['day'] = test_df['date_time'].dt.day
test_df['month'] = test_df['date_time'].dt.month
test_df['year'] = test_df['date_time'].dt.year

## Regression Model Fit

In [20]:
! pip install xgboost



In [21]:
#train_df['building_type'] = train_df['building_type'].astype(int)
train_df['solar_power_capacity'] = train_df['solar_power_capacity'].astype(float)
train_df['ess_capacity'] = train_df['ess_capacity'].astype(float)
train_df['pcs_capacity'] = train_df['pcs_capacity'].astype(float)

In [22]:
test_df['solar_power_capacity'] = test_df['solar_power_capacity'].astype(float)
test_df['ess_capacity'] = test_df['ess_capacity'].astype(float)
test_df['pcs_capacity'] = test_df['pcs_capacity'].astype(float)

In [24]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204000 entries, 0 to 203999
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   num_date_time         204000 non-null  object        
 1   building_number       204000 non-null  int64         
 2   date_time             204000 non-null  datetime64[ns]
 3   temperature           204000 non-null  float64       
 4   rainfall              204000 non-null  float64       
 5   windspeed             204000 non-null  float64       
 6   humidity              204000 non-null  float64       
 7   sunshine              204000 non-null  float64       
 8   solar_radiation       204000 non-null  float64       
 9   power_consumption     204000 non-null  float64       
 10  building_type         204000 non-null  object        
 11  total_area            204000 non-null  float64       
 12  cooling_area          204000 non-null  float64       
 13 

In [25]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16800 entries, 0 to 16799
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   num_date_time         16800 non-null  object        
 1   building_number       16800 non-null  int64         
 2   date_time             16800 non-null  datetime64[ns]
 3   temperature           16800 non-null  float64       
 4   rainfall              16800 non-null  float64       
 5   windspeed             16800 non-null  float64       
 6   humidity              16800 non-null  int64         
 7   building_type         16800 non-null  object        
 8   total_area            16800 non-null  float64       
 9   cooling_area          16800 non-null  float64       
 10  solar_power_capacity  16800 non-null  float64       
 11  ess_capacity          16800 non-null  float64       
 12  pcs_capacity          16800 non-null  float64       
 13  hour            

In [26]:
#train_x = train_df.drop(columns=['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)'])
#train_y = train_df['전력소비량(kWh)']
#train_x = train_df.drop(columns=['num_date_time', 'date_time', 'sunshine', 'solar_radiation', 'power_consumption', 'building_type'])
train_x = train_df.drop(columns=['num_date_time', 'date_time', 'power_consumption', 'building_type'])
train_y = train_df['power_consumption']


test_df.drop('num_date_time', axis = 1, inplace=True)
test_df.drop('date_time', axis = 1, inplace=True)
test_df.drop('building_type', axis = 1, inplace=True)

In [27]:
train_x

Unnamed: 0,building_number,temperature,rainfall,windspeed,humidity,sunshine,solar_radiation,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity,hour,day,month,year
0,1,18.6,0.0,0.9,42.0,0.0,0.0,110634.00,39570.00,0.0,0.0,0.0,0,1,6,2022
1,1,18.0,0.0,1.1,45.0,0.0,0.0,110634.00,39570.00,0.0,0.0,0.0,1,1,6,2022
2,1,17.7,0.0,1.5,45.0,0.0,0.0,110634.00,39570.00,0.0,0.0,0.0,2,1,6,2022
3,1,16.7,0.0,1.4,48.0,0.0,0.0,110634.00,39570.00,0.0,0.0,0.0,3,1,6,2022
4,1,18.4,0.0,2.8,43.0,0.0,0.0,110634.00,39570.00,0.0,0.0,0.0,4,1,6,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100,23.1,0.0,0.9,86.0,0.5,0.0,57497.84,40035.23,0.0,0.0,0.0,19,24,8,2022
203996,100,22.4,0.0,1.3,86.0,0.0,0.0,57497.84,40035.23,0.0,0.0,0.0,20,24,8,2022
203997,100,21.3,0.0,1.0,92.0,0.0,0.0,57497.84,40035.23,0.0,0.0,0.0,21,24,8,2022
203998,100,21.0,0.0,0.3,94.0,0.0,0.0,57497.84,40035.23,0.0,0.0,0.0,22,24,8,2022


In [28]:
train_y.head()

0    1085.28
1    1047.36
2     974.88
3     953.76
4     986.40
Name: power_consumption, dtype: float64

In [29]:
train_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204000 entries, 0 to 203999
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   building_number       204000 non-null  int64  
 1   temperature           204000 non-null  float64
 2   rainfall              204000 non-null  float64
 3   windspeed             204000 non-null  float64
 4   humidity              204000 non-null  float64
 5   sunshine              204000 non-null  float64
 6   solar_radiation       204000 non-null  float64
 7   total_area            204000 non-null  float64
 8   cooling_area          204000 non-null  float64
 9   solar_power_capacity  204000 non-null  float64
 10  ess_capacity          204000 non-null  float64
 11  pcs_capacity          204000 non-null  float64
 12  hour                  204000 non-null  int32  
 13  day                   204000 non-null  int32  
 14  month                 204000 non-null  int32  
 15  

In [30]:
train_y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 204000 entries, 0 to 203999
Series name: power_consumption
Non-Null Count   Dtype  
--------------   -----  
204000 non-null  float64
dtypes: float64(1)
memory usage: 1.6 MB


In [31]:
#from xgboost import XGBClassifier
from xgboost import XGBRegressor
model = XGBRegressor()
model.fit(train_x, train_y, eval_metric='rmse')
#model.fit(train_x, y_train, eval_metric='rmse')

## Test Data Pre-Processing

In [None]:
#test_df['month'] = test_df['일시'].apply(lambda x : int(x[4:6]))
#test_df['day'] = test_df['일시'].apply(lambda x : int(x[6:8]))
#test_df['time'] = test_df['일시'].apply(lambda x : int(x[9:11]))

In [None]:
#test_x = test_df.drop(columns=['num_date_time', '일시'])

In [34]:
test_df

Unnamed: 0,building_number,temperature,rainfall,windspeed,humidity,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity,hour,day,month,year
0,1,23.5,0.0,2.2,72,110634.00,39570.00,0.0,0.0,0.0,0,25,8,2022
1,1,23.0,0.0,0.9,72,110634.00,39570.00,0.0,0.0,0.0,1,25,8,2022
2,1,22.7,0.0,1.5,75,110634.00,39570.00,0.0,0.0,0.0,2,25,8,2022
3,1,22.1,0.0,1.3,78,110634.00,39570.00,0.0,0.0,0.0,3,25,8,2022
4,1,21.8,0.0,1.0,77,110634.00,39570.00,0.0,0.0,0.0,4,25,8,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16795,100,22.5,0.0,0.9,84,57497.84,40035.23,0.0,0.0,0.0,19,31,8,2022
16796,100,20.7,0.0,0.4,95,57497.84,40035.23,0.0,0.0,0.0,20,31,8,2022
16797,100,20.2,0.0,0.4,98,57497.84,40035.23,0.0,0.0,0.0,21,31,8,2022
16798,100,20.1,0.0,1.1,97,57497.84,40035.23,0.0,0.0,0.0,22,31,8,2022


In [37]:
train_x

Unnamed: 0,building_number,temperature,rainfall,windspeed,humidity,sunshine,solar_radiation,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity,hour,day,month,year
0,1,18.6,0.0,0.9,42.0,0.0,0.0,110634.00,39570.00,0.0,0.0,0.0,0,1,6,2022
1,1,18.0,0.0,1.1,45.0,0.0,0.0,110634.00,39570.00,0.0,0.0,0.0,1,1,6,2022
2,1,17.7,0.0,1.5,45.0,0.0,0.0,110634.00,39570.00,0.0,0.0,0.0,2,1,6,2022
3,1,16.7,0.0,1.4,48.0,0.0,0.0,110634.00,39570.00,0.0,0.0,0.0,3,1,6,2022
4,1,18.4,0.0,2.8,43.0,0.0,0.0,110634.00,39570.00,0.0,0.0,0.0,4,1,6,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100,23.1,0.0,0.9,86.0,0.5,0.0,57497.84,40035.23,0.0,0.0,0.0,19,24,8,2022
203996,100,22.4,0.0,1.3,86.0,0.0,0.0,57497.84,40035.23,0.0,0.0,0.0,20,24,8,2022
203997,100,21.3,0.0,1.0,92.0,0.0,0.0,57497.84,40035.23,0.0,0.0,0.0,21,24,8,2022
203998,100,21.0,0.0,0.3,94.0,0.0,0.0,57497.84,40035.23,0.0,0.0,0.0,22,24,8,2022


In [32]:
test_x = test_df

## Inference

In [33]:
preds = model.predict(test_x)

ValueError: feature_names mismatch: ['building_number', 'temperature', 'rainfall', 'windspeed', 'humidity', 'sunshine', 'solar_radiation', 'total_area', 'cooling_area', 'solar_power_capacity', 'ess_capacity', 'pcs_capacity', 'hour', 'day', 'month', 'year'] ['building_number', 'temperature', 'rainfall', 'windspeed', 'humidity', 'total_area', 'cooling_area', 'solar_power_capacity', 'ess_capacity', 'pcs_capacity', 'hour', 'day', 'month', 'year']
expected sunshine, solar_radiation in input data

## Submission

In [37]:
submission = pd.read_csv('./sample_submission.csv')
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,0
1,1_20220825 01,0
2,1_20220825 02,0
3,1_20220825 03,0
4,1_20220825 04,0
...,...,...
16795,100_20220831 19,0
16796,100_20220831 20,0
16797,100_20220831 21,0
16798,100_20220831 22,0


In [38]:
submission['answer'] = preds
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1902.357422
1,1_20220825 01,1799.604858
2,1_20220825 02,1711.548218
3,1_20220825 03,1642.777222
4,1_20220825 04,1642.777222
...,...,...
16795,100_20220831 19,1121.414062
16796,100_20220831 20,996.681335
16797,100_20220831 21,801.680969
16798,100_20220831 22,668.003418


In [39]:
submission.to_csv('./xgboost_v_1_2_submission.csv', index=False)