## Import

In [2]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings(action='ignore') 

## Fixed Random-Seed

In [3]:
def seed_everything(seed):
    """Seed the global random number generators of ``random`` and NumPy.

    The seed is also written, as a string, to the ``PYTHONHASHSEED``
    environment variable.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


seed_everything(42)  # fix the seed

## Load Data

In [4]:
# Read the training and test tables from CSV files in the current working directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

## Train Data Pre-Processing

In [5]:
# Replace every missing value in the training table with 0.
train_df = train_df.fillna(value=0)

In [6]:
# To let the model pick up time-series patterns, split the timestamp column
# (strings such as "20220601 00") into month, day and hour-of-day integers.
train_df['month'] = train_df['일시'].str[4:6].astype('int64')
train_df['day'] = train_df['일시'].str[6:8].astype('int64')
train_df['time'] = train_df['일시'].str[9:11].astype('int64')

In [7]:
# Target: the power-consumption column.
train_y = train_df['전력소비량(kWh)']

# Features: everything except the row id, the raw timestamp, the sunshine and
# solar-radiation columns, and the target itself.
# NOTE(review): the sunshine/solar columns are presumably dropped because the
# test table does not provide them — confirm.
train_x = train_df.drop(
    columns=[
        'num_date_time',
        '일시',
        '일조(hr)',
        '일사(MJ/m2)',
        '전력소비량(kWh)',
    ]
)

In [10]:
# Preview the first five rows of the processed training table.
train_df.head(n=5)

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),month,day,time
0,1_20220601 00,1,20220601 00,18.6,0.0,0.9,42.0,0.0,0.0,1085.28,6,1,0
1,1_20220601 01,1,20220601 01,18.0,0.0,1.1,45.0,0.0,0.0,1047.36,6,1,1
2,1_20220601 02,1,20220601 02,17.7,0.0,1.5,45.0,0.0,0.0,974.88,6,1,2
3,1_20220601 03,1,20220601 03,16.7,0.0,1.4,48.0,0.0,0.0,953.76,6,1,3
4,1_20220601 04,1,20220601 04,18.4,0.0,2.8,43.0,0.0,0.0,986.4,6,1,4


In [11]:
# Preview the first five rows of the feature matrix.
train_x.head(n=5)

Unnamed: 0,건물번호,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,time
0,1,18.6,0.0,0.9,42.0,6,1,0
1,1,18.0,0.0,1.1,45.0,6,1,1
2,1,17.7,0.0,1.5,45.0,6,1,2
3,1,16.7,0.0,1.4,48.0,6,1,3
4,1,18.4,0.0,2.8,43.0,6,1,4


In [12]:
# Preview the first five target values.
train_y.head(n=5)

0    1085.28
1    1047.36
2     974.88
3     953.76
4     986.40
Name: 전력소비량(kWh), dtype: float64

## y label Encoding

In [18]:
from sklearn.preprocessing import LabelEncoder
# Map each distinct target value to an integer code.
# NOTE(review): the model-fitting cells visible in this notebook train on
# `train_y`, not on `y_train` — confirm whether this encoding is still needed.
le = LabelEncoder()
y_train = le.fit_transform(train_y)

In [24]:
# Display the encoded target.
y_train

array([9467, 9024, 8182, ..., 6435, 4409, 3264], dtype=int64)

In [22]:
# Inspect the Python type of the encoded target.
type(y_train)

numpy.ndarray

In [25]:
# Wrap the encoded target in a pandas Series.
y_train = pd.Series(y_train)

In [26]:
# Inspect the Python type again after the conversion.
type(y_train)

pandas.core.series.Series

In [27]:
# Preview the first five encoded target values.
y_train.head(n=5)

0    9467
1    9024
2    8182
3    7920
4    8320
dtype: int64

In [23]:
# Inspect the Python type of the original (un-encoded) target.
type(train_y)

pandas.core.series.Series

## Regression Model Fit

In [13]:
! pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB 330.3 kB/s eta 0:05:02
   ---------------------------------------- 0.1/99.8 MB 491.5 kB/s eta 0:03:23
   ---------------------------------------- 0.2/99.8 MB 833.5 kB/s eta 0:02:00
   ---------------------------------------- 0.2/99.8 MB 1.0 MB/s eta 0:01:37
   ---------------------------------------- 0.3/99.8 MB 1.2 MB/s eta 0:01:27
   ---------------------------------------- 0.4/99.8 MB 1.3 MB/s eta 0:01:18
   ---------------------------------------- 0.5/99.8 MB 1.4 MB/s eta 0:01:12
   ---------------------------------------- 0.6/99.8 MB 1.5 MB/s eta 0:01:06
   ------------------

In [29]:
from xgboost import XGBRegressor

# Gradient-boosted tree regressor fitted on the raw power-consumption target.
# `eval_metric` is set on the estimator itself: passing it to `fit()` has been
# deprecated since XGBoost 1.6 and is no longer accepted by newer releases.
model = XGBRegressor(eval_metric='rmse')
model.fit(train_x, train_y)

In [7]:
# Random-forest baseline, kept for comparison. It is bound to its own name so
# that running the notebook top to bottom does not replace the XGBoost `model`
# that the inference cell uses for the submission. `random_state` is fixed so the
# fit does not depend on the state of NumPy's global generator.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(train_x, train_y)

## Test Data Pre-Processing

In [30]:
# Derive the same month / day / hour-of-day integer features for the test table
# by slicing its timestamp strings.
test_df['month'] = test_df['일시'].str[4:6].astype('int64')
test_df['day'] = test_df['일시'].str[6:8].astype('int64')
test_df['time'] = test_df['일시'].str[9:11].astype('int64')

In [31]:
# Build the test features from exactly the columns (and column order) the model
# was trained on, and fill missing values with 0 as was done for the training
# table. A column missing from the test table raises a KeyError here instead of
# failing later inside `predict`.
test_x = test_df[train_x.columns].fillna(0)

## Inference

In [32]:
# Generate predictions for the test features with the fitted `model`.
preds = model.predict(test_x)

## Submission

In [33]:
# Load the sample submission file, which provides the `num_date_time` keys and an
# `answer` column to fill in, then display it.
submission = pd.read_csv('./sample_submission.csv')
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,0
1,1_20220825 01,0
2,1_20220825 02,0
3,1_20220825 03,0
4,1_20220825 04,0
...,...,...
16795,100_20220831 19,0
16796,100_20220831 20,0
16797,100_20220831 21,0
16798,100_20220831 22,0


In [34]:
# Overwrite the `answer` column with the model predictions and display the result.
submission['answer'] = preds
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,2090.551758
1,1_20220825 01,1914.520142
2,1_20220825 02,1980.594604
3,1_20220825 03,1962.239624
4,1_20220825 04,1805.864990
...,...,...
16795,100_20220831 19,1120.587280
16796,100_20220831 20,974.620361
16797,100_20220831 21,801.317139
16798,100_20220831 22,700.727295


In [36]:
# Write the submission to CSV without the DataFrame index.
submission.to_csv('./xgboost_v_1_0_submission.csv', index=False)