# Environment

In [9]:
import numpy as np
import pandas as pd
import pickle
import os
import random
import torch
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse

# Use the first CUDA GPU when one is visible to torch; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [3]:
def seed_everything(seed: int=302):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators. Defaults to 302.

    Returns:
        None. The function only mutates global RNG / backend state.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Setting PYTHONHASHSEED at runtime does not change hashing in the already
    # running interpreter; it only affects child processes started afterwards.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; this is silently
    # ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # benchmark=True lets cuDNN auto-tune and pick possibly different kernels
    # on each run, which defeats the determinism requested above.
    torch.backends.cudnn.benchmark = False  # type: ignore

In [4]:
# Seed all RNGs up front with the function's default seed (302).
seed_everything()

In [6]:
# Fine-dust (PM) monitoring stations.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load loc_match.pkl if it comes from a trusted source.
with open('loc_match.pkl', 'rb') as f:
    loc_match = pickle.load(f)

# Weather columns selected from each station's CSV:
# temperature, wind direction, wind speed, precipitation, humidity.
cols = ['기온', '풍향', '풍속', '강수량', '습도']

26298


# Regression

In [32]:
# Per-station random-forest regression.
#
# For every station in `loc_match`:
#   1. load the station's train/test CSVs and take the `cols` features,
#   2. pick n_estimators by validation MSE on a chronological 75/25 split,
#   3. refit on the full training data with the chosen n_estimators,
#   4. predict the test set and collect the slices that go into the submission.
submit = list()

# Fraction of each station's training rows used for fitting during the
# n_estimators search; the remaining rows are held out for validation.
train_ratio = 0.75

for pm_loc in loc_match:
    print('>>>>>>>>>> {} 시작 >>>>>>>>>>'.format(pm_loc))
    # Load the data. Features and target are taken column-wise in one shot
    # instead of iterating row by row with iterrows().
    df_train = pd.read_csv('data/{}train.csv'.format(pm_loc))
    x = df_train[cols].to_numpy()
    y = df_train['PM2.5'].to_numpy()

    # Compute the split from THIS station's length so the ratio holds even if
    # station files differ in size (it used to be derived from one file only).
    train_split = int(len(df_train) * train_ratio)
    train_x, vali_x = x[:train_split], x[train_split:]
    train_y, vali_y = y[:train_split], y[train_split:]

    df_test = pd.read_csv('data/{}test.csv'.format(pm_loc))
    test_x = df_test[cols].to_numpy()

    # max_features = number of features (every feature is considered per split).
    # Tune by varying n_estimators (the number of trees).
    # A larger n_estimators can help guard against overfitting.
    max_features = len(cols)
    nTreeList = np.arange(50, 200, 10)  # candidate tree counts: 50, 60, ..., 190
    loss_list = list()

    # Evaluate validation MSE for each candidate number of trees.
    for nTree in nTreeList:
        rfr = RandomForestRegressor(n_estimators=nTree,
                                    max_features=max_features,
                                    random_state=302)
        rfr.fit(train_x, train_y)
        preds = rfr.predict(vali_x)
        # sklearn's signature is mse(y_true, y_pred).
        loss_list.append(mse(vali_y, preds))
    print('>>>>>>>>>> {} nTree 탐색 완료 >>>>>>>>>>'.format(pm_loc))

    # Train and predict with the tree count that gave the lowest validation MSE
    # (np.argmin returns the first minimum, matching list.index(min(...))).
    nTree = nTreeList[int(np.argmin(loss_list))]
    print('>>>>>>>>>> 선택된 트리의 개수: {} >>>>>>>>>>'.format(nTree))
    rfr = RandomForestRegressor(n_estimators=nTree,
                                max_features=max_features,
                                random_state=302)
    # Refit on the full training set (train + validation rows).
    rfr.fit(x, y)
    preds = rfr.predict(test_x)
    preds = preds.flatten()

    # Keep preds[24*(n-3):24*n] for n = 5, 10, ..., 320, i.e. the last 72
    # predictions of every consecutive 120-row window (64 windows).
    # NOTE(review): presumably the test rows are hourly and grouped in 5-day
    # windows of which the last 3 days are scored — confirm against the data.
    for n in range(5, 321, 5):
        submit.append(preds[24*(n-3):24*n])

    print('>>>>>>>>>> {} 예측 완료 >>>>>>>>>>'.format(pm_loc))
    print('---------------------------------')
    # Total number of individual predictions collected so far.
    print('현재 예측한 일수: {}'.format(sum(len(chunk) for chunk in submit)))
    print('---------------------------------')

>>>>>>>>>> 공주 시작 >>>>>>>>>>
>>>>>>>>>> 공주 nTree 탐색 완료 >>>>>>>>>>
>>>>>>>>>> 선택된 트리의 개수: 190 >>>>>>>>>>
>>>>>>>>>> 공주 예측 완료 >>>>>>>>>>
---------------------------------
현재 예측한 일수: 4608
---------------------------------
>>>>>>>>>> 노은동 시작 >>>>>>>>>>
>>>>>>>>>> 노은동 nTree 탐색 완료 >>>>>>>>>>
>>>>>>>>>> 선택된 트리의 개수: 190 >>>>>>>>>>
>>>>>>>>>> 노은동 예측 완료 >>>>>>>>>>
---------------------------------
현재 예측한 일수: 9216
---------------------------------
>>>>>>>>>> 논산 시작 >>>>>>>>>>
>>>>>>>>>> 논산 nTree 탐색 완료 >>>>>>>>>>
>>>>>>>>>> 선택된 트리의 개수: 170 >>>>>>>>>>
>>>>>>>>>> 논산 예측 완료 >>>>>>>>>>
---------------------------------
현재 예측한 일수: 13824
---------------------------------
>>>>>>>>>> 대천2동 시작 >>>>>>>>>>
>>>>>>>>>> 대천2동 nTree 탐색 완료 >>>>>>>>>>
>>>>>>>>>> 선택된 트리의 개수: 140 >>>>>>>>>>
>>>>>>>>>> 대천2동 예측 완료 >>>>>>>>>>
---------------------------------
현재 예측한 일수: 18432
---------------------------------
>>>>>>>>>> 독곶리 시작 >>>>>>>>>>
>>>>>>>>>> 독곶리 nTree 탐색 완료 >>>>>>>>>>
>>>>>>>>>> 선택된 트리의 개수: 190 >>>>>>>>>>
>>>>>>>>>> 독

# Submit

In [33]:
# Collapse the collected prediction chunks into a single flat 1-D vector.
submit = np.array(submit).reshape(-1)

In [34]:
# Start from the sample submission and replace its PM2.5 column with the
# flattened predictions; the frame is the cell's displayed value.
answer_ensemble = pd.read_csv('../answer_sample.csv').assign(**{'PM2.5': submit})
answer_ensemble

Unnamed: 0,연도,일시,측정소,PM2.5
0,4,01-03 00:00,공주,0.071726
1,4,01-03 01:00,공주,0.080063
2,4,01-03 02:00,공주,0.067179
3,4,01-03 03:00,공주,0.063600
4,4,01-03 04:00,공주,0.059726
...,...,...,...,...
78331,4,11-16 19:00,홍성읍,0.111053
78332,4,11-16 20:00,홍성읍,0.101838
78333,4,11-16 21:00,홍성읍,0.116117
78334,4,11-16 22:00,홍성읍,0.116469


In [35]:
# Write the predictions out as the submission file.
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first
# column by default — confirm this matches the expected submission format.
answer_ensemble.to_csv('answer_rfr.csv')

# Public Score
- 11.68964547