# 라이브러리

In [None]:
!pip install scipy
!pip install sklearn
!pip install xgboost
!pip install optuna

In [None]:
import pandas as pd
import numpy as np

from scipy import interpolate

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

import xgboost as xgb
from xgboost import XGBRegressor

import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

import joblib

In [None]:
import warnings
warnings.filterwarnings('ignore')

##### 파이썬 및 패키지 버전

In [None]:
import sys
print('Python version :', sys.version)
print('pandas version :', pd.__version__)
print('numpy version :', np.__version__)
print('xgboost version :', xgb.__version__)

Python version : 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]
pandas version : 1.5.3
numpy version : 1.23.5
xgboost version : 1.7.6


# 데이터 전처리

In [None]:
q1_data = pd.read_csv('Q1_train.csv')
q1_test = pd.read_csv('Q1_test.csv', index_col = 0)
q1_data.head()

#### dtype 맞게 전처리

datetime : object &rightarrow; datetime

In [None]:
q1_data['datetime'] = pd.to_datetime(q1_data['datetime'])
q1_test['datetime'] = pd.to_datetime(q1_test['datetime'])

#### ratio 변수 통일 + 변수 drop

연결 성공한 횟수 &rightarrow; 연결 성공 백분율로 변경

In [None]:
# 연결 성공한 횟수 -> 연결 성공 백분율로 변경
q1_data['erabaddratio'] = (q1_data['erabaddsucc'] / q1_data['erabaddatt']) * 100
q1_data['endcaddratio'] = (q1_data['endcaddsucc'] / q1_data['endcaddatt']) * 100
q1_data['endcmodbymenbratio'] = (q1_data['endcmodbymenbsucc'] / q1_data['endcmodbymenbatt']) * 100
q1_data['endcmodbysgnbratio'] = (q1_data['endcmodbysgnbsucc'] / q1_data['endcmodbysgnbatt']) * 100
q1_data['connestabratio'] = (q1_data['connestabsucc'] / q1_data['connestabatt']) * 100
q1_data['handoverratio'] = (q1_data['handoversucc'] / q1_data['handoveratt']) * 100
q1_data['reestabratio'] = (q1_data['reestabsucc'] / q1_data['reestabatt']) * 100

In [None]:
# 연결 성공한 횟수 -> 연결 성공 백분율로 변경
q1_test['erabaddratio'] = (q1_test['erabaddsucc'] / q1_test['erabaddatt']) * 100
q1_test['endcaddratio'] = (q1_test['endcaddsucc'] / q1_test['endcaddatt']) * 100
q1_test['endcmodbymenbratio'] = (q1_test['endcmodbymenbsucc'] / q1_test['endcmodbymenbatt']) * 100
q1_test['endcmodbysgnbratio'] = (q1_test['endcmodbysgnbsucc'] / q1_test['endcmodbysgnbatt']) * 100
q1_test['connestabratio'] = (q1_test['connestabsucc'] / q1_test['connestabatt']) * 100
q1_test['handoverratio'] = (q1_test['handoversucc'] / q1_test['handoveratt']) * 100
q1_test['reestabratio'] = (q1_test['reestabsucc'] / q1_test['reestabatt']) * 100

In [None]:
# 연결 시도한 횟수가 0이어서 성공 백분율이 nan이 된 값들을 0으로 바꿔주기
ratios = ['erabaddratio', 'endcaddratio', 'endcmodbymenbratio', 'endcmodbysgnbratio', 'connestabratio', 'handoverratio', 'reestabratio']
for i in ratios :
  q1_data[i] = q1_data[i].replace(np.nan, 0)

In [None]:
# 연결 시도한 횟수가 0이어서 성공 백분율이 nan이 된 값들을 0으로 바꿔주기
for i in ratios :
  q1_test[i] = q1_test[i].replace(np.nan, 0)

In [None]:
list = ['erabaddsucc', 'endcaddsucc', 'endcmodbymenbsucc', 'endcmodbysgnbsucc', 'connestabsucc', 'handoversucc', 'reestabsucc']
q1_data = q1_data.drop(list, axis = 1)

In [None]:
q1_test = q1_test.drop(list, axis = 1)

In [None]:
# 순서 재배치 + scgfail, redirectiontolte_emergencyfallback, numrar drop
col = q1_data.columns.to_numpy()
col = col[[0,1,3,4,32,5,33,6,34,7,35,8,36,9,10,12,37,13,38,14,15,16,17,18,19,20,21,22,23,24,26,27,28,29,30,31]]
q1_data = q1_data[col]

In [None]:
col = q1_test.columns.to_numpy()
col = col[[0,1,3,4,31,5,32,6,33,7,34,8,35,9,10,12,36,13,37,14,15,16,17,18,19,20,21,22,23,24,26,27,28,29,30]]
q1_test = q1_test[col]

#### 결측치 채우기

선형보간법으로 결측값을 채워줌

In [None]:
q1_data.isna().sum()

In [None]:
# station E, F, G에는 값이 하나씩 적게 들어가 있음 -> 이것도 선형보간법으로 채워줌
q1_data['ru_id'].value_counts()

##### Station별로 분리

In [None]:
StationA = q1_data[q1_data['ru_id'] == 'BaseStationA']
StationC = q1_data[q1_data['ru_id'] == 'BaseStationC']
StationD = q1_data[q1_data['ru_id'] == 'BaseStationD']
StationE = q1_data[q1_data['ru_id'] == 'BaseStationE']
StationF = q1_data[q1_data['ru_id'] == 'BaseStationF']
StationG = q1_data[q1_data['ru_id'] == 'BaseStationG']
StationH = q1_data[q1_data['ru_id'] == 'BaseStationH']
StationI = q1_data[q1_data['ru_id'] == 'BaseStationI']

In [None]:
StationA.reset_index(drop = True, inplace = True)
StationC.reset_index(drop = True, inplace = True)
StationD.reset_index(drop = True, inplace = True)
StationE.reset_index(drop = True, inplace = True)
StationF.reset_index(drop = True, inplace = True)
StationG.reset_index(drop = True, inplace = True)
StationH.reset_index(drop = True, inplace = True)
StationI.reset_index(drop = True, inplace = True)

In [None]:
StationB = q1_test[q1_test['ru_id'] == 'BaseStationB']
StationJ = q1_test[q1_test['ru_id'] == 'BaseStationJ']

In [None]:
StationB.reset_index(drop = True, inplace = True)
StationJ.reset_index(drop = True, inplace = True)

##### 선형 보간법

In [None]:
missing_col = ['rlculbyte', 'rlcdlbyte', 'totprbulavg', 'totprbdlavg', 'dlreceivedriavg', 'dltransmittedmcsavg',
               'airmaculbyte', 'airmacdlbyte', 'bler_ul', 'bler_dl', 'rachpreamblea', 'nummsg3', 'attpaging',
               'rssipathavg', 'dlreceivedcqiavg']

In [None]:
for i in missing_col :
  StationE[i].interpolate(method = 'linear', inplace = True)
  StationF[i].interpolate(method = 'linear', inplace = True)
  StationG[i].interpolate(method = 'linear', inplace = True)

E, F, G 2023-06-28 00:55:00 값도 선형보간법으로 채워준 후, int 형태의 변수들이 float값을 가지지 않도록 반올림 해주기

In [None]:
int_list = ['erabaddatt', 'endcaddatt', 'endcmodbymenbatt', 'endcmodbysgnbatt', 'connestabatt', 'redirectiontolte_coverageout', 'redirectiontolte_epsfallback', 'handoveratt', 'reestabatt', 'endcrelbymenb', 'uenomax',
            'rachpreamblea', 'nummsg3', 'attpaging']

In [None]:
df1 = pd.DataFrame(['2023-06-28 00:55:00', 'BaseStationE'], index = ['datetime', 'ru_id']).transpose()
df2 = pd.DataFrame((StationE.iloc[11529, 2:] + StationE.iloc[11530, 2:]) / 2).transpose()
E_new = pd.concat([df1, df2], axis = 1)
E_new['datetime'] = pd.to_datetime(E_new['datetime'])
StationE = pd.concat([StationE.iloc[:11530], E_new, StationE.iloc[11530:]], ignore_index = True)

In [None]:
df1 = pd.DataFrame(['2023-06-28 00:55:00', 'BaseStationF'], index = ['datetime', 'ru_id']).transpose()
df2 = pd.DataFrame((StationF.iloc[11529, 2:] + StationF.iloc[11530, 2:]) / 2).transpose()
F_new = pd.concat([df1, df2], axis = 1)
F_new['datetime'] = pd.to_datetime(F_new['datetime'])
StationF = pd.concat([StationF.iloc[:11530], F_new, StationF.iloc[11530:]], ignore_index = True)

In [None]:
df1 = pd.DataFrame(['2023-06-28 00:55:00', 'BaseStationG'], index = ['datetime', 'ru_id']).transpose()
df2 = pd.DataFrame((StationG.iloc[11529, 2:] + StationG.iloc[11530, 2:]) / 2).transpose()
G_new = pd.concat([df1, df2], axis = 1)
G_new['datetime'] = pd.to_datetime(G_new['datetime'])
StationG = pd.concat([StationG.iloc[:11530], G_new, StationG.iloc[11530:]], ignore_index = True)

In [None]:
StationE[int_list] = StationE[int_list].round(decimals = 0)
StationE[int_list] = StationE[int_list].astype(int)

StationF[int_list] = StationF[int_list].round(decimals = 0)
StationF[int_list] = StationF[int_list].astype(int)

StationG[int_list] = StationG[int_list].round(decimals = 0)
StationG[int_list] = StationG[int_list].astype(int)

#### 전처리 완료한 데이터

In [None]:
train_df = pd.concat([StationA, StationC, StationD, StationE, StationF, StationG, StationH, StationI])
train_df.reset_index(drop = True, inplace = True)
test_df = q1_test

# 모델링

pycaret을 참고하여 XGBoost를 사용하기로 결정

## 모델 1.

전처리를 마친 데이터에 변수 추가 없이 모델링

In [64]:
# datetime과 ru_id 제거
data = train_df.iloc[:, 2:]

In [None]:
# X와 y로 분할
X, y = data.iloc[:, :-1], data.iloc[:, [-1]]

# train : valid = 7 : 3 분할
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.3, shuffle = False, random_state = 30)
print(X_train.shape, X_valid.shape)

In [66]:
# scaling
ss = StandardScaler()

X_train_scale = ss.fit_transform(X_train)
X_valid_scale = ss.transform(X_valid)

#### Optuna를 이용하여 parameter 튜닝 진행(예시 코드)

In [None]:
'''
def objectiveXGB(trial: Trial, X_train, y_train, X_valid, y_valid):
    param = {
        "n_estimators" : trial.suggest_int('n_estimators', 500, 4000),
        'max_depth': trial.suggest_int('max_depth', 4, 16),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
        'gamma': trial.suggest_discrete_uniform('gamma', 0.01, 0.5, 0.01),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10),
        'nthread' : -1,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'random_state': 1
    }

    model = XGBRegressor(**param)
    xgb_model = model.fit(X_train, y_train, verbose = False,
                          eval_metric = 'mae', eval_set = [(X_valid, y_valid)], early_stopping_rounds = 100)

    ## MAE로 Loss 계산
    score = mean_absolute_error(xgb_model.predict(X_valid), y_valid)

    return score
'''

In [None]:
'''
study = optuna.create_study(direction = 'minimize', sampler = TPESampler())

study.optimize(lambda trial : objectiveXGB(trial, X_train_scale, y_train, X_valid_scale, y_valid), n_trials = 100)
print('Best trial: score {},\nparams {}'.format(study.best_trial.value, study.best_trial.params))
'''

In [None]:
'''
xgboost = XGBRegressor(random_state = 1,
                        **study.best_trial.params)
'''

#### 최적의 parameter로 모델링

In [None]:
params = {'n_estimators': 1492, 'max_depth': 15, 'min_child_weight': 2, 'gamma': 0.4,
          'learning_rate': 0.0038565758063650217, 'colsample_bytree': 0.7, 'subsample': 0.8,
          'lambda': 4.076587859420142, 'alpha': 0.5270409349856588}

In [None]:
xgboost = XGBRegressor(random_state = 1, tree_method = 'gpu_hist',
                        **params)

In [None]:
xgboost.fit(X_train_scale, y_train,
            eval_metric = 'mae', eval_set = [(X_valid_scale, y_valid)], early_stopping_rounds = 100,
            verbose = 100)

In [67]:
# validation set에 대한 예측값으로 계산한 mae
y_pred = xgboost.predict(X_valid_scale)
valid_mae = mean_absolute_error(y_valid, y_pred)
print('valid set에서의 mae :', valid_mae)

valid set에서의 mae : 0.4954013516641977


In [68]:
# 예측값을 반올림 한 뒤의 mae
y_pred = xgboost.predict(X_valid_scale)
y_pred_round = y_pred.round(0)
valid_mae2 = mean_absolute_error(y_valid, y_pred_round)
print('valid set에서의 mae(반올림 한 뒤) :', valid_mae2)

valid set에서의 mae(반올림 한 뒤) : 0.4384867224445253


In [None]:
# 모델 저장
joblib.dump(xgboost, 'model_1.pkl')

## 모델 2.

#### 시간, 공휴일 여부 변수 추가

In [69]:
train_df2 = train_df.copy()
test_df2 = test_df.copy()

In [70]:
date = pd.to_datetime(train_df2.datetime)
train_df2['hour'] = date.dt.hour
train_df2['day'] = date.dt.weekday

date2 = pd.to_datetime(test_df2.datetime)
test_df2['hour'] = date2.dt.hour
test_df2['day'] = date2.dt.weekday

In [71]:
# 공휴일 여부(공휴일이면 1, 아니면 0)
## 주말
train_df2['holiday'] = train_df2['day'].apply(lambda x: 1 if x >= 5 else 0)
test_df2['holiday'] = test_df2['day'].apply(lambda x: 1 if x >= 5 else 0)
## 공휴일(5/29, 6,6)
train_df2[('2023-05-29' <= train_df2.datetime) & (train_df2.datetime < '2023-05-30')]['holiday'] = 1
train_df2[('2023-06-06' <= train_df2.datetime) & (train_df2.datetime < '2023-06-07')]['holiday'] = 1
test_df2[('2023-05-29' <= test_df2.datetime) & (test_df2.datetime < '2023-05-30')]['holiday'] = 1
test_df2[('2023-06-06' <= test_df2.datetime) & (test_df2.datetime < '2023-06-07')]['holiday'] = 1

In [72]:
# 0시와 23시는 가까운 시간대이지만, 숫자 상으로는 멀게 받아들여질 수 있음
# 따라서 sin, cos 변환을 하여 시간의 순환적 성격을 반영하기로 함
train_df2['cos_time'] = np.cos(2*np.pi*(train_df2.hour/24))
train_df2['sin_time'] = np.sin(2*np.pi*(train_df2.hour/24))

test_df2['cos_time'] = np.cos(2*np.pi*(test_df2.hour/24))
test_df2['sin_time'] = np.sin(2*np.pi*(test_df2.hour/24))

In [73]:
train_df2.drop(['hour', 'day'], axis = 1, inplace = True)
test_df2.drop(['hour', 'day'], axis = 1, inplace = True)

In [74]:
# 변수 순서 바꿔주기
train_df2 = train_df2[train_df2.columns[:35].to_list() + train_df2.columns[36:].to_list() + train_df2.columns[35:36].to_list()]

In [None]:
train_df2.head()

In [75]:
# X와 y로 분할
X, y = train_df2.iloc[:, 2:-1], train_df2.iloc[:, [-1]]

# train : valid = 7 : 3 분할
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.3, shuffle = False, random_state = 30)
print(X_train.shape, X_valid.shape)

(96213, 36) (41235, 36)


In [76]:
# scaling
ss2 = StandardScaler()

not_holiday = [col for col in X_train.columns if col not in ['hoilday']]

X_train_scale = ss2.fit_transform(X_train[not_holiday])
X_valid_scale = ss2.transform(X_valid[not_holiday])
print(X_train_scale.shape, X_valid_scale.shape)

(96213, 36) (41235, 36)


#### Optuna를 이용하여 parameter 튜닝 진행(예시 코드)

In [None]:
'''
def objectiveXGB(trial: Trial, X_train, y_train, X_valid, y_valid):
    param = {
        "n_estimators" : trial.suggest_int('n_estimators', 500, 4000),
        'max_depth': trial.suggest_int('max_depth', 4, 20),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
        'gamma': trial.suggest_discrete_uniform('gamma', 0.01, 0.5, 0.01),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10),
        'nthread' : 1,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'random_state': 1
    }

    model = XGBRegressor(**param)
    xgb_model = model.fit(X_train, y_train, verbose = False,
                          eval_metric = 'mae', eval_set = [(X_valid, y_valid)], early_stopping_rounds = 100)

    ## MAE로 Loss 계산
    score = mean_absolute_error(xgb_model.predict(X_valid), y_valid)

    return score
'''

In [None]:
'''
study2 = optuna.create_study(direction = 'minimize', sampler = TPESampler())

study2.optimize(lambda trial : objectiveXGB(trial, X_train_scale, y_train, X_valid_scale, y_valid), n_trials = 50)
print('Best trial: score {},\nparams {}'.format(study2.best_trial.value, study2.best_trial.params))
'''

#### 최적의 parameter로 모델링

In [None]:
params2 = {'n_estimators': 3015, 'max_depth': 19, 'min_child_weight': 1, 'gamma': 0.02,
           'learning_rate': 0.014374633885619992, 'colsample_bytree': 0.7, 'subsample': 0.9,
           'lambda': 4.673092181226084, 'alpha': 0.006110253574773731}

In [None]:
xgboost2 = XGBRegressor(random_state = 1, tree_method = 'gpu_hist',
                        **params2)

In [None]:
xgboost2.fit(X_train_scale, y_train,
             eval_metric = 'mae', eval_set = [(X_valid_scale, y_valid)], early_stopping_rounds = 100,
             verbose = 100)

In [77]:
# validation set에 대한 예측값으로 계산한 mae
y_pred = xgboost2.predict(X_valid_scale)
valid_mae = mean_absolute_error(y_valid, y_pred)
print('valid set에서의 mae :', valid_mae)

valid set에서의 mae : 0.49829781669686307


In [78]:
# 예측값을 반올림 한 뒤의 mae
y_pred = xgboost2.predict(X_valid_scale)
y_pred_round = y_pred.round(0)
valid_mae2 = mean_absolute_error(y_valid, y_pred_round)
print('valid set에서의 mae(반올림 한 뒤) :', valid_mae2)

valid set에서의 mae(반올림 한 뒤) : 0.4417363889899357


In [None]:
# 모델 저장
joblib.dump(xgboost2, 'model_2.pkl')

## 모델 3.

전처리를 마친 데이터에 시간, 공휴일 여부 변수를 추가

holiday(공휴일 여부) 별로 나누어서 모델링

In [79]:
StationB = test_df2[test_df2['ru_id'] == 'BaseStationB']
StationJ = test_df2[test_df2['ru_id'] == 'BaseStationJ']
StationB.reset_index(drop = True, inplace = True)
StationJ.reset_index(drop = True, inplace = True)

#### holiday 여부로 데이터 분리

In [80]:
train_h = train_df2[train_df2['holiday'] == 1]
train_n = train_df2[train_df2['holiday'] == 0]

B_h = StationB[StationB['holiday'] == 1]
B_n = StationB[StationB['holiday'] == 0]
J_h = StationJ[StationJ['holiday'] == 1]
J_n = StationJ[StationJ['holiday'] == 0]

#### Optuna를 이용하여 parameter 튜닝 진행(예시 코드)

In [None]:
'''
def objectiveXGB(trial: Trial, X_train, y_train, X_valid, y_valid):
    param = {
        "n_estimators" : trial.suggest_int('n_estimators', 100, 4000),
        'max_depth': trial.suggest_int('max_depth', 4, 36),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
        'gamma': trial.suggest_discrete_uniform('gamma', 0.01, 0.5, 0.01),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1, 0.1),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1, 0.1),
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10),
        'nthread' : -1,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'random_state': 1
    }

    model = XGBRegressor(**param)
    xgb_model = model.fit(X_train, y_train, verbose = False,
                          eval_metric = 'mae', eval_set = [(X_valid, y_valid)], early_stopping_rounds = 100)

    ## MAE로 Loss 계산
    score = mean_absolute_error(xgb_model.predict(X_valid), y_valid)

    return score
'''

#### holiday = 1 모델링

In [81]:
data_h = train_h.iloc[:, 2:]
X_h, y_h = data_h.iloc[:, :-1], data_h.iloc[:, [-1]]

# train : valid = 7 : 3 분할
X_h_train, X_h_valid, y_h_train, y_h_valid = train_test_split(X_h, y_h, test_size = 0.3, shuffle = False, random_state = 30)
print(X_h_train.shape, X_h_valid.shape)

(29030, 36) (12442, 36)


In [82]:
ss3 = StandardScaler()

X_h_train_scale = X_h_train.drop(['holiday'], axis = 1)
X_h_valid_scale = X_h_valid.drop(['holiday'], axis = 1)

X_h_train_scale = ss3.fit_transform(X_h_train_scale)
X_h_valid_scale = ss3.transform(X_h_valid_scale)

print(X_h_train_scale.shape, X_h_valid_scale.shape)

(29030, 35) (12442, 35)


In [None]:
'''
study3 = optuna.create_study(direction = 'minimize', sampler = TPESampler())

study3.optimize(lambda trial : objectiveXGB(trial, X_h_train_scale, y_h_train, X_h_valid_scale, y_h_valid), n_trials = 50)
print('Best trial: score {},\nparams {}'.format(study3.best_trial.value, study3.best_trial.params))
'''

In [None]:
'''
xgboost3 = XGBRegressor(random_state = 1,
                        **study3.best_trial.params)
'''

In [83]:
params3 = {'n_estimators': 2814, 'max_depth': 19, 'min_child_weight': 5, 'gamma': 0.09,
           'learning_rate': 0.010568097997401979, 'colsample_bytree': 0.8, 'subsample': 0.7,
           'lambda': 5.94489114614049, 'alpha': 0.19738241566039705}

In [84]:
xgboost3 = XGBRegressor(random_state = 1, tree_method = 'gpu_hist',
                        **params3)

In [85]:
xgboost3.fit(X_h_train_scale, y_h_train,
             eval_metric = 'mae', eval_set = [(X_h_valid_scale, y_h_valid)], early_stopping_rounds = 100,
             verbose = 100)

[0]	validation_0-mae:1.77785
[100]	validation_0-mae:0.67779
[200]	validation_0-mae:0.51229
[300]	validation_0-mae:0.50549
[360]	validation_0-mae:0.50938


In [86]:
# validation set에 대한 예측값으로 계산한 mae
y_pred = xgboost3.predict(X_h_valid_scale)
valid_mae = mean_absolute_error(y_h_valid, y_pred)
print('valid set에서의 mae :', valid_mae)

valid set에서의 mae : 0.5039922573539765


In [87]:
# 예측값을 반올림 한 뒤의 mae
y_pred = xgboost3.predict(X_h_valid_scale)
y_pred_round = y_pred.round(0)
valid_mae2 = mean_absolute_error(y_h_valid, y_pred_round)
print('valid set에서의 mae(반올림 한 뒤) :', valid_mae2)

valid set에서의 mae(반올림 한 뒤) : 0.4475968493811284


In [None]:
# 모델 저장
joblib.dump(xgboost3, 'model_3.pkl')

#### holiday = 0 모델링

In [88]:
data_n = train_n.iloc[:, 2:]
X_n, y_n = data_n.iloc[:, :-1], data_n.iloc[:, [-1]]

# train : valid = 7 : 3 분할
X_n_train, X_n_valid, y_n_train, y_n_valid = train_test_split(X_n, y_n, test_size = 0.3, shuffle = False, random_state = 30)
print(X_n_train.shape, X_n_valid.shape)

(67183, 36) (28793, 36)


In [89]:
ss4 = StandardScaler()

X_n_train_scale = X_n_train.drop(['holiday'], axis = 1)
X_n_valid_scale = X_n_valid.drop(['holiday'], axis = 1)

X_n_train_scale = ss4.fit_transform(X_n_train_scale)
X_n_valid_scale = ss4.transform(X_n_valid_scale)

print(X_n_train_scale.shape, X_n_valid_scale.shape)

(67183, 35) (28793, 35)


In [None]:
'''
study4 = optuna.create_study(direction = 'minimize', sampler = TPESampler())

study4.optimize(lambda trial : objectiveXGB(trial, X_n_train_scale, y_n_train, X_n_valid_scale, y_n_valid), n_trials = 50)
print('Best trial: score {},\nparams {}'.format(study4.best_trial.value, study4.best_trial.params))
'''

In [None]:
'''
xgboost4 = XGBRegressor(random_state = 1,
                        **study4.best_trial.params)
'''

In [90]:
params4 = {'n_estimators': 2944, 'max_depth': 35, 'min_child_weight': 5, 'gamma': 0.42000000000000004,
           'learning_rate': 0.009641077051022224, 'colsample_bytree': 1.0, 'subsample': 0.8,
           'lambda': 2.5880158011337313, 'alpha': 2.336338346210088}

In [91]:
xgboost4 = XGBRegressor(random_state = 1, tree_method = 'gpu_hist',
                        **params4)

In [92]:
xgboost4.fit(X_n_train_scale, y_n_train,
             eval_metric = 'mae', eval_set = [(X_n_valid_scale, y_n_valid)], early_stopping_rounds = 100,
             verbose = 100)

[0]	validation_0-mae:1.81069
[100]	validation_0-mae:0.74105
[200]	validation_0-mae:0.52242
[300]	validation_0-mae:0.49871
[400]	validation_0-mae:0.50005
[415]	validation_0-mae:0.50047


In [93]:
# validation set에 대한 예측값으로 계산한 mae
y_pred = xgboost4.predict(X_n_valid_scale)
valid_mae = mean_absolute_error(y_n_valid, y_pred)
print('valid set에서의 mae :', valid_mae)

valid set에서의 mae : 0.49835927376579703


In [94]:
# 예측값을 반올림 한 뒤의 mae
y_pred = xgboost4.predict(X_n_valid_scale)
y_pred_round = y_pred.round(0)
valid_mae2 = mean_absolute_error(y_n_valid, y_pred_round)
print('valid set에서의 mae(반올림 한 뒤) :', valid_mae2)

valid set에서의 mae(반올림 한 뒤) : 0.44139200500121556


In [None]:
# 모델 저장
joblib.dump(xgboost4, 'model_4.pkl')

# test 데이터셋 예측

## 모델별 예측

#### 모델 1.

In [None]:
label_sample = pd.read_csv('Q1_label_sample.csv')

StationB = test_df[test_df['ru_id'] == 'BaseStationB'].iloc[:, 2:]
StationJ = test_df[test_df['ru_id'] == 'BaseStationJ'].iloc[:, 2:]

B_scale = ss.transform(StationB)
B_pred = xgboost.predict(B_scale)

J_scale = ss.transform(StationJ)
J_pred = xgboost.predict(J_scale)

label_sample['BaseStationB'] = B_pred
label_sample['BaseStationJ'] = J_pred

In [None]:
label_sample_round = label_sample.round(0)
label_sample_round.to_csv('model1.csv', index = False)

#### 모델 2.

In [None]:
label_sample = pd.read_csv('Q1_label_sample.csv')

StationB = test_df2[test_df2['ru_id'] == 'BaseStationB'].iloc[:, 2:]
StationJ = test_df2[test_df2['ru_id'] == 'BaseStationJ'].iloc[:, 2:]

B_scale = ss2.transform(StationB)
B_pred = xgboost2.predict(B_scale)

J_scale = ss2.transform(StationJ)
J_pred = xgboost2.predict(J_scale)

label_sample['BaseStationB'] = B_pred
label_sample['BaseStationJ'] = J_pred

In [None]:
label_sample_round = label_sample.round(0)
label_sample_round.to_csv('model2.csv', index = False)

#### 모델 3.

In [None]:
# holiday = 1인 모델
StationB_h = B_h.iloc[:, 2:]
StationJ_h = J_h.iloc[:, 2:]

B_h_scale = StationB_h.drop(['holiday'], axis = 1)
J_h_scale = StationJ_h.drop(['holiday'], axis = 1)

B_h_scale = ss3.transform(B_h_scale)
J_h_scale = ss3.transform(J_h_scale)

B_h_pred = xgboost3.predict(B_h_scale)
J_h_pred = xgboost3.predict(J_h_scale)

StationB_h['BaseStationB'] = B_h_pred
StationJ_h['BaseStationJ'] = J_h_pred

In [None]:
# holiday = 0인 모델
StationB_n = B_n.iloc[:, 2:]
StationJ_n = J_n.iloc[:, 2:]

B_n_scale = StationB_n.drop(['holiday'], axis = 1)
J_n_scale = StationJ_n.drop(['holiday'], axis = 1)

B_n_scale = ss4.transform(B_n_scale)
J_n_scale = ss4.transform(J_n_scale)

B_n_pred = xgboost4.predict(B_n_scale)
J_n_pred = xgboost4.predict(J_n_scale)

StationB_n['BaseStationB'] = B_n_pred
StationJ_n['BaseStationJ'] = J_n_pred

In [None]:
B = pd.concat([StationB_h, StationB_n])
B = B.iloc[:, [-1]]
B = B.sort_index()

J = pd.concat([StationJ_h, StationJ_n])
J = J.iloc[:, [-1]]
J = J.sort_index()

In [None]:
prediction = pd.concat([B, J], axis = 1)
label_sample['BaseStationB'] = prediction['BaseStationB']
label_sample['BaseStationJ'] = prediction['BaseStationJ']
label_sample_round = label_sample.round(0)
label_sample_round.to_csv('model3.csv', index = False)

## 최종 예측값

모델 3개에서 예측한 값들을 voting을 통해 하나로 합침

In [None]:
model1 = pd.read_csv('model1.csv')
model2 = pd.read_csv('model2.csv')
model3 = pd.read_csv('model3.csv')

In [None]:
label_df = model1.iloc[:, [0]]

In [None]:
B = pd.concat([model1['BaseStationB'], model2['BaseStationB'], model3['BaseStationB']], axis = 1).T
J = pd.concat([model1['BaseStationJ'], model2['BaseStationJ'], model3['BaseStationJ']], axis = 1).T

In [None]:
# 하나의 datetime에 대한 여러 개의 예측값 중 가장 많은 개수를 가진 값을 최종 값으로 결정
B_pred = []
J_pred = []
for i in range(17181) :
    B_pred.append(B.iloc[:, i].value_counts().idxmax())
    J_pred.append(J.iloc[:, i].value_counts().idxmax())
label_df['BaseStationB'] = B_pred
label_df['BaseStationJ'] = J_pred

In [None]:
label_df.to_csv('predict_final.csv', index = False)