# 1. Data setting

In [1]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from IPython.display import clear_output 

from sklearn.model_selection import train_test_split
from tqdm import tqdm

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error

import torch
'''# font settings for graph visualization
plt.rc('font', family='NanumBarunGothic')
'''

##################### 0. Cuda setting

# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('USING pyTorch Version:', torch.__version__, ' Device:', DEVICE)


##################### 1. Data Load
# Re-run this section whenever the runtime has been disconnected.
_file_dir = '/content/drive/MyDrive/DACON_contest/DACON_JEJU'

# Every input file sits in the same folder, so each path is derived from it.
train_file_dir = _file_dir + '/train.parquet'
test_file_dir = _file_dir + '/test.parquet'
_save_basic_p_train = _file_dir + '/basic_p_train.pkl'
_save_basic_p_test = _file_dir + '/basic_p_test.pkl'

# Train/test tables (parquet) and the data_info sheet (csv).
train_data_org = pd.read_parquet(train_file_dir)
test_data_org = pd.read_parquet(test_file_dir)
info_data = pd.read_csv(_file_dir + '/data_info.csv')

# Frames restored from the pickle files stored next to the parquet inputs.
train_data = pd.read_pickle(_save_basic_p_train)
test_data = pd.read_pickle(_save_basic_p_test)

print(" ### 1. Data load complete ### ")

## This is the main code ##

##################### 2. set new columns for grouping road names
def _assign_road_group(frame):
    """Add a 'road_name_grouped' column to ``frame`` in place.

    Every row starts as 'extra'. Rows whose road_name contains one of the
    patterns below are then relabelled; the patterns are applied in order,
    so a later match overrides an earlier one.
    """
    names = frame['road_name']
    frame['road_name_grouped'] = 'extra'
    for pattern, group in (('일반국도', 'general'), ('지방도', 'country'), ('-', 'unlabelled')):
        frame.loc[names.str.contains(pattern), 'road_name_grouped'] = group


# Train data
_assign_road_group(train_data_org)

# Test data
_assign_road_group(test_data_org)

print(" ### 2. road name grouping complete ### ")

##################### 3. Feature Engineering

# Two interaction features, built the same way for train and test:
#   msl_lc : maximum_speed_limit minus 10 per lane
#   rr_msl : maximum_speed_limit plus -(road_rating - 100) * 2.5
for _frame in (train_data_org, test_data_org):
    _speed_limit = _frame['maximum_speed_limit']
    _frame['msl_lc'] = _speed_limit - (_frame['lane_count'] * 10)
    _frame['rr_msl'] = -(_frame['road_rating'] - 100) * 2.5 + _speed_limit


##################### 3. Split train data
# Per-group subsets, selected on the string labels of 'road_name_grouped'.
# Train
_train_group = train_data_org['road_name_grouped']
train_data_unlabelled = train_data_org.loc[_train_group.eq('unlabelled')]
train_data_country = train_data_org.loc[_train_group.eq('country')]
train_data_extra = train_data_org.loc[_train_group.eq('extra')]
train_data_general = train_data_org.loc[_train_group.eq('general')]

# Test
_test_group = test_data_org['road_name_grouped']
test_data_unlabelled = test_data_org.loc[_test_group.eq('unlabelled')]
test_data_country = test_data_org.loc[_test_group.eq('country')]
test_data_extra = test_data_org.loc[_test_group.eq('extra')]
test_data_general = test_data_org.loc[_test_group.eq('general')]

print(" ### 3. Data split by road-name-grouped complete ###")

# Label encoding - every code is shifted by +1 so that no category is encoded as 0.
str_col = ['day_of_week','start_turn_restricted','end_turn_restricted', 'start_node_name', 'end_node_name', 'road_name', 'road_name_grouped']
for col in str_col:
    # The encoder is fitted on the training values only.
    le = LabelEncoder().fit(train_data_org[col])
    train_data_org[col] = le.transform(train_data_org[col]) + 1

    # Values that occur only in the test frame are appended to the fitted
    # classes, so they receive codes after all training categories.
    unseen = [label for label in np.unique(test_data_org[col]) if label not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    test_data_org[col] = le.transform(test_data_org[col]) + 1

print("\tshape of train: {} \tshape of test: {}".format(train_data_org.shape, test_data_org.shape))
print(" ### 4. Feature Label encoding complete ### ")

##################### 5. Feature Engineering Plus (additional features)

# month, day of week and weekend flag
## Per the original author's note, to_datetime only worked on str (object)
## values here, so base_date is passed through str() before parsing.
for _frame in (train_data_org, test_data_org):
    _frame['base_date'] = pd.to_datetime(_frame['base_date'].map(str))
    _weekday = _frame['base_date'].dt.weekday

    _frame['Month'] = _frame['base_date'].dt.month
    _frame['Day_of_week'] = _weekday
    # weekdays and weekends: 1 when weekday is 5 or 6, otherwise 0
    _frame['IsWeekends'] = _weekday.isin([5, 6]).astype('int64')

# hour grouping (late night, morning commute, daytime, evening commute)
def _hour_group(hours):
    """Return the period code for each base_hour value as an int64 array.

    1 : hour >= 22 or hour < 6   (the range wraps past midnight, hence "or")
    2 : 6  <= hour < 11
    3 : 11 <= hour < 17
    4 : 17 <= hour < 22
    A value that satisfies none of the conditions gets 0.
    """
    hours = np.asarray(hours)
    conditions = [
        (hours >= 22) | (hours < 6),
        (hours >= 6) & (hours < 11),
        (hours >= 11) & (hours < 17),
        (hours >= 17) & (hours < 22),
    ]
    return np.select(conditions, [1, 2, 3, 4], default=0).astype('int64')


## train
train_data_org['Hour_grouped'] = _hour_group(train_data_org['base_hour'])

## test
test_data_org['Hour_grouped'] = _hour_group(test_data_org['base_hour'])

# distance from city
def _squared_offset(frame, end_point, ref_latitude, ref_longitude):
    """Squared offset of a road end point from a reference coordinate.

    Both coordinate differences are scaled by 60 * 1.85, squared and summed;
    no square root is taken, so the value is a squared distance. The result
    is rounded to 3 decimals.

    ``end_point`` is 'start' or 'end' and selects the
    '<end_point>_latitude' / '<end_point>_longitude' columns.
    """
    lat_term = ((frame[end_point + '_latitude'] - ref_latitude)*60*1.85)**2
    lon_term = ((frame[end_point + '_longitude'] - ref_longitude)*60*1.85)**2
    return round(lat_term + lon_term, 3)


# Reference coordinates; presumably the Jeju and Seogwipo city centres,
# judging by the column names -- TODO confirm.
_JEJU = (33.4996, 126.5312)
_SEOGWIPO = (33.2541, 126.5601)

## train, then test
for _frame in (train_data_org, test_data_org):
    _frame['Dist_from_JEJU_s'] = _squared_offset(_frame, 'start', *_JEJU)
    _frame['Dist_from_JEJU_e'] = _squared_offset(_frame, 'end', *_JEJU)
    _frame['Dist_from_Seogwipo_s'] = _squared_offset(_frame, 'start', *_SEOGWIPO)
    _frame['Dist_from_Seogwipo_e'] = _squared_offset(_frame, 'end', *_SEOGWIPO)

# Mean and median of the target per road name.
#
# The statistics are computed once with a groupby on the training frame and
# then mapped onto both frames. The previous per-road loop rescanned the whole
# training frame for every road name and took about 50 seconds according to
# the original note.
#
# road_name holds the label-encoded integer codes at this point. A test road
# that never occurs in train has no statistics and keeps NaN in both columns.
# groupby skips NaN targets when aggregating.
#
# NOTE(review): the statistics use every training row, including rows that are
# later held out by train_test_split, so validation scores can look better
# than they should -- confirm whether that is acceptable.
_road_target_stats = (
    train_data_org.groupby('road_name')['target']
    .agg(['mean', 'median'])
    .round(3)
)

for _frame in (train_data_org, test_data_org):
    _frame['target_avg'] = _frame['road_name'].map(_road_target_stats['mean'])
    _frame['target_med'] = _frame['road_name'].map(_road_target_stats['median'])


# 6. hour weight * start_longitude, and hour weight * offset from each city
for _frame in (train_data_org, test_data_org):
    # The weight is 100 when base_hour is exactly 0 or 4 and 30 for every other hour.
    # NOTE(review): only those two hours get the high weight; if the whole
    # 0-4 range was intended this needs changing -- confirm with the author.
    _hour_weight = np.where(_frame['base_hour'].isin([0, 4]), 100, 30)

    # hour weight * (start_longitude - 126)
    _frame['hour_mul_slong'] = (_frame['start_longitude'] - 126) * _hour_weight

    # hour weight * squared offset of the start point from each city
    _frame['hour_mul_dist_j'] = _frame['Dist_from_JEJU_s'] * _hour_weight
    _frame['hour_mul_dist_s'] = _frame['Dist_from_Seogwipo_s'] * _hour_weight


# lane_count squared * road_name (road_name is the label-encoded code here)
for _frame in (train_data_org, test_data_org):
    _frame['lanecount_mul_name'] = _frame['lane_count'].pow(2).mul(_frame['road_name'])

# Columns removed from both frames before modelling.
_dropped_cols = ['id', 'road_in_use', 'height_restricted', 'multi_linked', 'base_date', 'vehicle_restricted', 'connect_code']
train_data_org = train_data_org.drop(columns=_dropped_cols)
test_data_org = test_data_org.drop(columns=_dropped_cols)


# Work out which integer code the label encoder gave to each road group.
#
# The codes are read from the fitted encoder rather than matched against
# hard-coded row counts, so they stay correct when the data changes. After the
# label-encoding loop, `le` is the encoder of the last column in `str_col`.
# The guard below checks that this column is 'road_name_grouped'.
if str_col[-1] != 'road_name_grouped':
    raise RuntimeError("'road_name_grouped' must be the last entry of str_col so that `le` is its encoder")

# transform() returns the position of a label in classes_, and the script
# added 1 to every code, hence start=1.
_group_code_by_name = {str(name): code for code, name in enumerate(le.classes_, start=1)}

_missing_groups = [g for g in ('country', 'extra', 'general', 'unlabelled') if g not in _group_code_by_name]
if _missing_groups:
    raise ValueError("road groups missing from the training data: {}".format(_missing_groups))

# Row counts per code, kept for the log line below.
_road_group_list = [int((train_data_org['road_name_grouped'] == code).sum()) for code in range(1, 5)]
_len_1, _len_2, _len_3, _len_4 = _road_group_list

print("length of road name group 1~4: {}, {}, {}, {}".format(_len_1, _len_2, _len_3, _len_4))

for _name, _code in sorted(_group_code_by_name.items(), key=lambda item: item[1]):
    print("number {} is {}".format(_code, _name))

_country_idx = _group_code_by_name['country']
_extra_idx = _group_code_by_name['extra']
_general_idx = _group_code_by_name['general']
_unlabelled_idx = _group_code_by_name['unlabelled']

print(" ### 5. Feature Engineering (some features added) complete ### ")



USING pyTorch Version: 1.12.1+cu113  Device: cuda
 ### 1. Data load complete ### 
 ### 2. road name grouping complete ### 
 ### 3. Data split by road-name-grouped complete ###
	shape of train: (4701217, 27) 	shape of test: (291241, 26)
 ### 4. Feature Label encoding complete ### 
length of road name group 1~4: 698704, 1285567, 2147483, 569463
number 1 is country
number 2 is extra
number 3 is general
number 4 is unlabelled
 ### 5. Feature Engineering (some features added) complete ### 


# 2. Model training XGB

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

##### Road general

#1. general
# Optuna trial whose parameters (lambda/alpha rounded) are used below:
# Trial 0 finished with value: 3.0178409360571536 and parameters: 
# {'n_estimators': 1098, 'max_depth': 13, 'min_child_weight': 69, 'gamma': 2, 
# 'colsample_bytree': 0.8, 'lambda': 0.00013294769198806425, 'alpha': 0.018851716895628944, 'subsample': 0.6}. 

# Training rows of the 'general' road group only.
train_data_general_optuna = train_data_org.loc[train_data_org['road_name_grouped'].eq(_general_idx)]

Y_g = train_data_general_optuna['target']
X_g = train_data_general_optuna.drop(columns='target')

# 90/10 hold-out split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(X_g, Y_g, test_size=0.1, shuffle=True, random_state=11)
print('X_train shape: {}, \tX_valid shape: {},\ny_train shape: {}, \ty_valid shape: {}'.format(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape))

_general_params = dict(
    gpu_id=0, tree_method='gpu_hist', n_jobs=-1,
    n_estimators=1098, max_depth=13, min_child_weight=69, gamma=2,
    colsample_bytree=0.8, reg_lambda=0.000133, reg_alpha=0.0189,
    subsample=0.6, verbosity=1,
)
XGB_general = XGBRegressor(**_general_params)
XGB_general.fit(X_train, y_train)

# Hold-out MAE
valid_pred = XGB_general.predict(X_valid)
MAE = mean_absolute_error(y_valid, valid_pred)
print('y_valid prediction MAE of general idx is {}'.format(MAE))


X_train shape: (1932734, 33), 	X_valid shape: (214749, 33),
y_train shape: (1932734,), 	y_valid shape: (214749,)


In [2]:
##### Road country

# The model below uses the (rounded) parameters of this trial:
# Trial 1 finished with value: 2.881378921865521 and parameters: {'n_estimators': 1975, 'max_depth': 7, 'min_child_weight': 250, 
# 'gamma': 5, 'colsample_bytree': 1.0, 'lambda': 0.014419524884176096, 'alpha': 1.1394561107249395, 'subsample': 1.0}. Best is trial 0 with value: 2.748566196751375.

# 2. country
# Trial 3 finished with value: 2.719350625520355 and parameters: 
# {'n_estimators': 3190, 'max_depth': 14, 'min_child_weight': 18, 'gamma': 3, 'colsample_bytree': 0.6, 'lambda': 0.005825596412374463, 'alpha': 0.1807523789855458, 'subsample': 0.8}. 
# Best is trial 3 with value: 2.719350625520355.
# Author's notes: this is a very attractive score, but XGBoost seems slow to train.
# Tuning LightGBM instead was the plan, but the tuning took too long,
# so the idea was to train with the tuned XGB numbers and submit -- and it ended up like this.
# Maybe XGBoost is not actually using the GPU?

# Training rows of the 'country' road group only.
train_data_country_optuna = train_data_org.loc[train_data_org['road_name_grouped'].eq(_country_idx)]

Y_c = train_data_country_optuna['target']
X_c = train_data_country_optuna.drop(columns='target')

# 90/10 hold-out split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(X_c, Y_c, test_size=0.1, shuffle=True, random_state=11)
print('X_train shape: {}, \tX_valid shape: {},\ny_train shape: {}, \ty_valid shape: {}'.format(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape))

_country_params = dict(
    gpu_id=0, tree_method='gpu_hist', n_jobs=-1,
    n_estimators=1975, max_depth=7, min_child_weight=250, gamma=5,
    colsample_bytree=1.0, reg_lambda=0.01442, reg_alpha=1.13946,
    subsample=1.0, verbosity=2,
)
XGB_country = XGBRegressor(**_country_params)
XGB_country.fit(X_train, y_train)

# Hold-out MAE
valid_pred = XGB_country.predict(X_valid)

MAE = mean_absolute_error(y_valid, valid_pred)
print('y_valid prediction MAE of country idx is {}'.format(MAE))

X_train shape: (628833, 33), 	X_valid shape: (69871, 33),
y_train shape: (628833,), 	y_valid shape: (69871,)
y_valid prediction MAE of country idx is 2.768038034647235


In [3]:
##### Road Extra

# 3. extra
# Author's note: plain LightGBM scored roughly 3.8 here.
# Optuna trial whose parameters (lambda/alpha rounded) are used below:
# Best trial: score 3.3106561411088897,
# params {'n_estimators': 2573, 'max_depth': 13, 'min_child_weight': 142, 'gamma': 1, 
# 'colsample_bytree': 0.6, 'lambda': 0.002211116337529265, 'alpha': 5.5786841254307905e-05, 'subsample': 0.8}

# Training rows of the 'extra' road group only.
train_data_extra_optuna = train_data_org.loc[train_data_org['road_name_grouped'].eq(_extra_idx)]

Y_e = train_data_extra_optuna['target']
X_e = train_data_extra_optuna.drop(columns='target')

# 90/10 hold-out split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(X_e, Y_e, test_size=0.1, shuffle=True, random_state=11)
print('X_train shape: {}, \tX_valid shape: {},\ny_train shape: {}, \ty_valid shape: {}'.format(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape))

_extra_params = dict(
    gpu_id=0, tree_method='gpu_hist', n_jobs=-1,
    n_estimators=2573, max_depth=13, min_child_weight=142, gamma=1,
    colsample_bytree=0.6, reg_lambda=0.00221, reg_alpha=5.5787e-05,
    subsample=0.8, verbosity=1,
)
XGB_extra = XGBRegressor(**_extra_params)
XGB_extra.fit(X_train, y_train)

# Hold-out MAE
valid_pred = XGB_extra.predict(X_valid)

MAE = mean_absolute_error(y_valid, valid_pred)
print('y_valid prediction MAE of extra idx is {}'.format(MAE))

X_train shape: (1157010, 33), 	X_valid shape: (128557, 33),
y_train shape: (1157010,), 	y_valid shape: (128557,)


KeyboardInterrupt: ignored

In [4]:
##### Road Unlabelled

# 4. unlabelled
# Optuna trial whose parameters (lambda/alpha rounded) are used below:
# Best trial: score 2.9646844485918584,
# params {'n_estimators': 3065, 'max_depth': 12, 'min_child_weight': 74, 'gamma': 1, 
# 'colsample_bytree': 1.0, 'lambda': 1.613367056357903e-05, 'alpha': 6.689301433489457, 'subsample': 1.0}

# Training rows of the 'unlabelled' road group only.
train_data_unlabelled_optuna = train_data_org.loc[train_data_org['road_name_grouped'].eq(_unlabelled_idx)]

Y_u = train_data_unlabelled_optuna['target']
X_u = train_data_unlabelled_optuna.drop(columns='target')

# 90/10 hold-out split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(X_u, Y_u, test_size=0.1, shuffle=True, random_state=11)
print('X_train shape: {}, \tX_valid shape: {},\ny_train shape: {}, \ty_valid shape: {}'.format(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape))

_unlabelled_params = dict(
    gpu_id=0, tree_method='gpu_hist', n_jobs=-1,
    n_estimators=3065, max_depth=12, min_child_weight=74, gamma=1,
    colsample_bytree=1.0, reg_lambda=1.613e-05, reg_alpha=6.6893,
    subsample=1.0,
)
XGB_unlabelled = XGBRegressor(**_unlabelled_params)
XGB_unlabelled.fit(X_train, y_train)

# Hold-out MAE
valid_pred = XGB_unlabelled.predict(X_valid)

MAE = mean_absolute_error(y_valid, valid_pred)
print('y_valid prediction MAE of unlabelled idx is {}'.format(MAE))

X_train shape: (512516, 33), 	X_valid shape: (56947, 33),
y_train shape: (512516,), 	y_valid shape: (56947,)


KeyboardInterrupt: ignored

In [6]:
# Submission: split the test rows by road group and keep their index labels.
_test_group_code = test_data_org['road_name_grouped']

# Test data - rows of each road group
test_data_org_general = test_data_org.loc[_test_group_code.eq(_general_idx)]
test_data_org_country = test_data_org.loc[_test_group_code.eq(_country_idx)]
test_data_org_extra = test_data_org.loc[_test_group_code.eq(_extra_idx)]
test_data_org_unlabelled = test_data_org.loc[_test_group_code.eq(_unlabelled_idx)]

# Index labels of each group, used to write the predictions into the matching submission rows.
test_data_org_general_idx = test_data_org_general.index.tolist()
test_data_org_country_idx = test_data_org_country.index.tolist()
test_data_org_extra_idx = test_data_org_extra.index.tolist()
test_data_org_unlabelled_idx = test_data_org_unlabelled.index.tolist()

# Predict each group with the model trained on that group.
submit_pred_general = XGB_general.predict(test_data_org_general)
submit_pred_country = XGB_country.predict(test_data_org_country)
submit_pred_extra = XGB_extra.predict(test_data_org_extra)
submit_pred_unlabelled = XGB_unlabelled.predict(test_data_org_unlabelled)

# Overwrite the target column of the sample submission.
# Assumes the submission rows share the test frame's index labels -- TODO confirm.
sample_dir = '/content/drive/MyDrive/DACON_contest/DACON_JEJU' + '/sample_submission.csv'

sample_submission = pd.read_csv(sample_dir)
print("submission org length: ", len(sample_submission))
for _row_labels, _predictions in (
    (test_data_org_general_idx, submit_pred_general),
    (test_data_org_country_idx, submit_pred_country),
    (test_data_org_extra_idx, submit_pred_extra),
    (test_data_org_unlabelled_idx, submit_pred_unlabelled),
):
    sample_submission.loc[_row_labels, 'target'] = _predictions

# Save the filled-in submission file.
sample_submission.to_csv("/content/drive/MyDrive/DACON_contest/DACON_JEJU/submit_1113_xgb_optuna_1.csv", index = False)

submission length:  291241


In [None]:
### Author's note: if time permits, this model should be trained and saved as well.
##### Road country

# Second model for the 'country' road group, with a different parameter set.
train_data_country_optuna = train_data_org.loc[train_data_org['road_name_grouped'].eq(_country_idx)]

Y_c2 = train_data_country_optuna['target']
X_c2 = train_data_country_optuna.drop(columns='target')

# 90/10 hold-out split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(X_c2, Y_c2, test_size=0.1, shuffle=True, random_state=11)
print('X_train shape: {}, \tX_valid shape: {},\ny_train shape: {}, \ty_valid shape: {}'.format(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape))

_country_2_params = dict(
    gpu_id=0, tree_method='gpu_hist', n_jobs=-1,
    n_estimators=3190, max_depth=14, min_child_weight=18, gamma=3,
    colsample_bytree=0.6, reg_lambda=0.005826, reg_alpha=0.18075,
    subsample=0.8,
)
XGB_country_2 = XGBRegressor(**_country_2_params)
XGB_country_2.fit(X_train, y_train)

# Hold-out MAE
valid_pred = XGB_country_2.predict(X_valid)

MAE = mean_absolute_error(y_valid, valid_pred)
print('y_valid prediction MAE of country2 idx is {}'.format(MAE))