# 패키지 설치

In [None]:
# Install Optuna straight from its GitHub repository
!pip install --quiet --no-cache-dir git+https://github.com/optuna/optuna

# Install the GPU version of XGBoost (remove any existing package, then install again)
!pip uninstall --quiet -y xgboost
!pip install --quiet xgboost

# Install the GPU version of LightGBM: clone the repository, build it with
# CMake using -DUSE_GPU=1, then install the Python package from that build
! git clone --recursive https://github.com/Microsoft/LightGBM
! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for UNKNOWN (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCloning into 'LightGBM'...
remote: Enumerating objects: 29207, done.[K
remote: Counting objects: 100% (3082/3082), done.[K
remote: Compressing objects: 100% (277/277), done.[K
remote: Total 29207 (delta 2942), reused 2841 (delta 2805), pack-reused 26125[K
Receiving objects: 100% (29207/29207), 20.38 MiB | 27.83 MiB/s, done.
Resolving deltas: 100% (21708/21708), done.
Submodule 'include/boost/compute' (https://github.com/boostorg/compute) registered for path 'external_libs/compute'
Submodule 'eigen' (https

In [None]:
# Install the haversine package (imported later as `from haversine import haversine`)
!pip install haversine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting haversine
  Downloading haversine-2.8.0-py2.py3-none-any.whl (7.7 kB)
Installing collected packages: haversine
Successfully installed haversine-2.8.0


# Data 불러오기

In [None]:
import gc

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from google.colab import drive
from haversine import haversine
from lightgbm import LGBMRegressor
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

%matplotlib inline
# Mount Google Drive; the competition files are read from PATH below
drive.mount('/content/drive')
# Drive folder that holds train.csv, test.csv and sample_submission.csv
PATH = "/content/drive/MyDrive/KUBIG/콘테스트"


def csv_to_parquet(csv_path, save_name):
    """Convert one CSV file into ``./<save_name>.parquet``.

    Parameters
    ----------
    csv_path : path of the CSV file to read.
    save_name : base name (no extension) of the parquet file written to the
        current working directory.

    The intermediate DataFrame is not kept; a garbage-collection pass is
    forced afterwards and a short confirmation line is printed.
    """
    parquet_path = f'./{save_name}.parquet'
    # The frame only lives for the duration of this statement.
    pd.read_csv(csv_path).to_parquet(parquet_path)
    gc.collect()
    print(save_name, 'Done.')


# Convert both competition CSVs to parquet, then work from the parquet copies.
for split_name in ('train', 'test'):
    csv_to_parquet(f'{PATH}/{split_name}.csv', split_name)


train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')

Mounted at /content/drive
train Done.
test Done.


# Preprocessing

In [None]:
# Install Optuna from PyPI (the first cell also installs it from GitHub)
!pip install optuna

In [None]:
# Reload the parquet copies so this cell always starts from the raw columns.
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')

######## Dates ########
# Every step is applied to train and test through the same loop so that a
# test column can never be derived from a train column by a copy-paste slip
# (the original built test['date'] from train['base_date']).
for frame in (train, test):
    ## id, height_restricted and vehicle_restricted are dropped
    ## (noted as being 0 across the whole data set, i.e. uninformative)
    frame.drop(columns=['id', 'height_restricted', 'vehicle_restricted'], inplace=True)

    ## base_date -> datetime column
    frame['date'] = pd.to_datetime(frame['base_date'], format='%Y%m%d')

    ## year / month / day columns derived from the parsed date
    frame['year'] = frame['date'].dt.year
    frame['month'] = frame['date'].dt.month
    frame['day'] = frame['date'].dt.day

    ## 'YYYYMM' string column (key for the monthly island-arrival figures)
    frame['year_month'] = frame['base_date'].astype(str).str[:6]


## Season column derived from the month (written as a function for use with .apply)
def seasons(i):
    """Return the season label for month number ``i``.

    3-5 -> 'spring', 6-8 -> 'summer', 9-11 -> 'fall'; every other value
    (including 12, 1 and 2) -> 'winter'.
    """
    season_months = (
        ('spring', (3, 4, 5)),
        ('summer', (6, 7, 8)),
        ('fall', (9, 10, 11)),
    )
    for label, months in season_months:
        if i in months:
            return label
    return 'winter'

# Add the season label to both frames.
for frame in (train, test):
    frame['season'] = frame['month'].apply(seasons)



## Public holidays of 2021 and 2022 together with the day before/after,
## as 'YYYYMMDD' strings.
## Every entry must be followed by a comma: Python silently concatenates
## adjacent string literals, so a missing comma merges two dates into one
## 16-character string that can never match a real base_date.
holidays = ['20210101', '20210102',
            '20210210', '20210211', '20210212', '20210213', '20210214',
            '20210228', '20210301', '20210302',
            '20210504', '20210505', '20210506',
            '20210518', '20210519', '20210520',
            # was '20210607' twice; the day before 20210606 is 20210605
            '20210605', '20210606', '20210607',
            '20210814', '20210815', '20210816', '20210817',
            '20210919', '20210920', '20210921', '20210922', '20210923',
            '20211002', '20211003', '20211004', '20211005',
            '20211008', '20211009', '20211010', '20211011', '20211012',
            '20211226', '20211225', '20211227',

            '20211231', '20220101', '20220102',
            '20220130', '20220131', '20220201', '20220202', '20220203',
            '20220228', '20220301', '20220302',
            '20220308', '20220309', '20220310',
            '20220504', '20220505', '20220506',
            '20220507', '20220508', '20220509',
            '20220531', '20220601', '20220602',
            '20220814', '20220815', '20220816',
            '20220908', '20220909', '20220910', '20220911', '20220912', '20220913',
            '20221002', '20221003', '20221004',
            '20221008', '20221009', '20221010', '20221011',
            '20221224', '20221225', '20221226']

# Boolean flag: does base_date (as a 'YYYYMMDD' string) appear in `holidays`?
for frame in (train, test):
    frame['holiday'] = frame['base_date'].astype(str).isin(holidays)


## Vacation season: the months treated as holiday season
vacation = [12, 1, 2, 6, 7, 8]

def vacations(i):
    """Return 'yes' when month ``i`` is listed in ``vacation``, otherwise 'no'."""
    return 'yes' if i in vacation else 'no'

# Vacation-season flag for both frames.
for frame in (train, test):
    frame['vacation'] = frame['month'].apply(vacations)


######## Day of week ########
## Weekday name -> number (the week starts on Sunday)
days = {'일':1, '월':2, '화':3, '수':4, '목':5, '금':6, '토':7}

for frame in (train, test):
    frame['day_of_week'] = frame['day_of_week'].replace(days)


## Is it a weekend?
def weekends(i):
    """Return 'yes' for weekday codes 1 and 7, otherwise 'no'."""
    return 'yes' if i in (1, 7) else 'no'

# Weekend flag for both frames (reads the numeric day_of_week codes).
for frame in (train, test):
    frame['weekend'] = frame['day_of_week'].apply(weekends)






######## Time of day ########
## Split the day into morning / noon / night / dawn (6-hour buckets)
def times(i):
    """Return the 6-hour bucket label for hour ``i``.

    6-11 -> 'morning', 12-17 -> 'noon', 18-23 -> 'night';
    anything else (0-5 included) -> 'dawn'.
    """
    if i in range(6, 12):
        return 'morning'
    if i in range(12, 18):
        return 'noon'
    if i in range(18, 24):
        return 'night'
    return 'dawn'

# Time-of-day bucket for both frames.
for frame in (train, test):
    frame['time'] = frame['base_hour'].apply(times)


## School/work commute hours; weekend rows are all treated as non-rush-hour
## by the code that applies this function
def rush_hours(i):
    """Return 'yes' when hour ``i`` is one of 7, 8, 9, 17, 18, 19, else 'no'."""
    return 'yes' if i in (7, 8, 9, 17, 18, 19) else 'no'

# rush_hour is 'yes' only on weekdays during commute hours; weekend rows are
# always 'no'. Each frame is masked with its OWN weekend column: the original
# masked `test` with train['weekend'], which aligns on the row index and so
# applied unrelated train rows' weekend flags to the test rows.
for frame in (train, test):
    is_weekday = frame['weekend'] == 'no'
    frame['rush_hour'] = frame['base_hour'].apply(rush_hours).where(is_weekday, 'no')








######## Road-related data ########
## Drop multi_linked, connect_code, start_node_name and end_node_name
train.drop(columns=['multi_linked', 'connect_code', 'start_node_name', 'end_node_name'], inplace=True)
test.drop(columns=['multi_linked', 'connect_code', 'start_node_name', 'end_node_name'], inplace=True)


## maximum_speed_limit: optionally remove rows whose target exceeds the speed
## limit (can be left out if it hurts performance)
#train[(train['maximum_speed_limit']<train['target'])].head()
#print(len(train[(train['maximum_speed_limit']<train['target'])]) / len(train)) # would drop roughly 9.7% of the data
#train = train.loc[train['maximum_speed_limit']>=train['target']]


## weight_restricted: show the distinct values before turning them into categories
print(train['weight_restricted'].unique())
print(test['weight_restricted'].unique())


## 1. One category per weight limit
weights = {0:0, 32400:1, 43200:2, 50000:3}
train['weight_restricted'] = train['weight_restricted'].astype(int).replace(weights)
test['weight_restricted'] = test['weight_restricted'].astype(int).replace(weights)


## 2. Alternative: group weight limits with a similar target (use instead of option 1)
# weights = {0:0, 32400:1, 50000:1, 43200:2}
# train['weight_restricted'] = train['weight_restricted'].replace(weights)
# test['weight_restricted'] = test['weight_restricted'].replace(weights)


## 3. Combine road_rating and road_type into one categorical key
##    (not needed if feeding the two columns separately works better)
train['road_type_rating'] = train['road_rating'].astype(str)+'_'+train['road_type'].astype(str)
test['road_type_rating'] = test['road_rating'].astype(str)+'_'+test['road_type'].astype(str)


## Drop road_type and road_rating (uncomment if you decide they are redundant)
# train.drop(columns=['road_rating', 'road_type'], inplace=True)
# test.drop(columns=['road_rating', 'road_type'], inplace=True)


## Encode start_turn_restricted and end_turn_restricted as 0/1
yes_no = {'없음':0, '있음':1}
train['start_turn_restricted'] = train['start_turn_restricted'].replace(yes_no)
test['start_turn_restricted'] = test['start_turn_restricted'].replace(yes_no)
train['end_turn_restricted'] = train['end_turn_restricted'].replace(yes_no)
test['end_turn_restricted'] = test['end_turn_restricted'].replace(yes_no)


## turn_restricted: sum of the two flags (0, 1 or 2)
train['turn_restricted'] = train['start_turn_restricted'] + train['end_turn_restricted']
# Both operands must come from `test`. The original added train['end_turn_restricted'],
# which pandas aligns by row index, so test rows received unrelated train values.
test['turn_restricted'] = test['start_turn_restricted'] + test['end_turn_restricted']


## start_turn_restricted와 end_turn_restricted drop (필요시 주석 해제 후 사용)
# train.drop(columns=['start_turn_restricted', 'end_turn_restricted'], inplace=True)
# test.drop(columns=['start_turn_restricted', 'end_turn_restricted'], inplace=True)


## 2. Distance helper
def distance(x):
    """Return a list with the haversine distance (unit='km') between each
    row's (start_latitude, start_longitude) and (end_latitude, end_longitude).

    ``x`` must provide those four columns; the list follows the row order.
    """
    coordinates = zip(x['start_latitude'], x['start_longitude'],
                      x['end_latitude'], x['end_longitude'])
    return [
        haversine((s_lat, s_lon), (e_lat, e_lon), unit='km')
        for s_lat, s_lon, e_lat, e_lon in coordinates
    ]

for frame in (train, test):
    frame['road_distance'] = distance(frame)


## 3. Area clustering
def make_cluster(x, reference=None):
    """Assign each row of ``x`` to one of 6 K-Means clusters of start coordinates.

    Parameters
    ----------
    x : frame with 'start_latitude' and 'start_longitude' columns.
    reference : optional frame with the same two columns. When given, the
        clusters are fitted on ``reference`` and ``x`` is only assigned to
        them; when omitted the clusters are fitted on ``x`` itself (the
        original behaviour).

    Returns
    -------
    Array of cluster ids, one per row of ``x``.

    K-Means cluster ids are arbitrary per fit, so two independent fits do not
    produce comparable ids. Pass ``reference=train`` for the test frame so
    that a given id means the same area in both frames.
    """
    coord_cols = ['start_latitude', 'start_longitude']
    k_mean = KMeans(n_clusters=6, max_iter=1000, random_state = 31)
    if reference is None:
        return k_mean.fit_predict(x[coord_cols])
    return k_mean.fit(reference[coord_cols]).predict(x[coord_cols])

train['location_cluster'] = make_cluster(train)
# Re-use the train clustering for test instead of fitting a second,
# unrelated set of clusters on the test coordinates.
test['location_cluster'] = make_cluster(test, reference=train)


## road_name contains '-' placeholders for missing names, so identify a road
## by its start/end latitude and longitude instead
for frame in (train, test):
    coord_text = [frame[col].astype(str) for col in
                  ('start_latitude', 'start_longitude', 'end_latitude', 'end_longitude')]
    frame['road_lat_long'] = coord_text[0]+'_'+coord_text[1]+'_'+coord_text[2]+'_'+coord_text[3]



# Label encoding

# Each column is listed once ('year_month' used to appear twice, which ran a
# second, redundant encoding pass over the already-encoded integers).
str_col = ['road_name', 'year_month', 'season', 'vacation', 'weekend', 'time', 'rush_hour', 'road_type_rating', 'road_lat_long', 'holiday']
for i in str_col:
    # Fit on train only, then encode train.
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])

    # Labels that occur only in test are appended to the known classes so
    # that transform() does not raise; they get codes after the train codes.
    for label in np.unique(test[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])



## Mean target speed per road (identified by its coordinates) by hour and by weekday.
## For each (key column, grouping column) pair: build the 'road_<x>' key in both
## frames, average the train target per key, and left-join that mean onto both frames.
for key_col, by_col in (('road_hour', 'base_hour'), ('road_day', 'day_of_week')):
    train[key_col] = train['road_lat_long'].astype(str)+'_'+train[by_col].astype(str)
    test[key_col] = test['road_lat_long'].astype(str)+'_'+test[by_col].astype(str)

    hour_mean = train.groupby([key_col])[['target']].agg('mean').reset_index()
    hour_mean.columns = [key_col, key_col + '_mean']

    # Keys that never occur in train leave NaN in the test column.
    train = train.merge(hour_mean, how='left', on=key_col)
    test = test.merge(hour_mean, how='left', on=key_col)


## 이상치 제거 함수 - 이상치를 제거해야할 column을 못찾겠어서 사용하지 않았으나 필요시 사용해볼것
# def outlier_detect(cols, df) :
#     for col in cols :
#         Q1 = df[col].quantile(0.25)
#         Q3 = df[col].quantile(0.75)
#         iqr = Q3 - Q1
#         df = df[(df[col] <= 1.5 * Q3) & (df[col] >= 1.5 * Q1)]
#         df = df.reset_index(drop=True)
#     return df


## 이상치를 제거해야하는 columns
# remove_outliers_cols = []








######## 외부 데이터 ########
## Monthly tourist arrivals to the island (inflow column)
#inflow_train = pd.DataFrame({'year_month':['202109', '202110', '202111', '202112', '202201', '202202',
#       '202203', '202205', '202206', '202207'],
#       'inflow':[872396, 1222094, 1204344, 1090607, 1170802, 1029503, 873086, 1306537, 1283470, 1263332]})
#inflow_test = pd.DataFrame({'year_month' : '202208', 'inflow': [1281608]})

#train = train.merge(inflow_train, how='left', on='year_month')
#test = test.merge(inflow_test, how='left', on='year_month')

[32400.     0. 43200. 50000.]
[    0. 43200. 32400. 50000.]


In [None]:
# Fix the column order of both frames; train additionally keeps 'target' last.
feature_cols = [
    'base_date', 'day_of_week', 'base_hour', 'lane_count', 'road_rating',
    'road_name', 'maximum_speed_limit', 'weight_restricted', 'road_type',
    'start_latitude', 'start_longitude', 'start_turn_restricted',
    'end_latitude', 'end_longitude', 'end_turn_restricted', 'date',
    'year', 'month', 'day', 'year_month', 'season', 'holiday',
    'vacation', 'weekend', 'time', 'rush_hour', 'road_type_rating',
    'turn_restricted', 'road_distance', 'location_cluster', 'road_lat_long',
    'road_hour', 'road_hour_mean', 'road_day', 'road_day_mean',
]

train = train[feature_cols + ['target']]
test = test[feature_cols]

# LGBM2

In [None]:
from lightgbm import LGBMRegressor

# Baseline LightGBM with 10-fold CV.
# NOTE: X, y and target are created by the `X = train.drop(['target'], ...)`
# cell further down in the notebook; that cell has to be run first.
skf = StratifiedKFold(n_splits = 10, random_state = 2023, shuffle = True)

lgbm_param1 = {
    'objective' : 'regression',
    'device' : 'gpu',
    'metric' : 'mae'
}


# Fold-averaged prediction for the test rows and the per-fold validation MAE.
lgbm_pred1 = np.zeros(target.shape[0])
lgbm_mae = []

for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)):

    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    lgbm = LGBMRegressor(**lgbm_param1)
    lgbm.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], eval_metric = 'mae')

    # astype(int) truncates the predictions before scoring (kept as in the original).
    val_pred = lgbm.predict(val_x).astype(int)
    fold_mae = mean_absolute_error(val_y, val_pred)
    lgbm_mae.append(fold_mae)
    print(f"{i + 1} Fold MAE = {fold_mae}")

    # Accumulate into the LightGBM buffers; the original wrote to xgb_pred and
    # averaged xgb_mae, i.e. the XGBoost cell's variables.
    fold_pred = lgbm.predict(target) / skf.n_splits
    lgbm_pred1 += fold_pred

print(f"\nAVG of MAE = {np.mean(lgbm_mae)}")

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2143
[LightGBM] [Info] Number of data points in the train set: 4231095, number of used features: 25
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 21 dense feature groups (96.84 MB) transferred to GPU in 0.140096 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 42.788469
1 Fold MAE = 3.415904807688217
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2142
[LightGBM] [Info] Number of data points in the train set: 4231095, number of used features: 25
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8

In [None]:
import optuna
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split



def objective(trial,data=X,target=y):
    """Optuna objective for LightGBM: validation MAE of one sampled configuration.

    Parameters
    ----------
    trial : optuna trial used to sample the hyper-parameters.
    data : feature frame (defaults to the global ``X`` at definition time).
    target : target values aligned with ``data`` (defaults to the global ``y``).

    Returns
    -------
    Mean absolute error on a fixed 20% hold-out split (random_state=2023).
    """
    # Split the arguments rather than the globals so callers can pass their own data.
    X_train, X_valid, y_train, y_valid = train_test_split(data, target, test_size=0.2 ,random_state=2023)
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform.
    param = {
        'device' : 'gpu',
        'objective': 'regression',
        'verbose': -1,
        'metric': 'mae',
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=1, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'max_depth': trial.suggest_int('max_depth',3, 15),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # NOTE(review): no subsample_freq is set here; check the LightGBM docs
        # on whether subsample has any effect without it.
        'subsample': trial.suggest_float('subsample', 0.4, 1, log=True)
    }

    # Fit one model with the sampled parameters and score it on the hold-out part.
    model = LGBMRegressor(**param)
    model.fit(X_train,y_train,eval_set=[(X_valid,y_valid)], eval_metric = 'mae')

    preds = model.predict(X_valid)

    mae = mean_absolute_error(y_valid, preds)

    return mae

In [None]:
from optuna.samplers import TPESampler

# Seeded TPE sampler so the hyper-parameter search is repeatable.
sampler = TPESampler(seed=2023)
study = optuna.create_study(study_name="lgbm_parameter_opt", direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=10)

best_trial = study.best_trial
print("Best Score:", best_trial.value)
print("Best trial:", best_trial.params)

[32m[I 2023-03-01 10:48:13,749][0m A new study created in memory with name: lgbm_parameter_opt[0m
[32m[I 2023-03-01 10:51:00,994][0m Trial 0 finished with value: 3.2443368268137784 and parameters: {'num_leaves': 12, 'colsample_bytree': 0.9671267355368443, 'reg_alpha': 0.5880522554504738, 'reg_lambda': 1.2659609350429124, 'max_depth': 4, 'n_estimators': 1457, 'min_child_samples': 7, 'subsample': 0.7788820527269581}. Best is trial 0 with value: 3.2443368268137784.[0m
[32m[I 2023-03-01 10:52:16,072][0m Trial 1 finished with value: 3.1789818199687114 and parameters: {'num_leaves': 46, 'colsample_bytree': 0.8634805716431471, 'reg_alpha': 0.456373260425773, 'reg_lambda': 5.013822646640289, 'max_depth': 8, 'n_estimators': 538, 'min_child_samples': 39, 'subsample': 0.4640416013784816}. Best is trial 1 with value: 3.1789818199687114.[0m
[32m[I 2023-03-01 10:53:26,775][0m Trial 2 finished with value: 3.2735497591313423 and parameters: {'num_leaves': 14, 'colsample_bytree': 0.754096984

Best Score: 2.9920208906812245
Best trial: {'num_leaves': 114, 'colsample_bytree': 0.7654675794588895, 'reg_alpha': 0.1871725370949876, 'reg_lambda': 7.297792387589718, 'max_depth': 14, 'n_estimators': 1236, 'min_child_samples': 15, 'subsample': 0.9232009658684728}


In [None]:
sample_submission = pd.read_csv(PATH+"/sample_submission.csv")
# Start from zero so the fold average does not depend on whatever values the
# template file happens to contain in its target column.
sample_submission['target'] = 0.0


# Best Optuna trial plus a small learning rate.
params =  {'objective' : 'regression', 'device' : 'gpu', 'metric' : 'mae', 'num_leaves': 114, 'colsample_bytree': 0.7654675794588895,
           'reg_alpha': 0.1871725370949876, 'reg_lambda': 7.297792387589718, 'max_depth': 14, 'n_estimators': 1236, 'min_child_samples': 15,
           'subsample': 0.9232009658684728, 'learning_rate' : 0.015
}


# Single source of truth for the fold count (it was hard-coded as 9 in four places).
N_FOLDS = 9
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=2023)

folds = list(skf.split(X, y))

# One fitted model per fold, keyed by fold index.
lgbm_model = {}

for f, (train_idx, val_idx) in enumerate(folds):
    print(
        f'===================================={f+1}============================================')

    x_train, x_val, y_train, y_val = X.iloc[train_idx], X.iloc[val_idx], y.iloc[train_idx], y.iloc[val_idx]

    lgbm = LGBMRegressor(**params)
    lgbm.fit(x_train, y_train)

    y_pred = lgbm.predict(x_val)
    mae = mean_absolute_error(y_val, y_pred)
    print(f"{f + 1} Fold MAE = {mae}")
    lgbm_model[f] = lgbm
    print(f'================================================================================\n\n')


# Average the fold models' test predictions. Selecting X.columns guarantees
# the test features match the training features in name and order.
test_features = test[X.columns]
for fold in range(N_FOLDS):
    sample_submission['target'] += lgbm_model[fold].predict(test_features)/N_FOLDS

sample_submission.to_csv(PATH+"/sample_submission_7.csv", index = False)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2141
[LightGBM] [Info] Number of data points in the train set: 4178859, number of used features: 25
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 21 dense feature groups (95.65 MB) transferred to GPU in 0.092933 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 42.788459
1 Fold MAE = 3.1608015203876887


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2140
[LightGBM] [Info] Number of data points in the train set: 4178859, number of used features: 25
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry

In [None]:
# Reload sample_submission_7.csv, the file written by the LightGBM k-fold cell
sub4 = pd.read_csv(PATH+"/sample_submission_7.csv")

In [None]:
# Round the averaged predictions to whole numbers
sub4['target'] = np.round(sub4['target'])

In [None]:
# Copy the rounded targets into a fresh submission template and save it
sample_submission = pd.read_csv(PATH+"/sample_submission.csv")
sample_submission['target'] = sub4['target']
sample_submission.to_csv(PATH+"/sample_submission_fin4.csv", index = False)

# XGBoost2

In [None]:
# Columns left out of the model inputs for this section.
drop_cols = ['base_date', 'date', 'year_month', 'road_type', 'road_rating', 'road_hour', 'road_day',
             'start_turn_restricted', 'end_turn_restricted', 'road_name']
# errors='ignore' keeps the cell re-runnable: a second run, or running it after
# the other section's drop cell, no longer raises KeyError for columns that
# are already gone.
train = train.drop(columns=drop_cols, errors='ignore')
test = test.drop(columns=drop_cols, errors='ignore')

In [None]:
# y: value to predict; X: every other train column;
# target: the test rows restricted to X's columns, in X's order.
y = train['target']
X = train.drop(columns=['target'])
target = test[X.columns]

In [None]:
# Baseline XGBoost with 10-fold CV and early stopping.
skf = StratifiedKFold(n_splits = 10, random_state = 2023, shuffle = True)

# NOTE(review): 'gpu_hist' and 'predictor' are reported as deprecated by
# XGBoost 2.x; left unchanged here.
xgb_param1 = {
    'objective' : 'reg:absoluteerror',
    'tree_method' : 'gpu_hist',
    'predictor' : 'gpu_predictor'
}


# Fold-averaged prediction for the test rows and the per-fold validation MAE.
xgb_pred = np.zeros(target.shape[0])
xgb_mae = []

for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)):

    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    # eval_metric / early_stopping_rounds are passed to the constructor:
    # XGBoost 2.0 removed them from fit(), and the notebook installs an
    # unpinned xgboost. Early stopping watches the last eval_set entry.
    xgb = XGBRegressor(**xgb_param1, eval_metric = 'mae', early_stopping_rounds = 8)
    xgb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], verbose = 1000)

    # astype(int) truncates the predictions before scoring (kept as in the original).
    val_pred = xgb.predict(val_x).astype(int)
    fold_mae = mean_absolute_error(val_y, val_pred)
    xgb_mae.append(fold_mae)
    print(f"{i + 1} Fold MAE = {fold_mae}")

    fold_pred = xgb.predict(target) / skf.n_splits
    xgb_pred += fold_pred

print(f"\nAVG of MAE = {np.mean(xgb_mae)}")

[0]	validation_0-mae:4.29809	validation_1-mae:4.30126
[99]	validation_0-mae:3.25783	validation_1-mae:3.27240
1 Fold MAE = 3.273750218028512
[0]	validation_0-mae:4.29495	validation_1-mae:4.29473
[99]	validation_0-mae:3.20954	validation_1-mae:3.21931
2 Fold MAE = 3.2203810925674614
[0]	validation_0-mae:4.29510	validation_1-mae:4.29426
[82]	validation_0-mae:3.26586	validation_1-mae:3.27240
3 Fold MAE = 3.2722378446445815
[0]	validation_0-mae:4.29525	validation_1-mae:4.29039
[99]	validation_0-mae:3.26033	validation_1-mae:3.26499
4 Fold MAE = 3.2725654191890614
[0]	validation_0-mae:4.29985	validation_1-mae:4.29778
[99]	validation_0-mae:3.22564	validation_1-mae:3.23514
5 Fold MAE = 3.2506923734690143
[0]	validation_0-mae:4.29589	validation_1-mae:4.30170
[99]	validation_0-mae:3.18488	validation_1-mae:3.20379
6 Fold MAE = 3.226307213872144
[0]	validation_0-mae:4.32806	validation_1-mae:4.32366
[79]	validation_0-mae:3.31191	validation_1-mae:3.31908
7 Fold MAE = 3.3191022755795303
[0]	validation_

In [None]:
import optuna
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split



def objective(trial,data=X,target=y):
    """Optuna objective for XGBoost: validation MAE of one sampled configuration.

    Parameters
    ----------
    trial : optuna trial used to sample the hyper-parameters.
    data : feature frame (defaults to the global ``X`` at definition time).
    target : target values aligned with ``data`` (defaults to the global ``y``).

    Returns
    -------
    Mean absolute error on a fixed 20% hold-out split (random_state=2023).
    """
    # Split the arguments rather than the globals so callers can pass their own data.
    X_train, X_valid, y_train, y_valid = train_test_split(data, target, test_size=0.2 ,random_state=2023)
    # suggest_float replaces the deprecated suggest_uniform.
    # NOTE(review): no 'objective' is set here, while the training cells use
    # 'reg:absoluteerror' — confirm this difference is intended.
    param = {
        'tree_method':'gpu_hist',  # train on the GPU to speed up the search
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'lambda': trial.suggest_float('lambda', 0.0, 5.0),
        'alpha': trial.suggest_float('alpha', 1e-3, 1.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5,0.7,0.9,1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'n_estimators': trial.suggest_categorical('n_estimators', [100,300,500,700,1000]),
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11]),
        'random_state': trial.suggest_categorical('random_state', [2023]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
    }

    # early_stopping_rounds is a constructor argument: XGBoost 2.0 removed it
    # from fit().
    model = XGBRegressor(**param, early_stopping_rounds=50)

    model.fit(X_train,y_train,eval_set=[(X_valid,y_valid)],verbose=False)

    preds = model.predict(X_valid)

    mae = mean_absolute_error(y_valid, preds)

    return mae

In [None]:
from optuna.samplers import TPESampler

# Seeded TPE sampler so the hyper-parameter search is repeatable.
sampler = TPESampler(seed=2023)
study = optuna.create_study(study_name="xgb_parameter_opt", direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=10)

best_trial = study.best_trial
print("Best Score:", best_trial.value)
print("Best trial:", best_trial.params)

[32m[I 2023-03-01 10:00:17,088][0m A new study created in memory with name: xgb_parameter_opt[0m
[32m[I 2023-03-01 10:00:45,547][0m Trial 0 finished with value: 3.0354970969960324 and parameters: {'gamma': 1.6099415199598388, 'lambda': 4.452112258947404, 'alpha': 0.5884642031950233, 'colsample_bytree': 0.9, 'subsample': 0.4, 'n_estimators': 300, 'max_depth': 9, 'random_state': 2023, 'min_child_weight': 17}. Best is trial 0 with value: 3.0354970969960324.[0m
[32m[I 2023-03-01 10:01:01,094][0m Trial 1 finished with value: 3.0572209161297192 and parameters: {'gamma': 1.8828189138133906, 'lambda': 0.920270709137832, 'alpha': 0.10484788451657534, 'colsample_bytree': 1.0, 'subsample': 1.0, 'n_estimators': 100, 'max_depth': 9, 'random_state': 2023, 'min_child_weight': 49}. Best is trial 0 with value: 3.0354970969960324.[0m
[32m[I 2023-03-01 10:02:00,168][0m Trial 2 finished with value: 2.9619445368295683 and parameters: {'gamma': 2.0565552716077926, 'lambda': 3.6083220164655994, 'a

Best Score: 2.960761582884222
Best trial: {'gamma': 2.574423762029351, 'lambda': 4.474536589684463, 'alpha': 0.5191467602582522, 'colsample_bytree': 0.9, 'subsample': 1.0, 'n_estimators': 1000, 'max_depth': 9, 'random_state': 2023, 'min_child_weight': 17}


In [None]:
# Tuned XGBoost with 10-fold CV and early stopping.
skf = StratifiedKFold(n_splits = 10, random_state = 2023, shuffle = True)

# NOTE(review): the logged best trial has gamma=2.574423762029351 and
# max_depth=9, while this cell uses gamma=2.774423762029351 and max_depth=8 —
# confirm whether that is a deliberate manual tweak or a transcription slip.
xgb_param1 = {
    'objective' : 'reg:absoluteerror',
    'tree_method' : 'gpu_hist',
    'predictor' : 'gpu_predictor',
    'gamma': 2.774423762029351, 'lambda': 4.474536589684463, 'alpha': 0.5191467602582522,
    'colsample_bytree': 0.9, 'subsample': 1.0, 'n_estimators': 1000, 'max_depth': 8,
    'random_state': 2023, 'min_child_weight': 17
}


# Fold-averaged prediction for the test rows and the per-fold validation MAE.
xgb_pred = np.zeros(target.shape[0])
xgb_mae = []

for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)):

    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    # eval_metric / early_stopping_rounds are passed to the constructor:
    # XGBoost 2.0 removed them from fit(). Early stopping watches the last
    # eval_set entry.
    xgb = XGBRegressor(**xgb_param1, eval_metric = 'mae', early_stopping_rounds = 8)
    xgb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], verbose = 1000)

    # astype(int) truncates the predictions before scoring (kept as in the original).
    val_pred = xgb.predict(val_x).astype(int)
    fold_mae = mean_absolute_error(val_y, val_pred)
    xgb_mae.append(fold_mae)
    print(f"{i + 1} Fold MAE = {fold_mae}")

    fold_pred = xgb.predict(target) / skf.n_splits
    xgb_pred += fold_pred

print(f"\nAVG of MAE = {np.mean(xgb_mae)}")

[0]	validation_0-mae:4.22064	validation_1-mae:4.22653
[153]	validation_0-mae:3.05924	validation_1-mae:3.13286
1 Fold MAE = 3.1577441600265463
[0]	validation_0-mae:4.21466	validation_1-mae:4.21533
[165]	validation_0-mae:3.02530	validation_1-mae:3.10095
2 Fold MAE = 3.1264969518550503
[0]	validation_0-mae:4.25605	validation_1-mae:4.25642
[135]	validation_0-mae:3.04546	validation_1-mae:3.10883
3 Fold MAE = 3.132531555638749
[0]	validation_0-mae:4.26106	validation_1-mae:4.25657
[134]	validation_0-mae:3.04346	validation_1-mae:3.10102
4 Fold MAE = 3.1263544356571273
[0]	validation_0-mae:4.25893	validation_1-mae:4.25825
[160]	validation_0-mae:3.03102	validation_1-mae:3.09802
5 Fold MAE = 3.12178328178643
[0]	validation_0-mae:4.25486	validation_1-mae:4.26159
[139]	validation_0-mae:3.03044	validation_1-mae:3.09979
6 Fold MAE = 3.124229455332871
[0]	validation_0-mae:4.25207	validation_1-mae:4.24976
[149]	validation_0-mae:3.04872	validation_1-mae:3.11170
7 Fold MAE = 3.1345948498474865
[0]	valida

In [None]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

sample_submission = pd.read_csv(PATH+"/sample_submission.csv")
# Start from zero so the fold average does not depend on whatever values the
# template file happens to contain in its target column.
sample_submission['target'] = 0.0


# Tuned parameters plus a small learning rate (eta).
params = {'objective' : 'reg:absoluteerror','tree_method' : 'gpu_hist',
          'predictor' : 'gpu_predictor',
          'gamma': 2.774423762029351, 'lambda': 4.474536589684463, 'alpha': 0.5191467602582522, 'colsample_bytree': 0.9,
          'subsample': 1.0, 'n_estimators': 1000, 'max_depth': 8, 'random_state': 2023, 'min_child_weight': 17, 'eta' : 0.01
          }


# Single source of truth for the fold count (it was hard-coded as 9 in four places).
N_FOLDS = 9
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=404)

folds = list(skf.split(X, y))

# One fitted model per fold, keyed by fold index.
XGB_model = {}

for f, (train_idx, val_idx) in enumerate(folds):
    print(
        f'===================================={f+1}============================================')

    x_train, x_val, y_train, y_val = X.iloc[train_idx], X.iloc[val_idx], y.iloc[train_idx], y.iloc[val_idx]

    XGB = XGBRegressor(**params)
    XGB.fit(x_train, y_train)

    y_pred = XGB.predict(x_val)
    mae = mean_absolute_error(y_val, y_pred)
    print(f"{f + 1} Fold MAE = {mae}")
    XGB_model[f] = XGB
    print(f'================================================================================\n\n')


# Average the fold models' test predictions. Selecting X.columns guarantees
# the test features match the training features in name and order.
test_features = test[X.columns]
for fold in range(N_FOLDS):
    sample_submission['target'] += XGB_model[fold].predict(test_features)/N_FOLDS

sample_submission.to_csv(PATH+"/sample_submission_6.csv", index = False)

1 Fold MAE = 3.1128873129295096


2 Fold MAE = 3.091132997036504


3 Fold MAE = 3.0814317551326074


4 Fold MAE = 3.085421789258399


5 Fold MAE = 3.0904089965686525


6 Fold MAE = 3.1249602144274147


7 Fold MAE = 3.087142298759932


8 Fold MAE = 3.1130516248685582


9 Fold MAE = 3.112618992178237




In [None]:
# Reload sample_submission_6.csv, the file written by the XGBoost k-fold cell
sub3 = pd.read_csv(PATH+"/sample_submission_6.csv")

In [None]:
# Display the loaded submission frame
sub3

Unnamed: 0,id,target
0,TEST_000000,26.181024
1,TEST_000001,42.041897
2,TEST_000002,64.818620
3,TEST_000003,36.542410
4,TEST_000004,48.509133
...,...,...
291236,TEST_291236,48.380587
291237,TEST_291237,51.440572
291238,TEST_291238,22.940762
291239,TEST_291239,31.602361


In [None]:
# Ensemble: average the targets of three submissions, round to whole numbers,
# and save the result as sample_submission_fin2.csv.
# NOTE(review): sub1 and sub2 are not defined anywhere in the visible notebook
# (only sub3 and sub4 are loaded) — confirm where they come from before running.
sample_submission = pd.read_csv(PATH+"/sample_submission.csv")
sample_submission['target'] = np.round((sub1['target'] + sub2['target'] + sub3['target']) / 3)
sample_submission.to_csv(PATH+"/sample_submission_fin2.csv", index = False)

# XGBoost 1

In [None]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor, Pool
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [None]:
# Columns removed from both the training and the test frame before modelling.
# Keeping a single list guarantees the two frames are trimmed identically.
cols_to_drop = [
    'base_date',
    'date',
    'year_month',
    'road_type',
    'road_rating',
    'road_hour',
    'road_day',
    'start_turn_restricted',
    'end_turn_restricted',
]

train = train.drop(columns=cols_to_drop)
test = test.drop(columns=cols_to_drop)

In [None]:
# Split the training frame into labels (y) and features (X), then select the
# same feature columns, in the same order, from the test frame.
# Note: `target` holds the *test features*, not the label column.
y = train['target']
X = train.drop(columns=['target'])
target = test.loc[:, X.columns]

In [None]:
# Baseline GPU XGBoost: 10-fold stratified CV with early stopping.
# Per-fold validation MAE is collected in xgb_mae; the test prediction is the
# mean of the ten fold models and is accumulated in xgb_pred.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2023)

xgb_param1 = dict(
    objective='reg:absoluteerror',
    tree_method='gpu_hist',
    predictor='gpu_predictor',
)

xgb_mae = []
xgb_pred = np.zeros(target.shape[0])

for fold_no, (fit_rows, holdout_rows) in enumerate(skf.split(X, y), start=1):
    tr_x, val_x = X.iloc[fit_rows], X.iloc[holdout_rows]
    tr_y, val_y = y.iloc[fit_rows], y.iloc[holdout_rows]

    xgb = XGBRegressor(**xgb_param1)
    xgb.fit(
        tr_x,
        tr_y,
        eval_set=[(tr_x, tr_y), (val_x, val_y)],
        eval_metric='mae',
        early_stopping_rounds=8,
        verbose=1000,
    )

    # .astype(int) truncates the predictions toward zero before scoring.
    fold_mae = mean_absolute_error(val_y, xgb.predict(val_x).astype(int))
    xgb_mae.append(fold_mae)
    print(f"{fold_no} Fold MAE = {fold_mae}")

    # Each fold contributes 1/n_splits of the final test prediction.
    xgb_pred += xgb.predict(target) / skf.n_splits

print(f"\nAVG of MAE = {np.mean(xgb_mae)}")

[0]	validation_0-mae:4.29809	validation_1-mae:4.30126
[48]	validation_0-mae:3.33140	validation_1-mae:3.34027
1 Fold MAE = 3.3402478505579403
[0]	validation_0-mae:4.29495	validation_1-mae:4.29473
[83]	validation_0-mae:3.29789	validation_1-mae:3.30278
2 Fold MAE = 3.302629955628539
[0]	validation_0-mae:4.29519	validation_1-mae:4.29439
[63]	validation_0-mae:3.34215	validation_1-mae:3.34606
3 Fold MAE = 3.345982532193771
[0]	validation_0-mae:4.29522	validation_1-mae:4.29047
[99]	validation_0-mae:3.21398	validation_1-mae:3.21800
4 Fold MAE = 3.2184986024904174
[0]	validation_0-mae:4.29985	validation_1-mae:4.29778
[99]	validation_0-mae:3.23984	validation_1-mae:3.24667
5 Fold MAE = 3.246882724058861
[0]	validation_0-mae:4.29589	validation_1-mae:4.30170
[99]	validation_0-mae:3.23126	validation_1-mae:3.24979
6 Fold MAE = 3.2678219696163975
[0]	validation_0-mae:4.32803	validation_1-mae:4.32370
[99]	validation_0-mae:3.23003	validation_1-mae:3.24134
7 Fold MAE = 3.2413947868851065
[0]	validation_0

In [None]:
import optuna
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split



def objective(trial,data=X,target=y):
    """Optuna objective: hold-out MAE of an XGBRegressor built from the trial's parameters.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        data: Feature frame to split; defaults to the notebook-level ``X``.
        target: Labels aligned with ``data``; defaults to the notebook-level ``y``.

    Returns:
        Mean absolute error on the 20% hold-out split (lower is better).
    """
    # Split the arguments rather than the globals X / y, so that a caller who
    # passes its own data/target actually gets it used.
    X_train, X_valid, y_train, y_valid = train_test_split(data, target, test_size=0.2, random_state=2023)

    # suggest_float replaces the deprecated suggest_uniform (same uniform range).
    # NOTE(review): no 'objective' key is set here, whereas the final models use
    # 'reg:absoluteerror' - confirm the search is meant to train with a different loss.
    param = {
        'tree_method': 'gpu_hist',  # train on the GPU to speed up each trial
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'lambda': trial.suggest_float('lambda', 0.0, 5.0),
        'alpha': trial.suggest_float('alpha', 1e-3, 1.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5,0.7,0.9,1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'n_estimators': trial.suggest_categorical('n_estimators', [100,300,500,700,1000]),
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11]),
        # Single-choice categorical: the seed is fixed but still recorded in trial.params.
        'random_state': trial.suggest_categorical('random_state', [2023]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
    }

    model = XGBRegressor(**param)

    # The hold-out split serves both as the early-stopping set and as the scored set.
    model.fit(X_train,y_train,eval_set=[(X_valid,y_valid)],early_stopping_rounds=50,verbose=False)

    preds = model.predict(X_valid)

    return mean_absolute_error(y_valid, preds)

In [None]:
from optuna.samplers import TPESampler

# Run a short (10-trial) TPE search that minimises the MAE returned by objective().
# The sampler is seeded with 2023.
sampler = TPESampler(seed=2023)
study = optuna.create_study(direction="minimize", sampler=sampler, study_name="xgb_parameter_opt")
study.optimize(objective, n_trials=10)

print("Best Score:", study.best_value)
print("Best trial:", study.best_params)

[32m[I 2023-03-01 06:58:57,887][0m A new study created in memory with name: xgb_parameter_opt[0m
[32m[I 2023-03-01 06:59:28,924][0m Trial 0 finished with value: 3.032706913212054 and parameters: {'gamma': 1.6099415199598388, 'lambda': 4.452112258947404, 'alpha': 0.5884642031950233, 'colsample_bytree': 0.9, 'subsample': 0.4, 'n_estimators': 300, 'max_depth': 9, 'random_state': 2023, 'min_child_weight': 17}. Best is trial 0 with value: 3.032706913212054.[0m
[32m[I 2023-03-01 06:59:45,631][0m Trial 1 finished with value: 3.052021485796651 and parameters: {'gamma': 1.8828189138133906, 'lambda': 0.920270709137832, 'alpha': 0.10484788451657534, 'colsample_bytree': 1.0, 'subsample': 1.0, 'n_estimators': 100, 'max_depth': 9, 'random_state': 2023, 'min_child_weight': 49}. Best is trial 0 with value: 3.032706913212054.[0m
[32m[I 2023-03-01 07:00:54,619][0m Trial 2 finished with value: 2.9570779279179633 and parameters: {'gamma': 2.0565552716077926, 'lambda': 3.6083220164655994, 'alpha

Best Score: 2.9570779279179633
Best trial: {'gamma': 2.0565552716077926, 'lambda': 3.6083220164655994, 'alpha': 0.6636241929424768, 'colsample_bytree': 1.0, 'subsample': 0.6, 'n_estimators': 1000, 'max_depth': 11, 'random_state': 2023, 'min_child_weight': 50}


In [None]:
# Tuned GPU XGBoost: 10-fold stratified CV with early stopping.
# Per-fold validation MAE is collected in xgb_mae; the test prediction is the
# mean of the ten fold models and is accumulated in xgb_pred.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2023)

xgb_param1 = {
    'objective': 'reg:absoluteerror',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'gamma': 2.0565552716077926,
    'lambda': 3.6083220164655994,
    'alpha': 0.6636241929424768,
    'colsample_bytree': 1.0,
    'subsample': 0.6,
    'n_estimators': 1000,
    'max_depth': 8,
    'random_state': 2023,
    'min_child_weight': 50,
    'eta': 0.015,
}

xgb_mae = []
xgb_pred = np.zeros(target.shape[0])

for fold_no, (fit_rows, holdout_rows) in enumerate(skf.split(X, y), start=1):
    tr_x, val_x = X.iloc[fit_rows], X.iloc[holdout_rows]
    tr_y, val_y = y.iloc[fit_rows], y.iloc[holdout_rows]

    xgb = XGBRegressor(**xgb_param1)
    xgb.fit(
        tr_x,
        tr_y,
        eval_set=[(tr_x, tr_y), (val_x, val_y)],
        eval_metric='mae',
        early_stopping_rounds=8,
        verbose=1000,
    )

    # .astype(int) truncates the predictions toward zero before scoring.
    fold_mae = mean_absolute_error(val_y, xgb.predict(val_x).astype(int))
    xgb_mae.append(fold_mae)
    print(f"{fold_no} Fold MAE = {fold_mae}")

    # Each fold contributes 1/n_splits of the final test prediction.
    xgb_pred += xgb.predict(target) / skf.n_splits

print(f"\nAVG of MAE = {np.mean(xgb_mae)}")

In [None]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# Train one tuned XGBoost model per fold (9-fold stratified CV), report each
# fold's validation MAE, then write the fold-averaged test prediction to
# sample_submission_4.csv.
sample_submission = pd.read_csv(PATH+"/sample_submission.csv")


params = {'objective' : 'reg:absoluteerror','tree_method' : 'gpu_hist', 
          'predictor' : 'gpu_predictor', 'gamma': 2.0565552716077926, 'lambda': 3.6083220164655994, 'alpha': 0.6636241929424768, 
          'colsample_bytree': 1.0, 'subsample': 0.6, 'n_estimators': 1000, 'max_depth': 8, 'random_state': 2023, 'min_child_weight': 50, 'eta' : 0.015
          }


skf = StratifiedKFold(n_splits=9, shuffle=True, random_state=404)

# Materialise the (train_idx, val_idx) pairs once so they can be indexed by fold.
folds = list(skf.split(X, y))

XGB_model = {}

for f, (train_idx, val_idx) in enumerate(folds):
    print(
        f'===================================={f+1}============================================')

    x_train, x_val, y_train, y_val = X.iloc[train_idx], X.iloc[val_idx], y.iloc[train_idx], y.iloc[val_idx]

    XGB = XGBRegressor(**params)
    XGB.fit(x_train, y_train)

    y_pred = XGB.predict(x_val)
    mae = mean_absolute_error(y_val, y_pred)
    print(f"{f + 1} Fold MAE = {mae}")
    XGB_model[f] = XGB
    print(f'================================================================================\n\n')


# Average the fold models into a fresh zero array instead of adding onto the
# values already present in sample_submission['target']; the result therefore
# does not depend on what the CSV's 'target' column contains.
# The fold count comes from skf rather than a repeated literal 9, and the test
# frame is re-indexed to X's columns so the features line up with training.
test_features = test[X.columns]
test_pred = np.zeros(len(test_features))

for fold in range(skf.n_splits):
    test_pred += XGB_model[fold].predict(test_features) / skf.n_splits

sample_submission['target'] = test_pred
sample_submission.to_csv(PATH+"/sample_submission_4.csv", index = False)

1 Fold MAE = 3.0772075207821974


2 Fold MAE = 3.0763008442294355


3 Fold MAE = 3.0688078435139845


4 Fold MAE = 3.0693848314425822


5 Fold MAE = 3.0715137126931378


6 Fold MAE = 3.0769248957925694


7 Fold MAE = 3.0805428913244626


8 Fold MAE = 3.071684182762709


9 Fold MAE = 3.0727304044988126




In [None]:
# Reload the XGBoost submission saved as sample_submission_4.csv; its 'target' column is used when blending.
sub1 = pd.read_csv(PATH+"/sample_submission_4.csv")

In [None]:
# Display the loaded submission frame (id / target columns) as a sanity check.
sub1

Unnamed: 0,id,target
0,TEST_000000,26.288453
1,TEST_000001,41.951781
2,TEST_000002,61.402993
3,TEST_000003,37.561128
4,TEST_000004,50.149511
...,...,...
291236,TEST_291236,49.141225
291237,TEST_291237,49.930929
291238,TEST_291238,22.329930
291239,TEST_291239,31.655704


# LGBM1

In [None]:
from lightgbm import LGBMRegressor

# Baseline GPU LightGBM: 10-fold stratified CV.
# Per-fold validation MAE is collected in lgbm_mae; the test prediction is the
# mean of the ten fold models and is accumulated in lgbm_pred1.
skf = StratifiedKFold(n_splits = 10, random_state = 2023, shuffle = True)

lgbm_param1 = {
    'objective' : 'regression',
    'device' : 'gpu',
    'metric' : 'mae'
}


lgbm_pred1 = np.zeros(target.shape[0])
lgbm_mae = []

for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)):

    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    lgbm = LGBMRegressor(**lgbm_param1)
    lgbm.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], eval_metric = 'mae')

    # .astype(int) truncates the predictions toward zero before scoring.
    val_pred = lgbm.predict(val_x).astype(int)
    fold_mae = mean_absolute_error(val_y, val_pred)
    lgbm_mae.append(fold_mae)
    print(f"{i + 1} Fold MAE = {fold_mae}")

    # Accumulate into the LightGBM buffer. The original added to xgb_pred,
    # which left lgbm_pred1 all zeros and altered the XGBoost predictions.
    fold_pred = lgbm.predict(target) / skf.n_splits
    lgbm_pred1 += fold_pred

# Average this model's own fold scores (the original averaged xgb_mae).
print(f"\nAVG of MAE = {np.mean(lgbm_mae)}")

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2204
[LightGBM] [Info] Number of data points in the train set: 4231095, number of used features: 26
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 22 dense feature groups (96.84 MB) transferred to GPU in 0.108513 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 42.788469
1 Fold MAE = 3.4196846775943266
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2203
[LightGBM] [Info] Number of data points in the train set: 4231095, number of used features: 26
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 

In [None]:
import optuna
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split



def objective(trial,data=X,target=y):
    """Optuna objective: hold-out MAE of an LGBMRegressor built from the trial's parameters.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        data: Feature frame to split; defaults to the notebook-level ``X``.
        target: Labels aligned with ``data``; defaults to the notebook-level ``y``.

    Returns:
        Mean absolute error on the 20% hold-out split (lower is better).
    """
    # Split the arguments rather than the globals X / y, so that a caller who
    # passes its own data/target actually gets it used.
    X_train, X_valid, y_train, y_valid = train_test_split(data, target, test_size=0.2, random_state=2023)

    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # log=True keeps the log-scaled sampling the original used for 'subsample'.
    param = {
        'device' : 'gpu',
        'objective': 'regression',
        'verbose': -1,
        'metric': 'mae',
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=1, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'max_depth': trial.suggest_int('max_depth',3, 15),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0, log=True)
    }

    # Fit on the training split; the hold-out split is passed as eval_set and
    # is also the set the returned MAE is computed on.
    model = LGBMRegressor(**param)
    model.fit(X_train,y_train,eval_set=[(X_valid,y_valid)], eval_metric = 'mae')

    preds = model.predict(X_valid)

    return mean_absolute_error(y_valid, preds)

In [None]:
from optuna.samplers import TPESampler

# Run a short (10-trial) TPE search that minimises the MAE returned by objective().
# The sampler is seeded with 2023.
sampler = TPESampler(seed=2023)
study = optuna.create_study(direction="minimize", sampler=sampler, study_name="lgbm_parameter_opt")
study.optimize(objective, n_trials=10)

print("Best Score:", study.best_value)
print("Best trial:", study.best_params)

In [None]:
from lightgbm import LGBMRegressor

# Tuned GPU LightGBM: 10-fold stratified CV.
# Per-fold validation MAE is collected in lgbm_mae; the test prediction is the
# mean of the ten fold models and is accumulated in lgbm_pred1.
skf = StratifiedKFold(n_splits = 10, random_state = 2023, shuffle = True)

lgbm_param1 = {
    'objective' : 'regression',
    'device' : 'gpu',
    'metric' : 'mae',
    'num_leaves': 114, 'colsample_bytree': 0.7654675794588895, 'reg_alpha': 0.1871725370949876, 'reg_lambda': 7.297792387589718, 'max_depth': 14, 'n_estimators': 1236, 'min_child_samples': 15, 'subsample': 0.9232009658684728
}


lgbm_pred1 = np.zeros(target.shape[0])
lgbm_mae = []

for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)):

    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    lgbm = LGBMRegressor(**lgbm_param1)
    lgbm.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], eval_metric = 'mae')

    # .astype(int) truncates the predictions toward zero before scoring.
    val_pred = lgbm.predict(val_x).astype(int)
    fold_mae = mean_absolute_error(val_y, val_pred)
    lgbm_mae.append(fold_mae)
    print(f"{i + 1} Fold MAE = {fold_mae}")

    # Accumulate into the LightGBM buffer. The original added to xgb_pred,
    # which left lgbm_pred1 all zeros and altered the XGBoost predictions.
    fold_pred = lgbm.predict(target) / skf.n_splits
    lgbm_pred1 += fold_pred

# Average this model's own fold scores (the original averaged xgb_mae).
print(f"\nAVG of MAE = {np.mean(lgbm_mae)}")

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2204
[LightGBM] [Info] Number of data points in the train set: 4231095, number of used features: 26
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 22 dense feature groups (96.84 MB) transferred to GPU in 0.103469 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 42.788469
1 Fold MAE = 3.0350504762593538
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2203
[LightGBM] [Info] Number of data points in the train set: 4231095, number of used features: 26
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 

KeyboardInterrupt: ignored

In [None]:
# Train one tuned LightGBM model per fold (9-fold stratified CV), report each
# fold's validation MAE, then write the fold-averaged test prediction to
# sample_submission_5.csv.
sample_submission = pd.read_csv(PATH+"/sample_submission.csv")


params =  {'objective' : 'regression', 'device' : 'gpu', 'metric' : 'mae', 'num_leaves': 114, 'colsample_bytree': 0.7654675794588895, 
           'reg_alpha': 0.1871725370949876, 'reg_lambda': 7.297792387589718, 'max_depth': 14, 'n_estimators': 1236, 'min_child_samples': 15, 
           'subsample': 0.9232009658684728, 'learning_rate' : 0.015
}


skf = StratifiedKFold(n_splits=9, shuffle=True, random_state=404)

# Materialise the (train_idx, val_idx) pairs once so they can be indexed by fold.
folds = list(skf.split(X, y))

lgbm_model = {}

for f, (train_idx, val_idx) in enumerate(folds):
    print(
        f'===================================={f+1}============================================')

    x_train, x_val, y_train, y_val = X.iloc[train_idx], X.iloc[val_idx], y.iloc[train_idx], y.iloc[val_idx]

    lgbm = LGBMRegressor(**params)
    lgbm.fit(x_train, y_train)

    y_pred = lgbm.predict(x_val)
    mae = mean_absolute_error(y_val, y_pred)
    print(f"{f + 1} Fold MAE = {mae}")
    lgbm_model[f] = lgbm
    print(f'================================================================================\n\n')


# Average the fold models into a fresh zero array instead of adding onto the
# values already present in sample_submission['target']; the result therefore
# does not depend on what the CSV's 'target' column contains.
# The fold count comes from skf rather than a repeated literal 9, and the test
# frame is re-indexed to X's columns so the features line up with training.
test_features = test[X.columns]
test_pred = np.zeros(len(test_features))

for fold in range(skf.n_splits):
    test_pred += lgbm_model[fold].predict(test_features) / skf.n_splits

sample_submission['target'] = test_pred
sample_submission.to_csv(PATH+"/sample_submission_5.csv", index = False)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2204
[LightGBM] [Info] Number of data points in the train set: 4178859, number of used features: 26
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 22 dense feature groups (95.65 MB) transferred to GPU in 0.100053 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 42.788459
1 Fold MAE = 3.1594831283568756


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2200
[LightGBM] [Info] Number of data points in the train set: 4178859, number of used features: 26
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry

In [None]:
# Reload the LightGBM submission saved as sample_submission_5.csv; its 'target' column is used when blending.
sub2 = pd.read_csv(PATH+"/sample_submission_5.csv")

In [None]:
# Re-read sample_submission.csv into sample_submission; the following cells overwrite its 'target' column and save it.
sample_submission = pd.read_csv(PATH+"/sample_submission.csv")

In [None]:
# Two-model blend: element-wise mean of the sub1 and sub2 targets, rounded to the nearest integer.
sample_submission['target'] = np.round((sub1['target'] + sub2['target']) / 2)

In [None]:
# Write the blended submission without the DataFrame index column.
sample_submission.to_csv(PATH+"/sample_submission_fin.csv", index = False)