https://www.kaggle.com/code/cdeotte/xgboost-starter-0-793

In [None]:
import os, sys
from google.colab import drive
# Mount Google Drive at /content/drive (this cell only works inside Google Colab).
drive.mount('/content/drive')

In [None]:
# Expose the Drive dataset folder at /content/dataset and make it importable.
nb_path = '/content/dataset'
# os.symlink raises FileExistsError if the link is already there, so guard it
# to keep this cell safe to re-run (lexists is also true for a dangling link).
if not os.path.lexists(nb_path):
    os.symlink('/content/drive/MyDrive/dacon/2022_jeju/dataset', nb_path)
# Avoid stacking duplicate entries on sys.path when the cell runs again.
if nb_path not in sys.path:
    sys.path.insert(0, nb_path)

## Import

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
from sklearn.metrics import mean_absolute_error
import seaborn as sns
# Notebook display limits: at most 10 rows and up to 500 columns per rendered frame.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 500)
import tqdm


## csv to parquet
#### -> 메모리에 효율적인 데이터 유형을 사용하여 용량을 크게 줄이고 빠른 작업이 가능합니다.

In [None]:
# def csv_to_parquet(csv_path, save_name):
#     df = pd.read_csv(csv_path)
#     df.to_parquet(f'dataset/{save_name}.parquet')
#     del df
#     gc.collect()
#     print(save_name, 'Done.')

In [None]:
# csv_to_parquet('dataset/train.csv', 'train')
# csv_to_parquet('dataset/test.csv', 'test')

## Config

In [None]:
class cfg:
    """Run configuration readable as attributes or with ``obj[key]`` syntax."""

    # Cross-validation folds, random seed, and version tag for saved artifacts.
    FOLDS = 5
    SEED = 42
    VER = 1.1

    # General training settings.
    lr = 1e-3
    epochs = 200
    batch_size = 16

    # Model family label.
    backbone = 'XGB'

    def __getitem__(self, key):
        """Return the attribute named ``key`` (``obj[key]`` is ``obj.key``)."""
        return getattr(self, key)

    def __setitem__(self, key, value):
        """Set attribute ``key`` on this instance (``obj[key] = value``)."""
        setattr(self, key, value)


# Shared configuration instance used throughout the notebook.
CFG = cfg()

## 데이터 불러오기

In [None]:
# Load the train and test splits from the parquet files under dataset/.
train = pd.read_parquet('dataset/train.parquet')
test = pd.read_parquet('dataset/test.parquet')

In [None]:
def process_and_feature_engineer(train, test):
    """Label-encode the categorical columns and drop unused columns from ``train``.

    Each encoder is fitted on ``train`` only. Labels that occur only in ``test``
    are appended after the fitted classes, so the codes of labels seen during
    fitting are unchanged and test-only labels receive the next free codes.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing every column listed in ``cat_features``; ``train``
        must also contain the columns dropped below. The categorical columns
        of BOTH frames are overwritten in place with their integer codes.

    Returns
    -------
    tuple
        ``(train, test)`` where ``train`` is a new frame without the dropped
        columns and ``test`` is the object that was passed in (encoded, with
        no columns removed).
    """
    cat_features = ['day_of_week', 'road_name', 'start_node_name', 'end_node_name',
                    'start_turn_restricted', 'end_turn_restricted']

    for col in cat_features:
        le = LabelEncoder().fit(train[col])
        train[col] = le.transform(train[col])

        # Register test-only labels in a single step instead of growing
        # classes_ one label at a time; setdiff1d returns them sorted, which is
        # the same order the per-label loop over np.unique would append them in.
        # NOTE(review): appended labels leave classes_ unsorted; this looks fine
        # for string columns but should be confirmed for numeric ones.
        unseen = np.setdiff1d(np.unique(test[col]), le.classes_)
        if unseen.size:
            le.classes_ = np.append(le.classes_, unseen)
        test[col] = le.transform(test[col])

    train = train.drop(['base_date',
                        'road_name',
                        'start_node_name',
                        'end_node_name',
                        'vehicle_restricted',
                        'height_restricted'], axis=1)
    # No columns are dropped from test here.

    return train, test

In [None]:
# Encode the categorical columns / drop unused train columns, then check the shapes.
train, test = process_and_feature_engineer(train, test)
print(train.shape, test.shape)

(4701217, 17) (291241, 22)


In [None]:
# Model inputs: every train column except the row identifier and the label.
# Selecting by name instead of by position keeps this correct if the column
# order changes, and raises KeyError if either column is missing.
FEATURES = train.columns.drop(['id', 'target'])

In [None]:
# Display the feature column names passed to the model.
FEATURES

Index(['day_of_week', 'base_hour', 'lane_count', 'road_rating', 'multi_linked',
       'connect_code', 'maximum_speed_limit', 'weight_restricted', 'road_type',
       'start_latitude', 'start_longitude', 'start_turn_restricted',
       'end_latitude', 'end_longitude', 'end_turn_restricted'],
      dtype='object')

## 데이터 전처리

In [None]:
# y_train = train['target'] 

# X_train = train.drop(['id','base_date', 'target','road_name', 'start_node_name', 'end_node_name','vehicle_restricted'], axis=1)

# test = test.drop(['id','base_date', 'road_name', 'start_node_name', 'end_node_name','vehicle_restricted'], axis=1)

# print(X_train.shape)
# print(y_train.shape)
# print(test.shape)

## 모델 선언 및 학습

In [None]:
# LOAD XGB LIBRARY
from sklearn.model_selection import KFold
import xgboost as xgb
print('XGB Version',xgb.__version__)

# XGB MODEL PARAMETERS
# NOTE(review): the training cell visible in this notebook does not reference
# this dict; it passes its own values (e.g. max_depth=8, learning_rate=0.1)
# straight to xgb.XGBRegressor. Confirm whether xgb_parms is still needed.
xgb_parms = { 
    'max_depth':4, 
    'learning_rate':0.05, 
    'subsample':0.8,
    'colsample_bytree':0.6, 
    'eval_metric':'mae',
    'objective':'reg:squarederror',
    'random_state':CFG['SEED'],
}

XGB Version 1.6.2


In [None]:
# K-fold training: fit one XGBoost regressor per fold, save it to disk, and
# collect out-of-fold (OOF) predictions for every training row.
TRAIN_SUBSAMPLE = 1.0  # fraction of each training fold actually used for fitting

# KFold yields *positional* row indices, so rows are picked with .iloc;
# label-based .loc only agrees with that while `train` has a default 0..n-1 index.
feature_cols = train.columns.get_indexer(FEATURES)
target_col = train.columns.get_loc('target')

oof_parts = []
skf = KFold(n_splits=CFG['FOLDS'], shuffle=True, random_state=CFG['SEED'])
for fold, (train_idx, valid_idx) in enumerate(skf.split(train)):
    # TRAIN WITH SUBSAMPLE OF TRAIN FOLD DATA
    if TRAIN_SUBSAMPLE < 1.0:
        # A private seeded generator draws the same rows as seeding the global
        # NumPy RNG with this seed would, without touching global random state.
        rng = np.random.RandomState(CFG['SEED'])
        train_idx = rng.choice(train_idx,
                               int(len(train_idx) * TRAIN_SUBSAMPLE), replace=False)

    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)

    X_train = train.iloc[train_idx, feature_cols]
    y_train = train.iloc[train_idx, target_col]
    X_valid = train.iloc[valid_idx, feature_cols]
    y_valid = train.iloc[valid_idx, target_col]

    model = xgb.XGBRegressor(
        max_depth=8,
        learning_rate=0.1,
        subsample=.8,
        colsample_bytree=.6,
        eval_metric='mae',
        objective='reg:squarederror',
        random_state=CFG['SEED'],
        verbosity=2,
        n_estimators=3000,
        # Stop once the validation MAE has not improved for 100 rounds.
        early_stopping_rounds=100,
    )
    model.fit(
        X_train, y_train,
        verbose=True,
        # One (features, labels) pair per evaluation set.
        eval_set=[(X_valid, y_valid)],
    )
    # The file name carries CFG['VER'] and the zero-based fold number.
    model.save_model(f"XGB_v{CFG['VER']}_fold{fold}.xgb")
    oof_preds = model.predict(X_valid)
    print(mean_absolute_error(y_valid, oof_preds))

    # Keep this fold's validation rows together with their predictions.
    fold_oof = train[['id', 'target']].iloc[valid_idx].copy()
    fold_oof['oof_pred'] = oof_preds
    oof_parts.append(fold_oof)

# One row per training sample, indexed by id (rows are grouped by fold).
oof = pd.concat(oof_parts, axis=0, ignore_index=True).set_index('id')


#########################
### Fold 1
### Train size 3760973 Valid size 940244
### Training with 100% fold data...
#########################
[15:52:30] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 280 extra nodes, 0 pruned nodes, max_depth=8
[0]	validation_0-mae:38.06679
[15:52:31] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 418 extra nodes, 0 pruned nodes, max_depth=8
[1]	validation_0-mae:34.26291
[15:52:32] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 286 extra nodes, 0 pruned nodes, max_depth=8
[2]	validation_0-mae:30.84678
[15:52:33] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/tree/updater_prune.cc:10

In [None]:
# Display the out-of-fold predictions next to the true target.
oof

Unnamed: 0_level_0,target,oof_pred
id,Unnamed: 1_level_1,Unnamed: 2_level_1
TRAIN_0000000,52.0,49.688110
TRAIN_0000002,61.0,66.371490
TRAIN_0000008,14.0,25.421900
TRAIN_0000012,60.0,48.141293
TRAIN_0000013,28.0,39.093857
...,...,...
TRAIN_4701195,62.0,45.227528
TRAIN_4701201,31.0,57.743927
TRAIN_4701204,55.0,50.751095
TRAIN_4701207,51.0,37.401199


## 추론

In [None]:
import tqdm

In [None]:
# Average the predictions of all fold models on the test set.
X_test = test[FEATURES]

model = xgb.XGBRegressor()
preds = None
for fold in tqdm.trange(CFG['FOLDS']):
    # Build the path exactly as the training loop does when saving
    # (f"XGB_v{CFG['VER']}_fold{fold}.xgb"). A hard-coded "v1" tag does not
    # match VER = 1.1, so it would load stale or missing files.
    model.load_model(f"XGB_v{CFG['VER']}_fold{fold}.xgb")
    fold_preds = model.predict(X_test)
    preds = fold_preds if preds is None else preds + fold_preds
preds = preds / CFG['FOLDS']

100%|██████████| 3/3 [00:36<00:00, 12.23s/it]


## Submission

In [None]:
# Load the submission template from dataset/sample_submission.csv.
sample_submission = pd.read_csv('dataset/sample_submission.csv')

In [None]:
# Fill in the averaged predictions and write the submission file.
# NOTE(review): assumes the template rows are in the same order as `test` — confirm.
# NOTE(review): the output name is fixed to "v1" regardless of CFG['VER'] — confirm intended.
sample_submission['target'] = preds
sample_submission.to_csv("./submit_xgboost_v1.csv", index = False)

In [None]:
# Display the submission frame that was written to disk.
sample_submission

Unnamed: 0,id,target
0,TEST_000000,24.212671
1,TEST_000001,44.725983
2,TEST_000002,64.655403
3,TEST_000003,38.077053
4,TEST_000004,44.934578
...,...,...
291236,TEST_291236,47.429817
291237,TEST_291237,51.745987
291238,TEST_291238,22.799397
291239,TEST_291239,24.035179
