In [1]:
import hydra
import re
import wandb
import pandas as pd
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from omegaconf import DictConfig, OmegaConf
from typing import List

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import GroupKFold
import lightgbm

from utils import seed_everything, AverageMeter
from feature_block import run_block, NumericBlock, LabelEncodingBlock, CountEncodingBlock, AggBlock


# Directory layout is derived from the notebook's working directory:
# the experiment id is the name of the current directory, and the project
# root is taken to be three levels above it (parents[2]).
GBDT_DIR = Path.cwd()
GBDT_ID =  Path.cwd().name
ROOT_DIR = GBDT_DIR.parents[2]

# Input data locations.
DATA_DIR = ROOT_DIR / 'data'
ORIGINAL_DATA_DIR = DATA_DIR / 'original_data/atmaCup#18_dataset'
CREATED_DATA_DIR = DATA_DIR / 'created_data'

OUTPUT_DIR = ROOT_DIR / 'outputs'

# Per-experiment output directory; created on import/run if it does not exist.
SAVE_DIR = OUTPUT_DIR / 'gbdt' / GBDT_ID
SAVE_DIR.mkdir(exist_ok=True, parents=True)

# Directory for wandb files; also created eagerly.
WANDB_DIR = SAVE_DIR / 'wandb'
WANDB_DIR.mkdir(parents=True, exist_ok=True)

# Column groups used throughout the notebook.
ID_COLUMNS = ['ID']
META_COLUMNS = ['vEgo', 'aEgo', 'steeringAngleDeg', 'steeringTorque', 'brakePressed', 'gas', 'gasPressed', 'gearShifter', 'leftBlinker', 'rightBlinker']
# 18 regression targets: (x, y, z) for indices 0..5.
TARGET_COLUMNS = ['x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1', 'x_2', 'y_2', 'z_2', 'x_3', 'y_3', 'z_3', 'x_4', 'y_4', 'z_4', 'x_5', 'y_5', 'z_5']

def split_data(cfg, df):
    '''
    Assign a scene-grouped cross-validation fold id to every row of ``df``.

    The scene is the part of ``ID`` before the first underscore; all rows of
    one scene end up in the same fold (``GroupKFold``).

    Parameters
    ----------
    cfg : config object exposing ``n_folds``.
    df : DataFrame with an ``ID`` column. A ``fold`` column is added in place.

    Returns
    -------
    The same ``df`` object, with an integer ``fold`` column.
    '''
    scene_ser = df['ID'].str.split('_').str[0]

    # GroupKFold.split yields *positional* indices. Collect them in a
    # positional array instead of using df.loc, which would interpret them as
    # index labels and mark the wrong rows whenever df's index is not 0..n-1
    # in row order (e.g. after the frame has been sorted).
    fold_ids = np.full(len(df), -1, dtype=np.int64)
    group_kfold = GroupKFold(n_splits=cfg.n_folds)
    for ifold, (_, valid_pos) in enumerate(group_kfold.split(df, groups=scene_ser)):
        fold_ids[valid_pos] = ifold

    df['fold'] = fold_ids
    return df

def mae(gt: np.array, pred: np.array):
    """Return the mean absolute error between ``gt`` and ``pred`` as a Python float.

    Both inputs are expected to be numpy arrays of the same (broadcastable)
    shape; the element-wise absolute differences are flattened and averaged.
    """
    flat_errors = np.abs(gt - pred).reshape(-1, )
    return float(np.mean(flat_errors))

In [2]:
# Load the competition CSVs: train/test feature tables and the sample submission.
raw_train_df = pd.read_csv(ORIGINAL_DATA_DIR / 'train_features.csv')
raw_test_df = pd.read_csv(ORIGINAL_DATA_DIR / 'test_features.csv')
ss_df = pd.read_csv(ORIGINAL_DATA_DIR / 'atmaCup18__sample_submit.csv')

In [3]:
# Convert bool columns to int and add scene-level / vehicle-dynamics features.
def common_preprocess(target_df: pd.DataFrame) -> "tuple[pd.DataFrame, List[str]]":
    '''
    Basic preprocessing shared by train and test.

    Steps
    -----
    - drop the ``brake`` column if present
    - cast the boolean flag columns to int
    - derive ``scene`` / ``scene_sec`` from ``ID`` (``<scene>_<sec>``)
    - add per-scene counters, steering/velocity transforms, within-scene
      differences and 2-step rolling means

    The input frame is NOT modified; all work happens on a copy.

    Returns
    -------
    (df, num_cols)
        ``df`` has the rows ordered by the input index labels and a fresh
        0..n-1 index. ``num_cols`` is a flat list of the names of the numeric
        feature columns created here.
    '''
    num_cols = []

    # Work on a copy so the caller's frame keeps its row order and columns.
    df = target_df.copy()

    # Drop 'brake'.
    if 'brake' in df.columns:
        df = df.drop(columns='brake')

    # Boolean flag columns -> 0/1.
    bool_columns = ['brakePressed', 'gasPressed', 'leftBlinker', 'rightBlinker']
    df[bool_columns] = df[bool_columns].astype(int)

    id_parts = df['ID'].str.split('_')
    df['scene'] = id_parts.str[0]
    df['scene_sec'] = id_parts.str[1].astype(int)

    # Remember the incoming order so it can be restored at the end.
    df['ori_idx'] = df.index

    # Sort chronologically within each scene; diff/rolling below rely on it.
    df = df.sort_values(by=['scene', 'scene_sec'])

    # 1. Scene-level features.
    df['scene_count'] = df['scene'].map(df.groupby('scene').size())
    # transform('min') stays index-aligned, so this does not depend on the
    # output ordering of groupby.apply.
    df['scene_sec_from_zero'] = df['scene_sec'] - df.groupby('scene')['scene_sec'].transform('min')
    df['scene_sec_rank'] = df.groupby('scene')['scene_sec'].rank(method='first').astype(int)

    # extend (not append) so num_cols stays a flat list of column names.
    num_cols.extend(['scene_sec', 'scene_count', 'scene_sec_from_zero', 'scene_sec_rank'])

    # 2. Steering angle: degrees -> radians.
    df["steeringAngleRad"] = np.deg2rad(df["steeringAngleDeg"])
    num_cols.append("steeringAngleRad")

    # 3. Trigonometric features.
    df["steeringAngle_sin"] = np.sin(df["steeringAngleRad"])
    df["steeringAngle_cos"] = np.cos(df["steeringAngleRad"])
    num_cols.extend(["steeringAngle_sin", "steeringAngle_cos"])

    # 4. Interaction features.
    df["speed_steering"] = df["vEgo"] * df["steeringAngleRad"]  # speed x steering angle
    df["acc_steeringTorque"] = df["aEgo"] * df["steeringTorque"]  # acceleration x steering torque
    num_cols.extend(["speed_steering", "acc_steeringTorque"])

    # 5. Log transform; negative speeds are clipped to 0 and a small epsilon
    #    avoids log(0). 'vEgo_positive' is an intermediate and is not listed
    #    in num_cols.
    df["vEgo_positive"] = df["vEgo"].clip(lower=0) + 1e-6
    df["log_vEgo"] = np.log(df["vEgo_positive"])
    num_cols.append("log_vEgo")

    # 6. Change of acceleration between consecutive rows of a scene
    #    (plain difference; not divided by the time step).
    df["jerk"] = df.groupby("scene")["aEgo"].diff()
    num_cols.append("jerk")

    # 7. Change of steering angle / torque between consecutive rows of a scene.
    df["steeringAngleRate"] = df.groupby("scene")["steeringAngleRad"].diff()
    df["steeringTorqueRate"] = df.groupby("scene")["steeringTorque"].diff()
    num_cols.extend(["steeringAngleRate", "steeringTorqueRate"])

    # 8. Squared features.
    df["vEgo_squared"] = df["vEgo"] ** 2
    df["steeringAngleRad_squared"] = df["steeringAngleRad"] ** 2
    df["aEgo_squared"] = df["aEgo"] ** 2
    num_cols.extend(["vEgo_squared", "steeringAngleRad_squared", "aEgo_squared"])

    # 9. Rolling mean over the current and previous row within a scene.
    #    reset_index(0, drop=True) drops the 'scene' level so the result
    #    aligns with df's index on assignment.
    df["vEgo_roll_mean"] = df.groupby("scene")["vEgo"].rolling(window=2, min_periods=1).mean().reset_index(0, drop=True)
    df["aEgo_roll_mean"] = df.groupby("scene")["aEgo"].rolling(window=2, min_periods=1).mean().reset_index(0, drop=True)
    num_cols.extend(["vEgo_roll_mean", "aEgo_roll_mean"])

    # Restore the incoming order and drop the helper column.
    df = df.sort_values('ori_idx').reset_index(drop=True)
    df = df.drop('ori_idx', axis=1)

    return df, num_cols

In [7]:
# Run the shared preprocessing on train and test (the returned column list is
# not needed here), show the result, and dump the first 50 rows to tmp.csv
# for manual inspection.
train_df, _ = common_preprocess(raw_train_df)
test_df, _ = common_preprocess(raw_test_df)
display(train_df)
train_df.iloc[:50].to_csv('tmp.csv')

Unnamed: 0,ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brakePressed,gas,gasPressed,gearShifter,leftBlinker,...,vEgo_positive,log_vEgo,jerk,steeringAngleRate,steeringTorqueRate,vEgo_squared,steeringAngleRad_squared,aEgo_squared,vEgo_roll_mean,aEgo_roll_mean
0,00066be8e20318869c38c66be466631a_320,5.701526e+00,1.538456e+00,-2.165777,-139.0,0,0.250,1,drive,0,...,5.701527,1.740734,,,,3.250740e+01,0.001429,2.366846e+00,5.701526e+00,1.538456e+00
1,00066be8e20318869c38c66be466631a_420,1.117629e+01,2.798807e-01,-11.625697,-44.0,0,0.000,0,drive,0,...,11.176293,2.413795,-1.258575e+00,-0.165107,95.0,1.249095e+02,0.041171,7.833322e-02,8.438909e+00,9.091682e-01
2,00066be8e20318869c38c66be466631a_520,1.047255e+01,2.310992e-01,-2.985105,-132.0,0,0.180,1,drive,0,...,10.472549,2.348757,-4.878157e-02,0.150807,-88.0,1.096743e+02,0.002714,5.340682e-02,1.082442e+01,2.554899e-01
3,000fb056f97572d384bae4f5fc1e0f28_120,6.055565e+00,-1.177754e-01,7.632668,173.0,0,0.000,0,drive,0,...,6.055566,1.800978,-1.394509e+00,0.686929,287.0,3.666987e+01,0.017746,1.387104e-02,4.686155e+00,5.794789e-01
4,000fb056f97572d384bae4f5fc1e0f28_20,3.316744e+00,1.276733e+00,-31.725477,-114.0,0,0.255,1,drive,0,...,3.316745,1.198984,,,,1.100079e+01,0.306599,1.630048e+00,3.316744e+00,1.276733e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43366,fff88cca5f8a012427b96bdde66011e3_20,-0.000000e+00,-0.000000e+00,16.102747,-1.0,1,0.000,0,drive,0,...,0.000001,-13.815511,,,,0.000000e+00,0.078987,0.000000e+00,-0.000000e+00,-0.000000e+00
43367,fff88cca5f8a012427b96bdde66011e3_220,1.594968e+00,3.501294e-01,-5.791823,-97.0,0,0.000,0,drive,0,...,1.594969,0.466854,3.501294e-01,-0.382826,-96.0,2.543922e+00,0.010218,1.225906e-01,7.974839e-01,1.750647e-01
43368,fff88cca5f8a012427b96bdde66011e3_320,-4.207162e-03,-3.783329e-02,-2.089301,0.0,1,0.000,0,drive,0,...,0.000001,-13.815511,-3.879627e-01,0.064621,97.0,1.770021e-05,0.001330,1.431358e-03,7.953803e-01,1.561481e-01
43369,fff88cca5f8a012427b96bdde66011e3_420,-2.162400e-10,-1.943228e-09,-2.170106,1.0,1,0.000,0,drive,0,...,0.000001,-13.815511,3.783329e-02,-0.001410,1.0,4.675973e-20,0.001435,3.776137e-18,-2.103581e-03,-1.891665e-02


In [8]:
# Add within-scene shift (lag/lead) and difference features.
def make_shift_feature(target_df, use_feat_columns, shift_count=5, shift_diff_count=None):
    '''
    Add lag/lead copies of ``use_feat_columns`` and their differences to the
    current row, computed within each scene in ``scene_sec`` order.

    Parameters
    ----------
    target_df : DataFrame with ``scene`` and ``scene_sec`` columns.
    use_feat_columns : columns to shift.
    shift_count : shifts ``-shift_count..shift_count`` (0 excluded) are created
        as ``<col>_shift<k>``.
    shift_diff_count : differences ``<col>_diff<k> = col - col.shift(k)`` are
        created for ``-shift_diff_count..shift_diff_count`` (0 excluded).
        Defaults to ``shift_count``.

    Returns
    -------
    (df, shift_feat_columns)
        ``df`` has the rows ordered by the input index labels with a fresh
        0..n-1 index; ``shift_feat_columns`` lists the new column names (all
        shift columns first, then all diff columns). The input frame is not
        modified.
    '''
    if shift_diff_count is None:
        shift_diff_count = shift_count
    shift_range = [s for s in range(-shift_count, shift_count + 1) if s != 0]
    shift_diff_range = [s for s in range(-shift_diff_count, shift_diff_count + 1) if s != 0]

    # assign() returns a new frame, so the caller's frame does not get a
    # leftover 'ori_idx' column.
    work_df = target_df.assign(ori_idx=target_df.index)
    work_df = work_df.sort_values(['scene', 'scene_sec']).reset_index(drop=True)
    by_scene = work_df.groupby('scene')

    shift_feat_columns = []
    for shift in shift_range:
        for col in use_feat_columns:
            shift_col = f'{col}_shift{shift}'
            work_df[shift_col] = by_scene[col].shift(shift)
            shift_feat_columns.append(shift_col)

    for shift in shift_diff_range:
        for col in use_feat_columns:
            diff_col = f'{col}_diff{shift}'
            # Difference against the SAME column shifted by the SAME amount.
            # Recomputed here so it never depends on a variable left over
            # from the loop above.
            work_df[diff_col] = work_df[col] - by_scene[col].shift(shift)
            shift_feat_columns.append(diff_col)

    # Restore the incoming order and drop the helper column.
    work_df = work_df.sort_values('ori_idx').reset_index(drop=True)
    work_df = work_df.drop('ori_idx', axis=1)

    return work_df, shift_feat_columns

In [9]:
# Add shift/diff features for the raw vehicle-state columns to train and test.
# The same column list is used for both, so shift_columns ends up identical.
train_df, shift_columns = make_shift_feature(train_df, ['vEgo', 'aEgo', 'steeringAngleDeg', 'steeringTorque', 'brakePressed', 'gas', 'gasPressed',  'leftBlinker', 'rightBlinker'])
test_df, shift_columns = make_shift_feature(test_df, ['vEgo', 'aEgo', 'steeringAngleDeg', 'steeringTorque', 'brakePressed', 'gas', 'gasPressed',  'leftBlinker', 'rightBlinker'])

# Show the result and dump the first 50 rows to tmp.csv for manual inspection.
display(train_df)
train_df.iloc[:50].to_csv('tmp.csv')

Unnamed: 0,ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brakePressed,gas,gasPressed,gearShifter,leftBlinker,...,rightBlinker_diff4,vEgo_diff5,aEgo_diff5,steeringAngleDeg_diff5,steeringTorque_diff5,brakePressed_diff5,gas_diff5,gasPressed_diff5,leftBlinker_diff5,rightBlinker_diff5
0,00066be8e20318869c38c66be466631a_320,5.701526e+00,1.538456e+00,-2.165777,-139.0,0,0.250,1,drive,0,...,,,,,,,,,,
1,00066be8e20318869c38c66be466631a_420,1.117629e+01,2.798807e-01,-11.625697,-44.0,0,0.000,0,drive,0,...,,,,,,,,,,
2,00066be8e20318869c38c66be466631a_520,1.047255e+01,2.310992e-01,-2.985105,-132.0,0,0.180,1,drive,0,...,,,,,,,,,,
3,000fb056f97572d384bae4f5fc1e0f28_120,6.055565e+00,-1.177754e-01,7.632668,173.0,0,0.000,0,drive,0,...,,,,,,,,,,
4,000fb056f97572d384bae4f5fc1e0f28_20,3.316744e+00,1.276733e+00,-31.725477,-114.0,0,0.255,1,drive,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43366,fff88cca5f8a012427b96bdde66011e3_20,-0.000000e+00,-0.000000e+00,16.102747,-1.0,1,0.000,0,drive,0,...,,,,,,,,,,
43367,fff88cca5f8a012427b96bdde66011e3_220,1.594968e+00,3.501294e-01,-5.791823,-97.0,0,0.000,0,drive,0,...,,,,,,,,,,
43368,fff88cca5f8a012427b96bdde66011e3_320,-4.207162e-03,-3.783329e-02,-2.089301,0.0,1,0.000,0,drive,0,...,,,,,,,,,,
43369,fff88cca5f8a012427b96bdde66011e3_420,-2.162400e-10,-1.943228e-09,-2.170106,1.0,1,0.000,0,drive,0,...,,,,,,,,,,


In [19]:
# Add traffic-light features.
def add_traffic_light_feature(
        train_df: pd.DataFrame,
        test_df: pd.DataFrame
    ) -> "tuple[pd.DataFrame, pd.DataFrame, List[str]]":
    '''
    Left-join per-ID traffic-light features onto train and test.

    Steps
    -----
    - read ``traffic_light.csv`` and derive bbox centre, aspect ratio and area
    - keep only detections with ``bbox_area < 30``
    - count detections per ID (``n_traffic_lights``)
    - for each ID keep the single largest-area bbox and describe it by its
      geometry, the number of distinct classes on it (``n_signs``) and one
      0/1 ``sign_<class>`` flag per class

    Returns
    -------
    (train_df, test_df, traffic_columns)
        The merged frames and the list of added feature column names.
    '''
    traffic_lights_df = pd.read_csv(CREATED_DATA_DIR / 'data0005' / 'traffic_light.csv')

    # 'class' is an awkward column name, so rename it to 'class_name'.
    traffic_lights_df = traffic_lights_df.rename(columns={'class': 'class_name'})

    # Vectorised bbox geometry (columns bbox_0..bbox_3 are used as x0, y0, x1, y1).
    bbox_width = traffic_lights_df['bbox_2'] - traffic_lights_df['bbox_0']
    bbox_height = traffic_lights_df['bbox_3'] - traffic_lights_df['bbox_1']
    traffic_lights_df['bbox_c_x'] = (traffic_lights_df['bbox_2'] + traffic_lights_df['bbox_0']) / 2
    traffic_lights_df['bbox_c_y'] = (traffic_lights_df['bbox_3'] + traffic_lights_df['bbox_1']) / 2
    traffic_lights_df['bbox_aspect'] = bbox_width / bbox_height
    traffic_lights_df['bbox_area'] = bbox_width * bbox_height

    # Drop detections whose area is 30 or more.
    traffic_lights_df = traffic_lights_df.query('bbox_area < 30').reset_index(drop=True)

    # Number of detections per ID.
    tl_count = traffic_lights_df.groupby(['ID']).size().reset_index().rename(columns={0: 'n_traffic_lights'})
    traffic_lights_df = traffic_lights_df.merge(tl_count, on='ID')

    # Stringify the bbox so that one physical light can be used as a group key.
    traffic_lights_df['bbox_str'] = [
        f'[{b0:.3f}, {b1:.3f}, {b2:.3f}, {b3:.3f}]'
        for b0, b1, b2, b3 in zip(
            traffic_lights_df['bbox_0'], traffic_lights_df['bbox_1'],
            traffic_lights_df['bbox_2'], traffic_lights_df['bbox_3'])
    ]

    # Sort by id, bbox, class.
    traffic_lights_df = traffic_lights_df.sort_values(by=['ID', 'bbox_str', 'class_name'])

    # Set of classes detected on each light.
    same_tl_df = traffic_lights_df.groupby(['ID', 'bbox_str'])['class_name'].unique().reset_index().rename(columns={'class_name': 'class_unique'})

    # Number of distinct classes detected on each light.
    same_tl_size_df = traffic_lights_df.groupby(['ID', 'bbox_str'])['class_name'].nunique().reset_index().rename(columns={'class_name': 'n_signs'})

    # Merge the per-light information back (merge resets the index to 0..n-1).
    traffic_lights_df = traffic_lights_df.merge(same_tl_df, on=['ID', 'bbox_str'])
    traffic_lights_df = traffic_lights_df.merge(same_tl_size_df, on=['ID', 'bbox_str'])

    # Keep only the largest-area light per ID. method='first' guarantees that
    # exactly one row per ID gets rank 1 even when areas tie; the default
    # average rank would select several (or no) rows on ties and duplicate
    # rows in the merges below.
    area_df = traffic_lights_df.drop_duplicates(['ID', 'bbox_str'])[['ID', 'bbox_str', 'bbox_area']]
    area_rank = area_df.groupby('ID')['bbox_area'].rank(method='first', ascending=False)
    traffic_lights_df = traffic_lights_df.loc[area_rank[area_rank == 1].index]

    # Keep only the columns that become features.
    tl_feature_df = traffic_lights_df[['ID', 'bbox_c_x', 'bbox_c_y', 'bbox_aspect', 'bbox_area', 'n_signs', 'n_traffic_lights', 'class_unique']].copy()
    # One flag per class for the largest-area light.
    sign_columns = ['green', 'yellow', 'red', 'straight', 'left', 'right', 'empty', 'other']
    tl_feature_df = tl_feature_df.reset_index(drop=True)

    # Mark which classes are present.
    tl_feature_df[[f'sign_{c}' for c in sign_columns]] = 0
    for i, row in tqdm(tl_feature_df.iterrows(), total=len(tl_feature_df)):
        for class_name in row['class_unique']:
            tl_feature_df.loc[i, f'sign_{class_name}'] = 1
    tl_feature_df = tl_feature_df.drop('class_unique', axis=1)

    traffic_columns = [c for c in tl_feature_df.columns if c != 'ID']

    train_df = pd.merge(train_df, tl_feature_df, on='ID', how='left')
    test_df = pd.merge(test_df, tl_feature_df, on='ID', how='left')
    return train_df, test_df, traffic_columns

In [21]:
# Merge the traffic-light features into train/test and keep the new column names.
train_df, test_df, traffic_columns = add_traffic_light_feature(train_df, test_df)

100% 6930/6930 [00:01<00:00, 5748.91it/s]


In [23]:
# Add out-of-fold (OOF) predictions of other models as features.
def add_oof_feature(
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        img_oof_paths: List[Path],
        img_submissions_paths: List[Path],
        oof_feature: bool = False
    ):
    '''
    Attach predictions of upstream models to train and test.

    For every (oof csv, submission csv) pair, the OOF predictions
    (``pred_<target>`` columns) are added to ``train_df`` and the submission
    predictions (``<target>`` columns) to ``test_df``, both under the names
    ``<experiment dir name>_<target>``. With ``oof_feature=True`` the
    ``feature_*`` columns found in the OOF file are added as well.

    Train rows are matched to OOF rows by ``ID``. Test rows are matched to
    submission rows by position (the submission file is read as-is).

    Returns
    -------
    (train_df, test_df, oof_feat_columns)
    '''
    assert len(img_oof_paths) == len(img_submissions_paths), 'len(img_oof_paths) != len(img_submissions_paths)'

    oof_feat_columns = []
    for img_oof_path, img_submission_path in zip(img_oof_paths, img_submissions_paths):
        img_oof_df = pd.read_csv(img_oof_path, index_col=0)
        img_oof_name = img_oof_path.parent.name

        _oof_feat_columns  = [f'{img_oof_name}_{c}' for c in TARGET_COLUMNS]
        pred_columns = [f'pred_{i}' for i in TARGET_COLUMNS]

        if oof_feature:
            feature_columns = [c for c in img_oof_df.columns if re.search('^feature_', c)]
            _oof_feat_columns += [f'{img_oof_name}_{c}' for c in feature_columns]
            pred_columns += feature_columns

        assert train_df.shape[0] == img_oof_df.shape[0], f'train_df.shape[0] ({train_df.shape[0]}) != img_oof_df.shape[0] ({img_oof_df.shape[0]})'

        # 'ID' may have been consumed as the index by index_col=0.
        if 'ID' not in img_oof_df.columns:
            img_oof_df = img_oof_df.reset_index()

        # Align OOF rows to train rows by ID instead of relying on train_df
        # happening to be ID-sorted with a 0..n-1 index.
        oof_by_id = img_oof_df.set_index('ID')[pred_columns]
        assert oof_by_id.index.is_unique, f'duplicated ID in {img_oof_path}'
        assert train_df['ID'].isin(oof_by_id.index).all(), f'train ID missing in {img_oof_path}'
        aligned_oof = oof_by_id.reindex(train_df['ID'])
        train_df[_oof_feat_columns] = pd.DataFrame(
            aligned_oof.to_numpy(), index=train_df.index, columns=_oof_feat_columns)

        img_submission_df = pd.read_csv(img_submission_path)
        target_columns = TARGET_COLUMNS.copy()
        if oof_feature:
            # The feature column names found in the OOF file are reused for
            # the submission file.
            target_columns += feature_columns

        # Submission rows are matched to test rows by position, so the
        # lengths must agree.
        assert test_df.shape[0] == img_submission_df.shape[0], f'test_df.shape[0] ({test_df.shape[0]}) != img_submission_df.shape[0] ({img_submission_df.shape[0]})'
        test_df[_oof_feat_columns] = pd.DataFrame(
            img_submission_df[target_columns].to_numpy(), index=test_df.index, columns=_oof_feat_columns)

        oof_feat_columns.extend(_oof_feat_columns)

    return train_df, test_df, oof_feat_columns


In [6]:
def add_feature_block(
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        num_columns: List[str] = [],
        agg_num_columns: List[str] = [],
        cat_label_columns: List[str] = [],
        cat_count_columns: List[str] = [],
        cat_te_columns: List[str] = [],
    ):
    '''
    Build the model feature matrices by running feature blocks.

    Stage 1 concatenates train and test and fits numeric, label-encoding and
    count-encoding blocks on the combined frame. Stage 2 runs a second list of
    blocks fitted on train only and applied to test; that list is currently
    empty. ``agg_num_columns`` and ``cat_te_columns`` are accepted but not used
    at the moment (the aggregation / target-encoding blocks are disabled).

    Returns
    -------
    (train_df, test_df, train_feat, test_feat)
        ``test_df`` comes back without the target columns and with a reset index.
    '''
    n_train_rows = len(train_df)

    # ---- stage 1: blocks fitted on train + test together ----
    combined_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

    shared_blocks = []
    shared_blocks.extend(NumericBlock(col) for col in num_columns)
    shared_blocks.extend(LabelEncodingBlock(col) for col in cat_label_columns)
    shared_blocks.extend(CountEncodingBlock(col) for col in cat_count_columns)
    combined_feat_df = run_block(combined_df, shared_blocks, is_fit=True)

    # ---- split the combined frames back into train / test ----
    train_df = combined_df.iloc[:n_train_rows]
    test_df = combined_df.iloc[n_train_rows:].drop(columns=TARGET_COLUMNS).reset_index(drop=True)
    train_feat = combined_feat_df.iloc[:n_train_rows]
    test_feat = combined_feat_df.iloc[n_train_rows:].reset_index(drop=True)

    # ---- stage 2: blocks fitted on train only, then applied to test ----
    split_blocks = []

    train_extra = run_block(train_df, split_blocks, is_fit=True)
    train_feat = pd.concat([train_feat, train_extra], axis=1)
    test_extra = run_block(test_df, split_blocks, is_fit=False)
    test_feat = pd.concat([test_feat, test_extra], axis=1)

    return train_df, test_df, train_feat, test_feat

In [7]:
## ====================================================

# GBDT model
class LightGBM:
    '''
    Thin wrapper around ``lightgbm.train`` with early stopping.

    Parameters
    ----------
    lgb_params : parameter dict passed to ``lightgbm.train``.
    save_dir : directory (``pathlib.Path``) used by ``save``.
    categorical_feature : forwarded to ``lightgbm.Dataset``.
    model_name : suffix used in the saved model file name.
    stopping_rounds : patience for the early-stopping callback.
    '''

    def __init__(
            self,
            lgb_params,
            save_dir=None,
            categorical_feature=None,
            model_name='lgb',
            stopping_rounds=50
        ) -> None:

        self.save_dir = save_dir
        self.lgb_params = lgb_params
        self.categorical_feature = categorical_feature

        # Used to distinguish saved model files.
        self.model_name = model_name

        self.stopping_rounds = stopping_rounds

    def fit(self, x_train, y_train, **fit_params) -> None:
        '''
        Train a booster.

        ``fit_params`` must contain ``eval_set=[(X_val, y_val)]``; only the
        first pair is used. All remaining keyword arguments are forwarded to
        ``lightgbm.train`` (e.g. ``num_boost_round``).
        '''
        # eval_set is consumed here; lightgbm.train does not accept it.
        X_val, y_val = fit_params.pop('eval_set')[0]

        train_dataset = lightgbm.Dataset(
            x_train, y_train, categorical_feature=self.categorical_feature)

        val_dataset = lightgbm.Dataset(
            X_val, y_val, categorical_feature=self.categorical_feature)

        self.model = lightgbm.train(
            params=self.lgb_params,
            train_set=train_dataset,
            valid_sets=[train_dataset, val_dataset],
            callbacks=[lightgbm.early_stopping(stopping_rounds=self.stopping_rounds,
                                            verbose=True),
                        lightgbm.log_evaluation(500)],
            **fit_params
        )

    def save(self, fold):
        '''Write the trained booster to ``save_dir/lgb_fold_<fold>_<model_name>.txt``.'''
        save_to = self.save_dir / f'lgb_fold_{fold}_{self.model_name}.txt'
        # Pass a plain string: not every lightgbm version accepts a Path here.
        self.model.save_model(str(save_to))

    def predict(self, x):
        '''Return the booster's predictions for ``x``.'''
        return self.model.predict(x)

    def predict_proba(self, x):
        '''
        Return the booster's predictions for ``x``.

        ``lightgbm.train`` returns a ``Booster``, which has no
        ``predict_proba``; for classification objectives ``Booster.predict``
        already yields probabilities, so delegate to it.
        '''
        return self.model.predict(x)


def get_model(
        cfg,
        model_name
    ):
    '''
    Build a ``LightGBM`` wrapper configured from ``cfg``.

    Reads ``boosting_type``, ``seed``, ``learning_rate``, ``num_leaves``,
    ``max_depth``, ``subsample``, ``colsample_bytree`` and
    ``min_data_in_leaf`` from ``cfg``. The optional key ``bagging_freq``
    (default 0) is read as well. Models are saved under ``SAVE_DIR / 'log'``,
    which is created if needed.
    '''
    lgb_params = {
        'objective': 'regression',
        'boosting_type': cfg.boosting_type,
        'verbose': -1,
        'n_jobs': 8,
        'seed': cfg.seed,
        'learning_rate': cfg.learning_rate,
        # 'num_class': CFG.num_class, # required for multiclass
        'metric': 'mae',
        'num_leaves': cfg.num_leaves,
        'max_depth': cfg.max_depth,
        'subsample': cfg.subsample,
        # NOTE: LightGBM only performs row subsampling when bagging_freq > 0.
        # With the default of 0 used here (the previous behaviour),
        # 'subsample' has no effect; set cfg.bagging_freq to enable it.
        'bagging_freq': cfg.get('bagging_freq', 0) or 0,
        'colsample_bytree': cfg.colsample_bytree,
        'min_data_in_leaf': cfg.min_data_in_leaf,
        'bagging_seed': cfg.seed,
        'feature_fraction_seed': cfg.seed,
        'drop_seed': cfg.seed,
    }
    save_log_dir = SAVE_DIR / 'log'
    save_log_dir.mkdir(exist_ok=True, parents=True)

    model = LightGBM(
                lgb_params=lgb_params,
                save_dir=save_log_dir,
                model_name=model_name
    )

    return model

def get_fit_params(cfg, model_name):
    """Return the keyword arguments forwarded to training.

    Both arguments are currently ignored; the result only sets a very large
    ``num_boost_round``.
    """
    return dict(num_boost_round=100000)

def get_result(result_df):
    '''
    Print and return the overall MAE of ``result_df``.

    ``result_df`` must contain the ground-truth ``TARGET_COLUMNS`` and the
    prediction columns ``pred_0 .. pred_{len(TARGET_COLUMNS) - 1}``, which are
    compared position by position.
    '''
    pred_cols = [f'pred_{i}' for i in range(len(TARGET_COLUMNS))]

    preds = result_df[pred_cols].values
    labels = result_df[TARGET_COLUMNS].values

    # Call the metric directly; there is no need to look it up through eval().
    best_score = mae(labels, preds)

    print(f'best_score: {best_score:<.4f}')
    return best_score

In [8]:
# Load the experiment configuration (path is relative to the working directory).
cfg = OmegaConf.load('config/config.yaml')

In [9]:
# Display the loaded configuration.
cfg

{'train': False, 'oof': False, 'wandb': False, 'debug': False, 'wandb_project': 'atmacup18_gbdt', 'seed': 77, 'n_folds': 5, 'use_traffic_light': False, 'use_epipolar': False, 'oof_ids': ['exp0002'], 'oof_v': True, 'oof_feature': False, 'oof_shift': False, 'boosting_type': 'gbdt', 'learning_rate': 0.01, 'num_leaves': 64, 'max_depth': -1, 'min_data_in_leaf': 64, 'subsample': 0.4, 'colsample_bytree': 0.4, 'hydra': {'run': {'dir': './'}, 'output_subdir': None, 'job_logging': {'version': 1, 'handlers': {'console': {'class': 'logging.StreamHandler', 'stream': 'ext://sys.stdout'}}, 'root': {'handlers': ['console']}, 'disable_existing_loggers': False}}}

In [10]:
# Notebook-local overrides: which upstream experiments provide OOF features,
# and whether those OOF columns are also shifted.
cfg.oof_ids = ['exp0002']
cfg.oof_shift = False

In [11]:
# Notebook-local overrides of the LightGBM hyper-parameters read by get_model.
cfg.num_leaves = 64
cfg.max_depth = -1
cfg.min_data_in_leaf = 64
cfg.colsample_bytree = 0.4
cfg.subsample = 0.4
cfg.boosting_type = 'gbdt' 

In [12]:
# The training cell below fits a single model: this fold is held out for
# validation and this target column is predicted.
fold = 0
target_column = 'x_5'

In [18]:
# Train and evaluate one LightGBM model for the (fold, target_column) chosen above.
seed_everything(cfg.seed)

y = raw_train_df[TARGET_COLUMNS]
train_with_fold_df = split_data(cfg, raw_train_df)

oof_predictions = np.zeros((raw_train_df.shape[0], len(TARGET_COLUMNS)))
test_predictions = np.zeros((raw_test_df.shape[0], len(TARGET_COLUMNS)))


# Boolean row masks for the held-out fold; they are applied by index label below.
train_indices = train_with_fold_df['fold'] != fold
valid_indices = train_with_fold_df['fold'] == fold

# preprocess
train_df, common_num_columns = common_preprocess(train_with_fold_df)
test_df, _ = common_preprocess(raw_test_df)

# traffic_light
# add_traffic_light_feature returns three values; the third is the list of
# feature columns it added.
traffic_columns = []
if cfg.use_traffic_light:
    train_df, test_df, traffic_columns = add_traffic_light_feature(train_df, test_df)

# oof
if cfg.oof_ids is not None and len(cfg.oof_ids) > 0:
    img_oof_paths = []
    img_submissions_paths = []
    for oof_id in cfg.oof_ids:
        if cfg.oof_feature:
            img_oof_paths.append(OUTPUT_DIR / 'exp' / oof_id / 'oof_feature.csv')
            img_submissions_paths.append(OUTPUT_DIR / 'exp' / oof_id / f'submission_feature_fold{fold}.csv')
        else:
            img_oof_paths.append(OUTPUT_DIR / 'exp' / oof_id / 'oof.csv')
            img_submissions_paths.append(OUTPUT_DIR / 'exp' / oof_id / f'submission_fold{fold}.csv')

    train_df, test_df, oof_feat_columns = add_oof_feature(
        train_df,
        test_df,
        img_oof_paths,
        img_submissions_paths,
        oof_feature=cfg.oof_feature
    )
else:
    oof_feat_columns = []

# shift
use_shift_columns = ['vEgo', 'aEgo', 'steeringAngleDeg', 'steeringTorque', 'brakePressed', 'gas', 'gasPressed',  'leftBlinker', 'rightBlinker']
if cfg.oof_shift:
    use_shift_columns += oof_feat_columns
train_df, shift_columns = make_shift_feature(train_df, use_shift_columns)
test_df, shift_columns = make_shift_feature(test_df, use_shift_columns)

# feature block
num_columns = ['vEgo', 'aEgo', 'steeringAngleDeg', 'steeringTorque', 'brakePressed', 'gas', 'gasPressed',  'leftBlinker', 'rightBlinker']
num_columns += common_num_columns
num_columns += oof_feat_columns
num_columns += shift_columns
# Use the columns actually produced by add_traffic_light_feature (empty when
# the traffic-light features are disabled).
num_columns += traffic_columns

agg_num_columns = ['vEgo', 'aEgo', 'steeringAngleDeg', 'steeringTorque', 'gas']

cat_label_columns = ['gearShifter']
cat_count_columns = []
cat_te_columns = []

train_df, test_df, train_feat, test_feat = add_feature_block(
    train_df,
    test_df,
    num_columns=num_columns,
    agg_num_columns=agg_num_columns,
    cat_label_columns=cat_label_columns,
    cat_count_columns=cat_count_columns,
    cat_te_columns=cat_te_columns
)

print(f'feature columns:', train_feat.columns)
print(f'num feature columns:', len(train_feat.columns))

# Only one target is trained here; a full run would loop over
# enumerate(TARGET_COLUMNS).

target_idx = TARGET_COLUMNS.index(target_column)
print(f'fold: {fold}, target_column: {target_column}')

x_train = train_feat.loc[train_indices]
x_valid = train_feat.loc[valid_indices]
y_train = train_df.loc[train_indices, target_column]
y_valid = train_df.loc[valid_indices, target_column]

model_name = f'lgb_{target_column}'
model = get_model(cfg, model_name)

fit_params = get_fit_params(cfg, model_name)

fit_params_fold = fit_params.copy()
fit_params_fold['eval_set'] = [(x_valid, y_valid)]

model.fit(x_train, y_train, **fit_params_fold)

valid_pred = model.predict(x_valid)
oof_predictions[valid_indices, target_idx] = valid_pred
test_predictions[:, target_idx] += model.predict(test_feat)
# Score against y_valid, which is selected with the same mask and from the
# same row order as x_valid, so labels and predictions are paired row by row.
score_fold = mae(y_valid.values, valid_pred)
print(f'fold: {fold}, score: {score_fold:<.4f}')

feature columns: Index(['vEgo', 'aEgo', 'steeringAngleDeg', 'steeringTorque', 'brake',
       'brakePressed', 'gas', 'gasPressed', 'leftBlinker', 'rightBlinker',
       'scene_sec', 'scene_count', 'exp0002_x_0', 'exp0002_y_0', 'exp0002_z_0',
       'exp0002_x_1', 'exp0002_y_1', 'exp0002_z_1', 'exp0002_x_2',
       'exp0002_y_2', 'exp0002_z_2', 'exp0002_x_3', 'exp0002_y_3',
       'exp0002_z_3', 'exp0002_x_4', 'exp0002_y_4', 'exp0002_z_4',
       'exp0002_x_5', 'exp0002_y_5', 'exp0002_z_5', 'vEgo_shift-1',
       'vEgo_diff-1', 'aEgo_shift-1', 'aEgo_diff-1',
       'steeringAngleDeg_shift-1', 'steeringAngleDeg_diff-1',
       'steeringTorque_shift-1', 'steeringTorque_diff-1', 'brake_shift-1',
       'brake_diff-1', 'brakePressed_shift-1', 'brakePressed_diff-1',
       'gas_shift-1', 'gas_diff-1', 'gasPressed_shift-1', 'gasPressed_diff-1',
       'leftBlinker_shift-1', 'leftBlinker_diff-1', 'rightBlinker_shift-1',
       'rightBlinker_diff-1', 'vEgo_shift1', 'vEgo_diff1', 'aEgo_shift1',


In [None]:
# Plot the 30 most important features of the trained booster, ranked by gain.
lightgbm.plot_importance(model.model, figsize=(8,20), max_num_features=30, importance_type='gain')