# mount drive

In [None]:
# `os` is imported unconditionally: a later cell reads/writes os.environ
# regardless of whether the notebook runs on Colab, so binding it only
# inside the branch below would raise NameError when the flag is False.
import os

# Toggle for running on Google Colaboratory with Google Drive as storage.
use_colaboratory = True
if use_colaboratory:
    # Kept inside the branch so the notebook does not need google.colab
    # when the flag is turned off.
    from google.colab import drive

    drive.mount('/content/drive')
    # All relative paths used below ("../input", "../output", "../submit")
    # are resolved from this notebook directory.
    os.chdir('/content/drive/MyDrive/kaggle/tabular-playground-series-mar-2022/notebook')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# import packages
import datetime
import math
import pickle

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import OneHotEncoder

%matplotlib inline

In [None]:
# Read the competition CSVs, using `row_id` as the index of both frames.
train, test = (
    pd.read_csv(csv_path, index_col="row_id")
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

In [None]:
# Stack train on top of test so features are built once for both,
# and parse the `time` column into datetimes.
data = pd.concat([train, test]).assign(
    time=lambda frame: pd.to_datetime(frame["time"])
)

In [None]:
# Calendar features derived from the timestamp.
timestamp = data["time"].dt
data["hour"] = timestamp.hour
data["minute"] = timestamp.minute
data["weekday"] = timestamp.weekday

# Boolean flags: before noon, and weekday index 5 or 6.
data["isAM"] = data["hour"].lt(12)
data["isweekend"] = data["weekday"].ge(5)

In [None]:
# Columns identifying one roadway (location + direction).
ld_col = ["x", "y", "direction"]

# Aggregate over the training rows only (the first len(train) rows of `data`);
# the remaining rows come from the test set, which was concatenated after train.
train_rows = data.iloc[:len(train)]

# Mean congestion per roadway and weekday / per roadway and hour.
# The Series are named explicitly so the merged column names do not depend
# on a suffix collision with the existing `congestion` column.
weekday_mean = (
    train_rows.groupby(ld_col + ["weekday"])["congestion"]
    .mean()
    .rename("congestion_weekday_mean")
)
hour_mean = (
    train_rows.groupby(ld_col + ["hour"])["congestion"]
    .mean()
    .rename("congestion_hour_mean")
)

# Drop columns left over from a previous execution of this cell so that
# re-running it does not produce clashing/duplicated feature columns.
data = data.drop(columns=[weekday_mean.name, hour_mean.name], errors="ignore")

# Left-join the aggregates back onto every row (train and test) by matching
# the key columns against the MultiIndex of each aggregate.
data = data.merge(weekday_mean, how="left", left_on=ld_col + ["weekday"], right_index=True)
data = data.merge(hour_mean, how="left", left_on=ld_col + ["hour"], right_index=True)
data.head()

Unnamed: 0_level_0,time,x,y,direction,congestion,hour,minute,weekday,isAM,isweekend,congestion_weekday_mean,congestion_hour_mean
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1991-04-01,0,0,EB,70.0,0,0,0,1,0,48.381551,40.708561
1,1991-04-01,0,0,NB,49.0,0,0,0,1,0,39.502621,38.258652
2,1991-04-01,0,0,SB,24.0,0,0,0,1,0,50.679245,42.615665
3,1991-04-01,0,1,EB,18.0,0,0,0,1,0,27.278302,23.601093
4,1991-04-01,0,1,NB,60.0,0,0,0,1,0,69.612159,67.551913


In [None]:
# Cut the combined frame back into its train and test parts; the first
# len(train) rows are the training rows.
n_train = len(train)
train, test = data.iloc[:n_train], data.iloc[n_train:]

In [None]:
# Distinct coordinate / direction values seen in the training data.
xs = train.x.unique().tolist()
ys = train.y.unique().tolist()
dirs = train.direction.unique().tolist()

feature_col = ["hour", "minute", "weekday", "isAM", "isweekend", "congestion_weekday_mean", "congestion_hour_mean"]

params = {
    "objective": "regression",
    "metric": "mae",
    "learning_rate": 0.05,
}

# Out-of-fold predictions, addressed by row position within `train`.
oof_train = np.zeros((len(train), ))
# Row positions (within `train`) that received an out-of-fold prediction.
oof_idx = []
# (x, y, direction) -> list of per-fold models / per-fold test predictions.
all_models = {}
all_y_preds = {}

# One set of models is trained per (x, y, direction) combination.
# The loop variable is called `direction` rather than `dir` so the builtin
# dir() is not shadowed for the rest of the kernel session.
for x in xs:
    for y in ys:
        for direction in dirs:
            loc_dir = (x, y, direction)
            print("======== loop: {} ========".format(loc_dir))

            # Build each selection mask once and reuse it.
            train_mask = (train.x == x) & (train.y == y) & (train.direction == direction)
            test_mask = (test.x == x) & (test.y == y) & (test.direction == direction)

            # Skip combinations of location & direction that do not exist.
            if not train_mask.any():
                print("{} is nothing".format(loc_dir))
                continue

            # Positions of the selected rows inside `train`. Out-of-fold values
            # are stored by position because they are read back positionally
            # (oof_train[oof_idx], train.iloc[oof_idx]); this avoids relying
            # on the row_id labels happening to equal the row positions.
            train_pos = np.flatnonzero(train_mask.to_numpy())

            locdir_X_train = train.loc[train_mask, feature_col]
            locdir_y_train = train.loc[train_mask, "congestion"]
            locdir_X_test = test.loc[test_mask, feature_col]

            # Time-ordered splits: each fold validates on rows that come after
            # the rows it was trained on.
            cv = TimeSeriesSplit(n_splits=5)

            y_preds = []
            models = []

            for fold_idx, (tr_idx, val_idx) in enumerate(cv.split(locdir_X_train)):
                print(f"======== fold: {fold_idx} ========")
                X_tr, X_val = locdir_X_train.iloc[tr_idx], locdir_X_train.iloc[val_idx]
                y_tr, y_val = locdir_y_train.iloc[tr_idx], locdir_y_train.iloc[val_idx]

                lgb_train = lgb.Dataset(X_tr, y_tr)
                lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

                # NOTE(review): `verbose_eval` / `early_stopping_rounds` are the
                # keyword form this notebook was written against; LightGBM 4.x
                # is understood to have replaced them with the
                # lgb.log_evaluation / lgb.early_stopping callbacks - confirm
                # against the installed version before upgrading.
                model = lgb.train(params, lgb_train,
                                  valid_sets=[lgb_train, lgb_eval],
                                  verbose_eval=10,
                                  num_boost_round=1000,
                                  early_stopping_rounds=10)

                # Map fold-local validation indices to positions in `train`.
                val_pos = train_pos[val_idx]
                oof_train[val_pos] = \
                    model.predict(X_val, num_iteration=model.best_iteration)

                oof_idx += val_pos.tolist()

                y_pred = model.predict(locdir_X_test, num_iteration=model.best_iteration)

                y_preds.append(y_pred)
                models.append(model)

            all_y_preds[loc_dir] = y_preds
            all_models[loc_dir] = models

Training until validation scores don't improve for 10 rounds.
[10]	training's l1: 8.3173	valid_1's l1: 9.76613
[20]	training's l1: 7.93986	valid_1's l1: 9.74217
Early stopping, best iteration is:
[16]	training's l1: 8.06932	valid_1's l1: 9.73462
Training until validation scores don't improve for 10 rounds.
[10]	training's l1: 9.01351	valid_1's l1: 9.8278
[20]	training's l1: 8.74597	valid_1's l1: 9.62383
[30]	training's l1: 8.58955	valid_1's l1: 9.52621
[40]	training's l1: 8.4766	valid_1's l1: 9.48188
[50]	training's l1: 8.40155	valid_1's l1: 9.46469
[60]	training's l1: 8.3439	valid_1's l1: 9.46032
[70]	training's l1: 8.30911	valid_1's l1: 9.46044
Early stopping, best iteration is:
[63]	training's l1: 8.33401	valid_1's l1: 9.45645
Training until validation scores don't improve for 10 rounds.
[10]	training's l1: 9.23507	valid_1's l1: 9.47297
[20]	training's l1: 8.96654	valid_1's l1: 9.23039
[30]	training's l1: 8.8323	valid_1's l1: 9.12418
[40]	training's l1: 8.74632	valid_1's l1: 9.07801

KeyboardInterrupt: ignored

In [None]:
# Overlay the distribution of out-of-fold predictions on the distribution
# of the true congestion values for the same rows.
plt.figure(figsize=(8, 4))
for values, series_label in (
    (oof_train[oof_idx], "oof"),
    (train.iloc[oof_idx].congestion, "y_train"),
):
    plt.hist(values, bins=100, label=series_label, alpha=0.5)
plt.legend()

In [None]:
# Validation L1 (MAE) of the last fold's model for every location/direction key.
scores = {
    loc_key: fold_models[-1].best_score["valid_1"]["l1"]
    for loc_key, fold_models in all_models.items()
}

# Unweighted mean of the per-key scores.
score = sum(scores.values()) / len(scores)

print("=== CV scores ===")
print(scores)
print(score)

In [None]:
# Persist the run's artifacts (test predictions, models, out-of-fold values
# and their row positions), one pickle file per object.
artifacts = (
    ("../output/all_y_preds_008.pickle", all_y_preds),
    ("../output/all_models_008.pickle", all_models),
    ("../output/oof_train_008.pickle", oof_train),
    ("../output/oof_idx_008.pickle", oof_idx),
)

for pickle_path, payload in artifacts:
    with open(pickle_path, "wb") as f:
        pickle.dump(payload, f)

In [None]:
# Independent copy of the test keys plus the (still empty) target column to fill in.
sub_pred = test[["x", "y", "direction", "congestion"]].copy()

In [None]:
# Write each location/direction's predictions into the matching rows.
# Only the last fold's predictions (y_preds[-1]) are used.
# `direction` is used as the loop name so the builtin dir() is not shadowed,
# and the mask is built from `sub_pred` itself, the frame being assigned to.
for (x, y, direction), y_preds in all_y_preds.items():
    loc_dir_mask = (sub_pred.x == x) & (sub_pred.y == y) & (sub_pred.direction == direction)
    sub_pred.loc[loc_dir_mask, "congestion"] = y_preds[-1]

In [None]:
# Build the submission file from the sample template; predictions are copied
# positionally, so `sub_pred` must be in the same row order as the template.
submission = pd.read_csv("../input/sample_submission.csv")

# Rows whose location/direction never received a prediction (e.g. training
# was interrupted part-way) still hold NaN; refuse to write such a file.
missing = int(sub_pred["congestion"].isna().sum())
if missing:
    raise ValueError(
        "{} test rows have no prediction; finish training before writing the submission".format(missing)
    )

submission["congestion"] = sub_pred["congestion"].to_numpy()
submission.to_csv("../submit/submission_008.csv", index=False)

In [None]:
# One bar chart per location/direction key, showing the normalised feature
# importance of that key's last-fold model.
for key, models in all_models.items():
    raw_importance = models[-1].feature_importance()
    importance_share = raw_importance / raw_importance.sum()

    df_importance = (
        pd.DataFrame({"feature": feature_col, "importance": importance_share})
        .sort_values("importance", ascending=False)
    )

    print("==========={}===========".format(key))
    plt.figure(figsize=(6, 3))
    plt.bar(x="feature", height="importance", data=df_importance)
    plt.xticks(rotation=90)
    plt.show()

### submit

In [None]:
import json
with open('../../kaggle.json') as f:
    json_data = json.load(f)
    os.environ["KAGGLE_USERNAME"] = json_data["username"]
    os.environ["KAGGLE_KEY"] = json_data["key"]

!pip install kaggle --upgrade
!kaggle -v

In [None]:
# Submit ../submit/submission_008.csv to the tabular-playground-series-mar-2022
# competition through the kaggle command-line tool, passing "008" via -m.
!kaggle competitions submit tabular-playground-series-mar-2022 -f ../submit/submission_008.csv -m "008"