In [1]:
import gc
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Display every column of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [2]:
from lightgbm.sklearn import LGBMRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score

In [3]:
import warnings
from tqdm import tqdm
# Register tqdm's pandas integration; its progress bars are labelled 'pandas bar'.
tqdm.pandas(desc='pandas bar')
# NOTE(review): this hides every warning for the rest of the session, including
# ones that may point at real problems; consider a narrower filter.
warnings.filterwarnings('ignore')

In [15]:
def f1_score_eval(y_true, y_pred):
    """Weighted combination of the per-class F1 scores for classes 1, 2 and 3.

    The result is ``0.2 * F1(class 1) + 0.2 * F1(class 2) + 0.6 * F1(class 3)``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.

    Returns
    -------
    float
        The weighted F1 score.
    """
    # Pin the label order explicitly. With ``average=None`` alone, sklearn
    # scores the sorted union of the labels it finds in y_true / y_pred, so a
    # stray label (e.g. 0) or an absent class would shift which class each
    # weight below applies to, or make the indexing fail.
    scores = f1_score(y_true=y_true, y_pred=y_pred, labels=[1, 2, 3], average=None)
    return scores[0] * 0.2 + scores[1] * 0.2 + scores[2] * 0.6

In [25]:
def search_f1(label, oof, sub):
    """Grid-search two cut points on ``oof`` and use them to classify ``sub``.

    A continuous prediction ``p`` is turned into a class with thresholds
    ``t0 < t1``: ``p < t0`` -> 1, ``t0 <= p < t1`` -> 2, ``p >= t1`` -> 3.
    Candidate pairs are walked on a 0.01 grid, with ``t0`` starting at
    ``oof.min()`` and staying below 1.0, and ``t1`` starting one step above
    ``t0`` and staying below 3.0. Each pair is scored with ``f1_score_eval``
    against ``label``; the best pair is printed and applied to ``sub``.

    Parameters
    ----------
    label : array-like
        True class labels, aligned with ``oof``.
    oof : numpy.ndarray or pandas.Series
        Continuous out-of-fold predictions used to choose the thresholds.
    sub : numpy.ndarray or pandas.Series
        Continuous predictions to convert with the chosen thresholds.

    Returns
    -------
    Same container type as ``sub``
        Integer class labels (1, 2 or 3) for every element of ``sub``.

    Notes
    -----
    If no candidate pair scores above 0, both thresholds stay at 0.0.
    Every candidate pair triggers a full F1 evaluation over all of ``oof``,
    so the search cost grows with both the grid size and the number of rows.
    """

    def to_classes(values, t0, t1):
        # Half-open intervals: every value lands in exactly one class. Strict
        # comparisons on both sides would leave values equal to a threshold
        # with the invalid class 0.
        return 1 + (values >= t0).astype(int) + (values >= t1).astype(int)

    best = 0
    best_t0 = 0.0
    best_t1 = 0.0
    step = 0.01

    t0 = oof.min()
    while t0 < 1.0:
        t1 = t0 + step
        while t1 < 3.0:
            score = f1_score_eval(label, to_classes(oof, t0, t1))
            if score > best:
                best = score
                best_t0 = t0
                best_t1 = t1
            t1 += step
        t0 += step

    print('best_f1_score: {} | best_threshold: {}'.format(best, [best_t0, best_t1]))
    pred_sub = to_classes(sub, best_t0, best_t1)
    return pred_sub

In [4]:
# Load the traffic table from a pickle file and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
traffic = pd.read_pickle('data/traffic.pkl')
traffic.head()

Unnamed: 0,date,link_id,cur_time,pred_time,label,rec_speed_0,rec_speed_1,rec_speed_2,rec_speed_3,rec_speed_4,rec_eta_speed_0,rec_eta_speed_1,rec_eta_speed_2,rec_eta_speed_3,rec_eta_speed_4,rec_car_count_0,rec_car_count_1,rec_car_count_2,rec_car_count_3,rec_car_count_4,rec_label_0,rec_label_1,rec_label_2,rec_label_3,rec_label_4,his_speed_0_0,his_speed_0_1,his_speed_0_2,his_speed_0_3,his_speed_0_4,his_speed_1_0,his_speed_1_1,his_speed_1_2,his_speed_1_3,his_speed_1_4,his_speed_2_0,his_speed_2_1,his_speed_2_2,his_speed_2_3,his_speed_2_4,his_speed_3_0,his_speed_3_1,his_speed_3_2,his_speed_3_3,his_speed_3_4,his_eta_speed_0_0,his_eta_speed_0_1,his_eta_speed_0_2,his_eta_speed_0_3,his_eta_speed_0_4,his_eta_speed_1_0,his_eta_speed_1_1,his_eta_speed_1_2,his_eta_speed_1_3,his_eta_speed_1_4,his_eta_speed_2_0,his_eta_speed_2_1,his_eta_speed_2_2,his_eta_speed_2_3,his_eta_speed_2_4,his_eta_speed_3_0,his_eta_speed_3_1,his_eta_speed_3_2,his_eta_speed_3_3,his_eta_speed_3_4,his_car_count_0_0,his_car_count_0_1,his_car_count_0_2,his_car_count_0_3,his_car_count_0_4,his_car_count_1_0,his_car_count_1_1,his_car_count_1_2,his_car_count_1_3,his_car_count_1_4,his_car_count_2_0,his_car_count_2_1,his_car_count_2_2,his_car_count_2_3,his_car_count_2_4,his_car_count_3_0,his_car_count_3_1,his_car_count_3_2,his_car_count_3_3,his_car_count_3_4,his_label_0_0,his_label_0_1,his_label_0_2,his_label_0_3,his_label_0_4,his_label_1_0,his_label_1_1,his_label_1_2,his_label_1_3,his_label_1_4,his_label_2_0,his_label_2_1,his_label_2_2,his_label_2_3,his_label_2_4,his_label_3_0,his_label_3_1,his_label_3_2,his_label_3_3,his_label_3_4
0,1,1049,258,288,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,43.1875,38.1875,33.59375,27.703125,23.5,38.5,38.5,39.40625,32.59375,32.59375,13.398438,13.398438,13.398438,28.90625,39.3125,30.0,30.0,33.1875,30.0,30.0,23.0,23.0,25.90625,22.90625,17.59375,30.09375,30.09375,30.09375,30.09375,30.09375,11.296875,11.296875,11.296875,11.296875,9.601562,9.296875,9.296875,18.90625,17.0,17.0,1,1,2,3,2,1,1,1,1,1,3,3,3,3,1,1,1,1,2,2,1,1,1,1,1,1,1,1,1,1,2,2,2,1,1,0,0,1,0,0
1,1,1049,261,290,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,33.59375,27.703125,23.5,27.796875,27.796875,39.40625,32.59375,32.59375,32.59375,32.59375,13.398438,28.90625,39.3125,39.3125,39.3125,33.1875,30.0,30.0,7.699219,7.699219,25.90625,22.90625,17.59375,27.703125,27.703125,30.09375,30.09375,30.09375,30.09375,30.09375,11.296875,11.296875,9.601562,9.601562,9.601562,18.90625,17.0,17.0,17.0,17.0,2,3,2,3,3,1,1,1,1,1,3,3,1,1,1,1,2,2,2,2,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,0,0,3,3
2,1,1049,264,288,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,43.1875,38.1875,33.59375,27.703125,23.5,38.5,38.5,39.40625,32.59375,32.59375,13.398438,13.398438,13.398438,28.90625,39.3125,30.0,30.0,33.1875,30.0,30.0,23.0,23.0,25.90625,22.90625,17.59375,30.09375,30.09375,30.09375,30.09375,30.09375,11.296875,11.296875,11.296875,11.296875,9.601562,9.296875,9.296875,18.90625,17.0,17.0,1,1,2,3,2,1,1,1,1,1,3,3,3,3,1,1,1,1,2,2,1,1,1,1,1,1,1,1,1,1,2,2,2,1,1,0,0,1,0,0
3,1,1049,266,290,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,33.59375,27.703125,23.5,27.796875,27.796875,39.40625,32.59375,32.59375,32.59375,32.59375,13.398438,28.90625,39.3125,39.3125,39.3125,33.1875,30.0,30.0,7.699219,7.699219,25.90625,22.90625,17.59375,27.703125,27.703125,30.09375,30.09375,30.09375,30.09375,30.09375,11.296875,11.296875,9.601562,9.601562,9.601562,18.90625,17.0,17.0,17.0,17.0,2,3,2,3,3,1,1,1,1,1,3,3,1,1,1,1,2,2,2,2,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,0,0,3,3
4,1,1049,272,290,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,33.59375,27.703125,23.5,27.796875,27.796875,39.40625,32.59375,32.59375,32.59375,32.59375,13.398438,28.90625,39.3125,39.3125,39.3125,33.1875,30.0,30.0,7.699219,7.699219,25.90625,22.90625,17.59375,27.703125,27.703125,30.09375,30.09375,30.09375,30.09375,30.09375,11.296875,11.296875,9.601562,9.601562,9.601562,18.90625,17.0,17.0,17.0,17.0,2,3,2,3,3,1,1,1,1,1,3,3,1,1,1,1,2,2,2,2,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,0,0,3,3


In [5]:
# Load the link attribute table (one row per link_id in the preview below)
# and show its first rows.
attr = pd.read_csv('data/attr.csv')
attr.head()

Unnamed: 0,link_id,length,direction,path_class,speed_class,lane_num,speed_limit,level,width
0,0,19,1,5,7,1,4.168,5,30
1,1,19,1,5,7,1,4.168,5,30
2,2,16,1,5,7,1,4.168,5,30
3,3,16,1,5,7,1,4.168,5,30
4,4,17,1,5,7,1,4.168,5,30


In [6]:
# Keep only date values 26 through 31, attach the link attributes by link_id,
# and add the gap between pred_time and cur_time as a new column.
traffic = (
    traffic.loc[traffic['date'].isin([26, 27, 28, 29, 30, 31])]
    .reset_index(drop=True)
    .merge(attr, how='left', on='link_id')
    .assign(time_diff=lambda frame: frame['pred_time'] - frame['cur_time'])
)

In [7]:
# Positive labels form the training set; negative labels mark the rows that
# are predicted later. Rows with label == 0 end up in neither frame.
df_train = traffic.query('label > 0').reset_index(drop=True)
df_test = traffic.query('label < 0').reset_index(drop=True)

In [8]:
# The source frames are no longer needed once df_train / df_test exist;
# drop the references and run a garbage-collection pass.
del traffic, attr
gc.collect()

7

In [9]:
# Model inputs: every column except 'date' and the target 'label'.
# 'weight' is also excluded when present. A later cell adds it to df_train
# (derived from the label), so re-running this cell after that one would
# otherwise turn it into a feature that df_test does not have.
feats = df_train.columns.drop(['date', 'label']).drop(['weight'], errors='ignore')
# Columns handed to LightGBM as categorical features.
category_feats = ['link_id', 'direction']

In [10]:
# Per-label sample weight: n_rows / (3 * n_rows_with_that_label), so labels
# with fewer rows receive a larger weight.
label_counts = df_train['label'].value_counts()
class_weight = dict(len(df_train) / (3 * label_counts))
df_train['weight'] = df_train['label'].map(class_weight)

In [11]:
# Out-of-fold predictions (one slot per training row) and the fold-averaged
# test predictions (one slot per test row).
oof = np.zeros(len(df_train))
sub = np.zeros(len(df_test))
# Running average of feature importances, one row per feature.
feat_imp_df = pd.DataFrame({'feat': feats}).assign(imp=0.0)
# 5-fold splitter that keeps each group inside a single fold.
gkf = GroupKFold(n_splits=5)

In [12]:
# Sanity check: report the (rows, columns) of the train and test frames.
print('train shape {} test shape {}'.format(df_train.shape, df_test.shape))

train shape (2479215, 115) test shape (504891, 114)


In [4]:
# NOTE(review): this cell's execution count (In [4]) repeats an earlier count,
# so it was run in a different kernel session than the surrounding cells.
# The training loop below builds its own LGBMRegressor for every fold and
# rebinds `clf`, so this instance is never fitted in the visible notebook;
# consider removing the cell.
clf = LGBMRegressor(
        num_leaves=63,
        learning_rate=0.1,
        n_estimators=100000,
        subsample=0.6,
        colsample_bytree=0.6,
        random_state=2020,
        n_jobs=24,
    )

In [5]:
# NOTE(review): leftover interactive help lookup (IPython `?` syntax); it only
# prints the signature of clf.fit and can be removed from the final notebook.
clf.fit?

[0;31mSignature:[0m
[0mclf[0m[0;34m.[0m[0mfit[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mX[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0my[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msample_weight[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minit_score[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0meval_set[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0meval_names[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0meval_sample_weight[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0meval_init_score[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0meval_metric[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mearly_stopping_rounds[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfeature_name[0m[0;34m=[

In [13]:
# Re-create the accumulators here so this cell can be re-run on its own.
# Otherwise a second run would add to the totals left by the first one, and
# the importances would be added to feat_imp_df in whatever row order the
# plotting cell left behind after sorting it.
oof = np.zeros(df_train.shape[0])
sub = np.zeros(df_test.shape[0])
feat_imp_df = pd.DataFrame({'feat': feats, 'imp': 0.0})

# Group key: all rows sharing one (date, link_id) pair stay in the same fold.
fold_groups = df_train.date.map(str) + '_' + df_train.link_id.map(str)
# The test matrix is identical for every fold, so build it once.
X_sub = df_test[feats]

for i, (trn_idx, val_idx) in enumerate(gkf.split(df_train, groups=fold_groups)):
    print('------------------------------{} fold------------------------------'.format(i))
    # Slice each partition once and take features / target / weights from it.
    trn_part = df_train.iloc[trn_idx]
    val_part = df_train.iloc[val_idx]
    X_trn, Y_trn, W_trn = trn_part[feats], trn_part.label, trn_part.weight
    X_val, Y_val, W_val = val_part[feats], val_part.label, val_part.weight

    # A fresh model per fold; n_estimators is only an upper bound because
    # training is cut short by early stopping on the validation fold.
    clf = LGBMRegressor(
        objective='rmse',
        num_leaves=63,
        learning_rate=0.1,
        n_estimators=100000,
        subsample=0.6,
        colsample_bytree=0.6,
        random_state=2020,
        n_jobs=24,
    )

    # NOTE(review): `early_stopping_rounds` and `verbose` are accepted by the
    # LightGBM version this notebook was run with (see the clf.fit signature
    # printed above); newer releases expect callbacks instead. Confirm before
    # upgrading.
    clf.fit(
        X_trn, Y_trn,
        sample_weight=W_trn,
        eval_set=[(X_val, Y_val)],
        eval_sample_weight=[W_val],
        early_stopping_rounds=200,
        categorical_feature=category_feats,
        verbose=500,
    )

    # Validation rows get this fold's prediction; the test prediction and
    # the feature importances are averaged over all folds.
    oof[val_idx] = clf.predict(X_val)
    sub += clf.predict(X_sub) / gkf.n_splits
    feat_imp_df['imp'] += clf.feature_importances_ / gkf.n_splits

------------------------------0 fold------------------------------
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[52]	valid_0's rmse: 0.543871
------------------------------1 fold------------------------------
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[52]	valid_0's rmse: 0.544061
------------------------------2 fold------------------------------
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[89]	valid_0's rmse: 0.544151
------------------------------3 fold------------------------------
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[66]	valid_0's rmse: 0.548514
------------------------------4 fold------------------------------
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[52]	valid_0's rmse: 0.543226


In [None]:
# Choose class thresholds on the out-of-fold predictions and apply them to the
# averaged test predictions to get the submission labels.
pred_sub = search_f1(df_train.label, oof, sub)

In [None]:
# Horizontal bar chart of the fold-averaged feature importances, sorted by
# importance, saved to imp.png.
fig, ax = plt.subplots(figsize=(15, 30))
feat_imp_df = feat_imp_df.sort_values(by='imp', ignore_index=True)
sns.barplot(data=feat_imp_df, x='imp', y='feat', ax=ax)
fig.savefig('imp.png')

In [None]:
# Build the submission table from the test keys plus the predicted labels and
# write it to sub.csv without the index column.
submission = df_test[['link_id', 'cur_time', 'pred_time']].rename(
    columns={
        'link_id': 'link',
        'cur_time': 'current_slice_id',
        'pred_time': 'future_slice_id',
    }
)
submission['label'] = pred_sub
submission.to_csv('sub.csv', index=False)