In [51]:
import pandas as pd
import numpy as np
import datetime as dt
import lightgbm as lgb
import pycaret
from pycaret.regression import *
import optuna
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn import tree

# Silence every Python warning raised from here on.
# NOTE(review): this blanket filter also hides pandas / scikit-learn warnings
# that may point at real problems; consider narrowing it by category.
warnings.filterwarnings('ignore')
# Only emit Optuna log messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [52]:
# Columns handed to the experiment setup: those to leave out of modelling,
# and those to be treated as categorical.
ignore_features = ['datetime', 'is_congestion_tomorrow', 'date']
categorical_features = ['day_of_week', 'tomorrow_holiday', 'yesterday_holiday']

# Load the merged training table and make `date` a real datetime column so it
# can be compared against the split date below.
df = pd.read_csv('train_merged.csv')
df['date'] = pd.to_datetime(df['date'])

# Chronological hold-out: rows dated before 2023-01-01 are used for training,
# rows dated on or after it are kept aside for testing.
split_date = dt.datetime(2023, 1, 1)
df_train = df.loc[df['date'] < split_date]
df_test = df.loc[df['date'] >= split_date]

In [53]:
# Initialise the PyCaret regression experiment on the pre-2023 rows,
# predicting `speed_diff`.
exp = setup(
    df_train,
    target='speed_diff',
    session_id=123,
    use_gpu=False,
    # Feature handling.
    ignore_features=ignore_features,
    categorical_features=categorical_features,
    # Hold-out split: 80% of the rows for training, taken without shuffling.
    train_size=0.8,
    data_split_shuffle=False,
    # Cross-validation: 4 time-series folds, also without shuffling.
    fold_strategy='timeseries',
    fold=4,
    fold_shuffle=False,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,speed_diff
2,Target type,Regression
3,Original data shape,"(4344912, 39)"
4,Transformed data shape,"(4344912, 39)"
5,Transformed train set shape,"(3475929, 39)"
6,Transformed test set shape,"(868983, 39)"
7,Ignore features,6
8,Ordinal features,2
9,Numeric features,29


In [54]:
# Train a CatBoost regressor inside the experiment configured by setup();
# the per-fold metric table is displayed as the cell output.
catboost = create_model('catboost')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.9383,19.7872,4.4483,0.5444,0.5904,4.9017
1,1.8833,18.9299,4.3509,0.6096,0.5738,5.7984
2,1.6335,15.479,3.9343,0.6395,0.5307,3.4999
3,1.5129,8.957,2.9928,0.6695,0.5188,3.7193
Mean,1.742,15.7883,3.9316,0.6158,0.5534,4.4798
Std,0.1752,4.261,0.5753,0.0463,0.0296,0.9294


In [55]:
# Score the held-out rows in df_test and keep only the predicted target
# column (a Series of predicted speed_diff values).
predict = predict_model(catboost, data=df_test)['prediction_label']

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,1.9076,19.0261,4.3619,0.6275,0.5702,4.0429


In [56]:
# Speed below which a row is labelled as congested (same units as `speed`).
CONGESTION_SPEED_THRESHOLD = 40

# `predict` arrives as the Series of predicted speed_diff values. Convert it to
# a frame only once, so that re-running this cell does not fail on a DataFrame
# (which has no `to_frame`).
if isinstance(predict, pd.Series):
    predict = predict.to_frame('speed_diff')

# Take an explicit copy: adding columns to a slice of df_test is the classic
# chained-assignment pattern (SettingWithCopyWarning), which the global
# warnings filter in this notebook would otherwise hide.
answer = df_test[['speed', 'speed_diff']].copy()

# Reconstruct the speed implied by the predicted / actual speed_diff.
# The additions align on the row index shared with df_test.
predict['predict_speed'] = df_test['speed'] + predict['speed_diff']
answer['answer_speed'] = answer['speed'] + answer['speed_diff']

# Binary congestion label: 1 when the reconstructed speed is below the threshold.
predict['is_congestion'] = (predict['predict_speed'] < CONGESTION_SPEED_THRESHOLD).astype(int)
answer['is_congestion'] = (answer['answer_speed'] < CONGESTION_SPEED_THRESHOLD).astype(int)

In [57]:
# Compare the congestion labels derived from the actual speeds (y_true) with
# those derived from the predicted speeds (y_pred).
y_true = answer['is_congestion']
y_pred = predict['is_congestion']

accuracy = accuracy_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Pin the label order so the matrix is always 2x2 and matches the printed
# TN/FP/FN/TP legend, even if one class is missing from the data.
print('TN  FP\nFN  TP')
print(confusion_matrix(y_true, y_pred, labels=[0, 1]))
print(f'Accuracy: {accuracy}')
print(f'Recall: {recall}')
print(f'Precision: {precision}')  # label was misspelled "Precisoin"
print(f'F1 score : {f1}')

TN  FP
FN  TP
[[1442680      65]
 [   5152     407]]
Accuracy: 0.9963978556988036
Recall: 0.07321460694369492
Precisoin: 0.8622881355932204
F1 score : 0.13496932515337423


In [58]:
# Open PyCaret's interactive evaluation widget for the trained model.
# The plots are chosen through the widget, so this output is not reproduced
# by a plain re-run of the notebook.
evaluate_model(catboost)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…