In [62]:
import pandas as pd
import numpy as np
import datetime as dt
import lightgbm as lgb
import pycaret
from pycaret.classification import *
import optuna
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn import tree

# Silence every Python warning for the remainder of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and data-quality
# warnings raised by the libraries used below are hidden as well.
warnings.filterwarnings(action='ignore')
# Restrict Optuna's logger to WARNING-level records.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [55]:
# Load the merged training table and carve out a date-based hold-out set.

# First calendar day that belongs to the hold-out set; everything strictly
# before it is used for training / cross-validation.
SPLIT_DATE = dt.datetime(2023, 1, 1)

# Column lists handed to PyCaret's setup(): the first is passed as
# `ignore_features`, the second as `categorical_features`.
ignore_features = ['datetime', 'date']
categorical_features = ['day_of_week', 'is_congestion', 'tomorrow_holiday', 'yesterday_holiday', 'start_code', 'end_code', 'section']

df = pd.read_csv('train_merged.csv')
df['date'] = pd.to_datetime(df['date'])

# NOTE(review): setup() is later called with data_split_shuffle=False and
# fold_strategy='timeseries', which presumably relies on the rows already being
# in chronological order in the CSV -- confirm, no sorting is performed here.
df_train = df[df['date'] < SPLIT_DATE]
df_test = df[df['date'] >= SPLIT_DATE]

# A row whose date is missing (NaT) fails both comparisons above and would be
# dropped from both splits without notice; make that visible instead.
assert len(df_train) + len(df_test) == len(df), (
    "rows with a missing 'date' fell out of both df_train and df_test"
)

In [56]:
# Initialise the PyCaret classification experiment on the training rows.
exp = setup(
    df_train,
    target='is_congestion_tomorrow',
    # Feature handling: skip the raw date columns, force the listed columns
    # to be treated as categorical.
    ignore_features=ignore_features,
    categorical_features=categorical_features,
    # Internal train/validation split: 80/20, no shuffling, no stratification.
    train_size=0.8,
    data_split_shuffle=False,
    data_split_stratify=False,
    # Cross-validation: 4 time-series folds, again without shuffling.
    fold_strategy='timeseries',
    fold=4,
    fold_shuffle=False,
    # Runtime: CPU only, fixed seed for reproducibility.
    use_gpu=False,
    session_id=123,
)



Unnamed: 0,Description,Value
0,Session id,123
1,Target,is_congestion_tomorrow
2,Target type,Binary
3,Original data shape,"(4344912, 42)"
4,Transformed data shape,"(4344912, 46)"
5,Transformed train set shape,"(3475929, 46)"
6,Transformed test set shape,"(868983, 46)"
7,Ignore features,2
8,Ordinal features,3
9,Numeric features,32


In [57]:
# Train and cross-validate an XGBoost classifier with the folds configured in setup().
xgboost = create_model(estimator='xgboost')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9966,0.9601,0.0799,0.4987,0.1378,0.137,0.1987
1,0.9957,0.9736,0.2948,0.6273,0.4011,0.3992,0.4282
2,0.9966,0.9709,0.2073,0.5188,0.2962,0.2948,0.3265
3,0.9986,0.9796,0.1594,0.5222,0.2442,0.2437,0.288
Mean,0.9969,0.9711,0.1854,0.5417,0.2698,0.2687,0.3104
Std,0.0011,0.0071,0.0779,0.0502,0.0949,0.0945,0.0823


In [58]:
# Score the hold-out rows (dates on/after the split date) and keep only the
# predicted class labels.
predict = predict_model(estimator=xgboost, data=df_test)['prediction_label']

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.9965,0.9714,0.1968,0.6508,0.3022,0.301,0.3567


In [59]:
# Ground-truth labels of the hold-out rows.
answer = df_test['is_congestion_tomorrow']

# Evaluate the hold-out predictions with each scorer in turn; the results are
# bound to individual names so they stay available to later cells.
accuracy, recall, precision, f1 = (
    scorer(answer, predict)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score)
)

# Legend for the 2x2 matrix printed right after it
# (rows: actual class, columns: predicted class).
print('TN  FP\nFN  TP')
print(confusion_matrix(answer, predict))
print(
    f'Accuracy: {accuracy}',
    f'Recall: {recall}',
    f'Precision: {precision}',
    f'F1 score : {f1}',
    sep='\n',
)

TN  FP
FN  TP
[[1442158     587]
 [   4465    1094]]
Accuracy: 0.9965117820568058
Recall: 0.19679798524914552
Precision: 0.6508030933967877
F1 score : 0.30220994475138124


In [60]:
# Open PyCaret's interactive plot selector for the trained XGBoost model.
evaluate_model(estimator=xgboost)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…