In [1]:
import time
import numpy as np
import pandas as pd
from pandas import Timestamp
import tqdm
from pycaret.time_series import *

In [2]:
train = pd.read_csv("Data/train.csv", index_col = 0)
train['time'] = pd.to_datetime(train['time'])
train['road_id'] = train['x'].astype(str) + train['y'].astype(str) + train['direction']
train.shape

(848835, 6)

In [3]:
test = pd.read_csv("Data/test.csv", index_col = 0)
test['time'] = pd.to_datetime(test['time'])
test['congestion'] = np.nan
test['road_id'] = test['x'].astype(str) + test['y'].astype(str) + test['direction']
test.shape

(2340, 6)

In [4]:
train.head()

Unnamed: 0_level_0,time,x,y,direction,congestion,road_id
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1991-04-01,0,0,EB,70,00EB
1,1991-04-01,0,0,NB,49,00NB
2,1991-04-01,0,0,SB,24,00SB
3,1991-04-01,0,1,EB,18,01EB
4,1991-04-01,0,1,NB,60,01NB


In [5]:
test.head()

Unnamed: 0_level_0,time,x,y,direction,congestion,road_id
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
848835,1991-09-30 12:00:00,0,0,EB,,00EB
848836,1991-09-30 12:00:00,0,0,NB,,00NB
848837,1991-09-30 12:00:00,0,0,SB,,00SB
848838,1991-09-30 12:00:00,0,1,EB,,01EB
848839,1991-09-30 12:00:00,0,1,NB,,01NB


# Run Experiment - just time series per road

In [39]:
train_data = train[['time', 'road_id', 'congestion']].copy()
test_data = test[['time', 'road_id', 'congestion']].copy()

In [47]:
all_roads = train_data['road_id'].unique()
all_models = {}

fh = test_data.time.nunique()
all_predictions = pd.DataFrame()

for road in tqdm.tqdm(all_roads):
    
    df_subset = train_data[train_data['road_id'] == road].set_index('time').asfreq('20min').drop("road_id", axis = 1)

    s = setup(df_subset, fh = fh, session_id = 123)
    model = create_model('auto_arima', fold = 4)
    
    # finalize model i.e. fit on entire data including test set
    final_model = finalize_model(model)
    
    # attach final model to a dictionary
    all_models[road] = final_model
    
    # save transformation pipeline and model as pickle file 
    save_model(final_model, model_name='Models/' + str(road), verbose=False)
    
    # make predictions
    test_df_subset = test_data[test_data['road_id'] == road].set_index('time').asfreq('20min')
    predictions = predict_model(final_model, fh = fh, X = test_df_subset)
    predictions['road_id'] = road
    all_predictions = pd.concat([all_predictions, predictions])

Unnamed: 0,cutoff,MAE,RMSE,MAPE,SMAPE,MASE,RMSSE,R2
0,1991-09-27 23:40,10.4704,13.0633,0.3813,0.2523,1.0998,0.9805,-0.164
1,1991-09-28 11:40,8.7262,10.3447,0.2881,0.2343,0.9165,0.7765,-0.3076
2,1991-09-28 23:40,12.4332,15.4908,0.3554,0.3011,1.3057,1.1631,-0.1079
3,1991-09-29 11:40,9.9005,12.648,0.4111,0.2774,1.0393,0.949,-0.029
Mean,,10.3826,12.8867,0.359,0.2663,1.0903,0.9673,-0.1521
SD,,1.3406,1.8256,0.0454,0.0253,0.1408,0.1371,0.1018


100%|███████████████████████████████████████████████████████████████████████████████| 65/65 [8:29:25<00:00, 470.23s/it]


In [48]:
all_predictions

Unnamed: 0,0
1991-09-30 12:00,54.8401
1991-09-30 12:20,52.3116
1991-09-30 12:40,51.8898
1991-09-30 13:00,51.8195
1991-09-30 13:20,51.8078
...,...
1991-09-30 22:20,40.3127
1991-09-30 22:40,40.3127
1991-09-30 23:00,40.3127
1991-09-30 23:20,40.3127


In [84]:
temp = all_predictions.copy().reset_index()
temp['road_id'] = ''
for i, road in enumerate(all_roads):
    temp.loc[i * 65: (i + 1)* 65, 'road_id'] = road
    
temp.columns = ['time', 'prediction', 'road_id']
temp.time = temp.time.astype('datetime64[ns]') 
temp

Unnamed: 0,time,prediction,road_id
0,1991-09-30 12:00:00,54.8401,00EB
1,1991-09-30 12:20:00,52.3116,00EB
2,1991-09-30 12:40:00,51.8898,00EB
3,1991-09-30 13:00:00,51.8195,00EB
4,1991-09-30 13:20:00,51.8078,00EB
...,...,...,...
2335,1991-09-30 22:20:00,40.3127,13NE
2336,1991-09-30 22:40:00,40.3127,13NE
2337,1991-09-30 23:00:00,40.3127,13NE
2338,1991-09-30 23:20:00,40.3127,13NE


In [68]:
test

Unnamed: 0_level_0,time,x,y,direction,congestion,road_id
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
848835,1991-09-30 12:00:00,0,0,EB,,00EB
848836,1991-09-30 12:00:00,0,0,NB,,00NB
848837,1991-09-30 12:00:00,0,0,SB,,00SB
848838,1991-09-30 12:00:00,0,1,EB,,01EB
848839,1991-09-30 12:00:00,0,1,NB,,01NB
...,...,...,...,...,...,...
851170,1991-09-30 23:40:00,2,3,NB,,23NB
851171,1991-09-30 23:40:00,2,3,NE,,23NE
851172,1991-09-30 23:40:00,2,3,SB,,23SB
851173,1991-09-30 23:40:00,2,3,SW,,23SW


In [88]:
test_predictions = test.merge(temp, on = ['time', 'road_id'])
test_predictions.index = test.index
test_predictions = test_predictions[['prediction']]
test_predictions.columns = ['congestion']
test_predictions

Unnamed: 0_level_0,congestion
row_id,Unnamed: 1_level_1
848835,54.8401
848836,37.6557
848837,52.1088
848838,23.9992
848839,65.8589
...,...
851170,58.6101
851171,32.0911
851172,70.6181
851173,19.5241


In [67]:
test.shape

(2340, 6)

In [91]:
test_predictions.to_csv("Submissions/pycaret_time_series.csv")

# Train on just weekday data

In [6]:
train_data = train[['time', 'road_id', 'congestion']].copy()
test_data = test[['time', 'road_id', 'congestion']].copy()

In [None]:
all_roads = train_data['road_id'].unique()
all_models = {}

fh = test_data.time.nunique()
all_predictions = pd.DataFrame()

for road in tqdm.tqdm(all_roads):
    
    df_subset = train_data[train_data['road_id'] == road].set_index('time').asfreq('20min').drop("road_id", axis = 1)

    s = setup(df_subset, fh = fh, session_id = 123)
    model = compare_models(fold = 4)
    
    # finalize model i.e. fit on entire data including test set
    final_model = finalize_model(model)
    
    # attach final model to a dictionary
    all_models[road] = final_model
    
    # save transformation pipeline and model as pickle file 
    save_model(final_model, model_name='Models/' + str(road), verbose=False)
    
    # make predictions
    test_df_subset = test_data[test_data['road_id'] == road].set_index('time').asfreq('20min')
    predictions = predict_model(final_model, fh = fh, X = test_df_subset)
    predictions['road_id'] = road
    all_predictions = pd.concat([all_predictions, predictions])

In [None]:
all_predictions

In [None]:
temp = all_predictions.copy().reset_index()
temp['road_id'] = ''
for i, road in enumerate(all_roads):
    temp.loc[i * 65: (i + 1)* 65, 'road_id'] = road
    
temp.columns = ['time', 'prediction', 'road_id']
temp.time = temp.time.astype('datetime64[ns]') 
temp

In [None]:
test

In [None]:
test_predictions = test.merge(temp, on = ['time', 'road_id'])
test_predictions.index = test.index
test_predictions = test_predictions[['prediction']]
test_predictions.columns = ['congestion']
test_predictions

In [None]:
test.shape

In [None]:
test_predictions.to_csv("Submissions/pycaret_time_series.csv")