In [50]:
# Importing the train and test data, indexing and sorting it by flight date

import pandas as pd

train = pd.read_csv("/content/flights_train.csv", index_col ="flight_date")
train = train.sort_values(by='flight_date')

test = pd.read_csv("/content/flights_Xtest.csv",index_col ="flight_date")
test["ROUTE"] = test["from"] +"-" + test["to"]
test = test.sort_values(by='flight_date')

In [51]:
# Creating a new column named ROUTE in order to ease slicing

train["ROUTE"] = train["from"] +"-" + train["to"]

In [52]:
# Creating a dictionary where the data is sliced by the opted ROUTE, 
# we will only use the target variable to get a proper time series

unique_routes = list(train["ROUTE"].unique())
dict_data = dict()
for i in unique_routes :
  dict_data[i] = train[train['ROUTE'] == i]
  dict_data[i] = dict_data[i]["target"]

dict_test = dict()
for i in unique_routes :
  dict_test[i] = test[test['ROUTE'] == i]
  dict_test[i]["key"] = dict_test[i]['ROUTE'] + dict_test[i].index
  #dict_test[i] = dict_test[i].index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [6]:
# Importing SARIMAX to perform forecast, and the mse metric to evaluate results

from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error


In [None]:
# Fine-tuning the best parameters p, d, q, for SARIMA can be time consuming we will thus choose 
# some tuples where we will perform GridSearch
dict_results = dict()
p_d_q_list = [(0, 1, 0), (1, 0, 0), (0, 0, 1), (1, 1, 1), (1, 0, 1), (1, 1, 0), (0, 1, 1), (0, 1, 2), (1,1,2), (2,2,1), (1, 2, 1), (2, 1, 1), (2, 1, 5)]

for i in p_d_q_list :
  results = []
  for route_data in dict_data :
    SARIMAXmodel = SARIMAX(dict_data[route_data][:int(0.8*dict_data[route_data].shape[0])], order = i, seasonal_order=(1,1,2,4))
    SARIMAXmodel = SARIMAXmodel.fit()
    #len(test.index)
    y_pred = SARIMAXmodel.get_forecast(dict_data[route_data][int(0.8*dict_data[route_data].shape[0]):].shape[0])
    y_pred_df = y_pred.conf_int(alpha = 0.05) 
    y_pred_df["Predictions"] = SARIMAXmodel.predict(start = y_pred_df.index[0], end = y_pred_df.index[-1])
    y_pred_df.index = dict_data[route_data][int(0.8*dict_data[route_data].shape[0]):].index
    y_pred_out = y_pred_df["Predictions"] 
    arma_rmse = np.sqrt(mean_squared_error(dict_data[route_data][int(0.8*dict_data[route_data].shape[0]):].values, y_pred_df["Predictions"]))
    results.append((arma_rmse))
  dict_results[i] = np.array(results).mean()

In [215]:
dict_results

{(0, 0, 1): 0.6686682067839711,
 (0, 1, 0): 0.9503125783587812,
 (0, 1, 1): 0.7285743118214949,
 (0, 1, 2): 0.7244727824780699,
 (1, 0, 0): 0.6676787093756729,
 (1, 0, 1): 0.6677622493330241,
 (1, 1, 0): 0.8645841999682614,
 (1, 1, 1): 0.7201857895277232,
 (1, 1, 2): 0.7311834624430609,
 (1, 2, 1): 1.1240881580641378,
 (2, 1, 1): 0.7355543049259994,
 (2, 1, 5): 0.7559343759558306,
 (2, 2, 1): 1.1500229503183537}

In [None]:
# We thus conclude that the tuple (1, 0, 0) gives us the best rmse score on the training set which is 0.6676.
# We will now try to forecast using this optimal (p,d,q) value in order to forecast the testdata.

for route_data in dict_data :
  SARIMAXmodel = SARIMAX(dict_data[route_data], order = (1, 0, 0) , seasonal_order=(1,1,2,4))
  SARIMAXmodel = SARIMAXmodel.fit()
  y_pred = SARIMAXmodel.get_forecast(dict_test[route_data].shape[0])
  y_pred_df = y_pred.conf_int(alpha = 0.05) 
  y_pred_df["Predictions"] = SARIMAXmodel.predict(start = y_pred_df.index[0], end = y_pred_df.index[-1])
  y_pred_df.index = dict_test[route_data].index
  y_pred_out = y_pred_df["Predictions"] 
  dict_test[route_data]["Predictions"] = list(y_pred_out)
  

In [55]:
test_prod = pd.read_csv("/content/flights_Xtest.csv",index_col ="flight_date")
test_prod["ROUTE"] = test_prod["from"] +"-" + test_prod["to"]
test_prod["key"] = test_prod["from"] +"-" + test_prod["to"] + test_prod.index
test_prod

Unnamed: 0_level_0,from,to,avg_weeks,std_weeks,ROUTE,key
flight_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-01-22,ATL,MCO,10.363636,8.232025,ATL-MCO,ATL-MCO2013-01-22
2013-02-22,ATL,DEN,8.294118,5.542616,ATL-DEN,ATL-DEN2013-02-22
2013-01-31,ORD,EWR,8.933333,9.407191,ORD-EWR,ORD-EWR2013-01-31
2012-11-15,DFW,ORD,9.900000,6.608607,DFW-ORD,DFW-ORD2012-11-15
2013-02-13,SEA,SFO,7.000000,4.472136,SEA-SFO,SEA-SFO2013-02-13
...,...,...,...,...,...,...
2013-01-30,SEA,ORD,10.055556,7.336229,SEA-ORD,SEA-ORD2013-01-30
2013-01-03,MCO,EWR,15.310345,10.525830,MCO-EWR,MCO-EWR2013-01-03
2013-01-13,PHL,DFW,8.529412,6.022018,PHL-DFW,PHL-DFW2013-01-13
2012-12-23,PHL,ATL,9.315789,6.289786,PHL-ATL,PHL-ATL2012-12-23


In [59]:
result = pd.concat(dict_test)

In [65]:
production = pd.merge(test_prod, result, on='key')[["Predictions_y"]]

In [68]:
production.to_csv("prod.csv",header = False, index = False)