<a href="https://colab.research.google.com/github/MbogoriL/time-series-analysis-forecasting/blob/main/Time_Series_Forecasting_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Problem Statement

Sweet Lift Taxi company has collected historical data on taxi orders at airports. To attract more
drivers during peak hours, we need to predict the number of taxi orders for the next hour. Build a
model for such a prediction.



# Metric of Success

The RMSE metric on the test set should not be more than 48.

# Data Importation

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the taxi-orders CSV; the first column is parsed as dates and
# used as the index of the frame.
DATA_URL = "https://bit.ly/3p1QPAv"

data = pd.read_csv(DATA_URL, index_col=[0], parse_dates=[0])
data.head()

Unnamed: 0_level_0,num_orders
datetime,Unnamed: 1_level_1
2018-03-01 00:00:00,9
2018-03-01 00:10:00,14
2018-03-01 00:20:00,28
2018-03-01 00:30:00,20
2018-03-01 00:40:00,32


In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

(26496, 2)

In [None]:
# Data type of each column in the loaded frame.
data.dtypes

datetime      datetime64[ns]
num_orders             int64
dtype: object

In [None]:
#make features
def make_features(data):
    """Add calendar columns derived from the frame's datetime index.

    The columns ``year``, ``month``, ``day`` and ``dayofweek`` are written
    directly onto ``data`` (the frame is modified in place).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose index exposes the ``year``, ``month``, ``day`` and
        ``dayofweek`` attributes (e.g. a ``DatetimeIndex``).

    Returns
    -------
    None
    """
    timestamps = data.index
    # Each column is named after the index attribute it is read from.
    for part in ('year', 'month', 'day', 'dayofweek'):
        data[part] = getattr(timestamps, part)


# Add the calendar columns to `data` in place, then preview the first rows
# to confirm the new columns are present.
make_features(data)
print(data.head())

                     num_orders  year  month  day  dayofweek
datetime                                                    
2018-03-01 00:00:00           9  2018      3    1          3
2018-03-01 00:10:00          14  2018      3    1          3
2018-03-01 00:20:00          28  2018      3    1          3
2018-03-01 00:30:00          20  2018      3    1          3
2018-03-01 00:40:00          32  2018      3    1          3


In [None]:
#make additional features for rolling mean and lag
def make_features(data, max_lag, rolling_mean_size):
    """Add calendar, lag and rolling-mean columns to ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``num_orders`` column and an index exposing the
        ``year``, ``month``, ``day`` and ``dayofweek`` attributes.
    max_lag : int
        Number of lag columns to create (``lag_1`` .. ``lag_<max_lag>``);
        ``lag_k`` holds ``num_orders`` shifted down by ``k`` rows.
    rolling_mean_size : int
        Window length, in rows, of the ``rolling_mean`` column.

    Returns
    -------
    None
    """
    timestamps = data.index
    for part in ('year', 'month', 'day', 'dayofweek'):
        data[part] = getattr(timestamps, part)

    orders = data['num_orders']

    for step in range(1, max_lag + 1):
        column = 'lag_{}'.format(step)
        data[column] = orders.shift(step)

    # Shift by one row before rolling so the window ends at the previous
    # observation and never contains the current row's own value.
    previous_orders = orders.shift()
    window = previous_orders.rolling(rolling_mean_size)
    data['rolling_mean'] = window.mean()


# Build four lag columns and a four-row rolling mean on top of the calendar
# columns, then preview the first rows of the enriched frame.
make_features(data, max_lag=4, rolling_mean_size=4)
print(data.head())

                     num_orders  year  month  day  dayofweek  lag_1  lag_2  \
datetime                                                                     
2018-03-01 00:00:00           9  2018      3    1          3    NaN    NaN   
2018-03-01 00:10:00          14  2018      3    1          3    9.0    NaN   
2018-03-01 00:20:00          28  2018      3    1          3   14.0    9.0   
2018-03-01 00:30:00          20  2018      3    1          3   28.0   14.0   
2018-03-01 00:40:00          32  2018      3    1          3   20.0   28.0   

                     lag_3  lag_4  rolling_mean  
datetime                                         
2018-03-01 00:00:00    NaN    NaN           NaN  
2018-03-01 00:10:00    NaN    NaN           NaN  
2018-03-01 00:20:00    NaN    NaN           NaN  
2018-03-01 00:30:00    9.0    NaN           NaN  
2018-03-01 00:40:00   14.0    9.0         17.75  


In [None]:
#split dataset into training and test set
from sklearn.model_selection import train_test_split

# Rebuild the modelling frame from the raw order counts only.
# Resampling must happen BEFORE feature engineering: summing already-built
# lag / rolling-mean columns per period almost reproduces that period's own
# num_orders total (target leakage), and summing year/month/day/dayofweek
# produces meaningless values.
# The task is a next-hour forecast, so aggregate to hourly totals.
# NOTE: outputs recorded from the earlier daily, post-feature resample
# (split dates and RMSE figures) are stale and must be regenerated.
data = data[['num_orders']].sort_index().resample('1h').sum()

# Recreate the calendar, lag and rolling-mean features on the hourly series.
make_features(data, 4, 4)

# Chronological split: shuffle=False keeps the test set strictly after the
# training period.
train, test = train_test_split(data, shuffle=False, test_size=0.2)

# The first rows of the series have no lag / rolling-mean history (NaN).
train = train.dropna()

print(train.index.min(), train.index.max())
print(test.index.min(), test.index.max())

2018-03-01 00:00:00 2018-07-25 00:00:00
2018-07-26 00:00:00 2018-08-31 00:00:00


In [None]:
#import libraries
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Separate the predictors from the target column in both partitions.
target_column = 'num_orders'
features_train = train.drop(columns=[target_column])
target_train = train[target_column]
features_test = test.drop(columns=[target_column])
target_test = test[target_column]

# Fit a linear regression on the training partition.
model = LinearRegression()
model.fit(features_train, target_train)

# Predict on both partitions so the training and test errors can be compared.
pred_train = model.predict(features_train)
pred_test = model.predict(features_test)

# RMSE is the square root of the mean squared error.
rmse_train = mean_squared_error(target_train, pred_train) ** 0.5
rmse_test = mean_squared_error(target_test, pred_test) ** 0.5

print('RMSE for the training set:', rmse_train)
print('RMSE for the test set:', rmse_test)


RMSE for the training set: 9.443673465344261
RMSE for the test set: 12.931413406986138
