In [1]:
import pandas as pd
import numpy as np #For mathematical caculations
import matplotlib.pyplot as plt #For plotting graphs
from datetime import datetime # To access datetime
from pandas import Series # To work on series

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
import warnings # Used on the next line to silence every warning
# NOTE(review): this blanket filter also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [4]:
# Load the training data; the path is relative to the notebook's working directory.
train=pd.read_csv("Train_SU63ISt.csv")

In [5]:
# Load the test data; the path is relative to the notebook's working directory.
test=pd.read_csv("Test_0qrQsBZ.csv")

In [6]:
# Independent copies of the freshly loaded frames; later changes to train/test do not propagate to them.
train_original=train.copy()
test_original=test.copy()

In [7]:
# Column labels of the training frame.
train.columns

Index(['ID', 'Datetime', 'Count'], dtype='object')

In [8]:
# Column labels of the test frame.
test.columns

Index(['ID', 'Datetime'], dtype='object')

In [9]:
# Per-column dtypes of the training frame.
train.dtypes

ID           int64
Datetime    object
Count        int64
dtype: object

In [10]:
# Per-column dtypes of the test frame.
test.dtypes

ID           int64
Datetime    object
dtype: object

In [11]:
# (rows, columns) of the training frame.
train.shape

(18288, 3)

In [12]:
# (rows, columns) of the test frame.
test.shape

(5112, 2)

In [13]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,ID,Datetime,Count
0,0,25-08-2012 00:00,8
1,1,25-08-2012 01:00,2
2,2,25-08-2012 02:00,6
3,3,25-08-2012 03:00,2
4,4,25-08-2012 04:00,2


In [14]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,ID,Datetime
0,18288,26-09-2014 00:00
1,18289,26-09-2014 01:00
2,18290,26-09-2014 02:00
3,18291,26-09-2014 03:00
4,18292,26-09-2014 04:00


# Feature Extraction

In [None]:
# Parse the day-first 'Datetime' strings of all four frames into real datetimes, in place.
for frame in (train, test, test_original, train_original):
    frame['Datetime'] = pd.to_datetime(frame['Datetime'], format='%d-%m-%Y %H:%M')

In [None]:
# Add year / month / day / Hour columns derived from the parsed timestamp to each frame.
for frame in (train, test, test_original, train_original):
    when = frame['Datetime'].dt
    frame['year'] = when.year
    frame['month'] = when.month
    frame['day'] = when.day
    frame['Hour'] = when.hour

In [None]:
# Preview the training frame with the new calendar columns.
train.head()

In [None]:
# Day-of-week number of each timestamp (the weekend helper below treats 5 and 6 as the weekend).
train['day of week']=train['Datetime'].dt.dayofweek
# NOTE(review): temp does not appear to be read before it is reassigned further down — looks unused here.
temp = train['Datetime']

In [None]:
# Preview the training frame with the 'day of week' column.
train.head()

In [None]:
def applyer(row):
    """Return 1 if ``row`` falls on a weekend, otherwise 0.

    ``row`` is any object exposing a ``dayofweek`` attribute (for example a
    pandas ``Timestamp``); the values 5 and 6 are treated as the weekend.
    """
    return 1 if row.dayofweek in (5, 6) else 0

In [None]:
# Run the weekend helper on every timestamp and store the 0/1 result as the 'weekend' column.
temp2 = train['Datetime'].apply(applyer)
train['weekend']=temp2

In [None]:
# Index by timestamp so the x-axis of the plot shows time.
train.index = train['Datetime']
# Drop the ID column for plotting. `axis` is passed by keyword because pandas 2.0
# no longer accepts it positionally (`drop('ID', 1)` raises a TypeError there).
df = train.drop('ID', axis=1)
ts = df['Count']

plt.figure(figsize=(16,8))
plt.plot(ts, label='Passenger Count')
plt.title('Time Series')
plt.xlabel("Time(year-month)")
plt.ylabel("Passenger count")
plt.legend(loc='best')
plt.show()

Here we can infer that there is an increasing trend in the series, i.e., the passenger count increases over time. We can also see sudden spikes in the count at certain points. A possible reason is that on those particular days some event caused unusually high traffic.


In [None]:
# Bar chart of the mean Count per year.
train.groupby('year')['Count'].mean().plot.bar()

In [None]:
# Bar chart of the mean Count per calendar month (all years pooled together).
train.groupby('month')['Count'].mean().plot.bar()

In [None]:
# Mean Count for each (year, month) pair.
train.groupby(['year', 'month'])['Count'].mean()

In [None]:
# Mean Count for each (year, month) pair, drawn as a line plot.
temp=train.groupby(['year', 'month'])['Count'].mean()
temp.plot(figsize=(15,5), title= 'Passenger Count(Monthwise)', fontsize=14)

In [None]:
# Bar chart of the mean Count per day of the month.
train.groupby('day')['Count'].mean().plot.bar()

In [None]:
# Bar chart of the mean Count per hour of the day.
train.groupby('Hour')['Count'].mean().plot.bar()

In [None]:
# Bar chart of the mean Count for weekdays (0) versus weekends (1).
train.groupby('weekend')['Count'].mean().plot.bar()


In [None]:
# Bar chart of the mean Count per day-of-week number.
train.groupby('day of week')['Count'].mean().plot.bar()

In [None]:
# Remove the ID column. `axis` is passed by keyword because pandas 2.0 no longer
# accepts it positionally (`drop('ID', 1)` raises a TypeError there).
train = train.drop('ID', axis=1)

In [None]:
# Preview the training frame after the ID column has been removed.
train.head()

In [None]:
# Index the frame by its timestamp. The index is assigned directly because
# `train.Timestamp = ...` only sets a Python attribute on the DataFrame (pandas
# emits a UserWarning for it) and never creates a column.
train.index = pd.to_datetime(train.Datetime, format='%d-%m-%Y %H:%M')

# Hourly time series
hourly = train.resample('H').mean()

# Converting to daily mean
daily = train.resample('D').mean()

# Converting to weekly mean
weekly = train.resample('W').mean()

# Converting to monthly mean
monthly = train.resample('M').mean()

In [None]:
# One panel per resampling frequency, stacked vertically in a single figure.
fig, axs = plt.subplots(4, 1)

panels = (('Hourly', hourly), ('Daily', daily), ('Weekly', weekly), ('Monthly', monthly))
for axis, (panel_title, frame) in zip(axs, panels):
    frame.Count.plot(figsize=(15, 8), title=panel_title, fontsize=14, ax=axis)

plt.show()

In [None]:
# Index the test frame by its timestamp. The index is assigned directly because
# `test.Timestamp = ...` only sets a Python attribute on the DataFrame (pandas
# emits a UserWarning for it) and never creates a column.
test.index = pd.to_datetime(test.Datetime, format='%d-%m-%Y %H:%M')

# Converting to daily mean
test = test.resample('D').mean()

# Same treatment for the training frame.
train.index = pd.to_datetime(train.Datetime, format='%d-%m-%Y %H:%M')

# Converting to daily mean
train = train.resample('D').mean()

In [None]:
# Preview the training frame after resampling to daily means.
train.head()

In [None]:
# Preview the daily-mean frame computed earlier.
daily.head()

In [None]:
# Preview the test frame after resampling to daily means.
test.head()

# Splitting the data into training and validation parts

In [None]:
# Chronological split of the daily series: everything up to 2014-06-24 is used for
# fitting, the following three months for validation. `.loc` replaces `.ix`, which
# was removed in pandas 1.0; label slices on a DatetimeIndex include both endpoints.
Train = train.loc['2012-08-25':'2014-06-24']
valid = train.loc['2014-06-25':'2014-09-25']

In [None]:
# Draw the training and validation segments of the daily series on the same axes.
ax = Train.Count.plot(figsize=(15, 8), title='Daily Ridership', fontsize=14, label='train')
valid.Count.plot(ax=ax, figsize=(15, 8), title='Daily Ridership', fontsize=14, label='valid')
ax.set_xlabel("Datetime")
ax.set_ylabel("Passenger count")
ax.legend(loc='best')
plt.show()

# Modelling Techniques

## Naive Approach

In [None]:
# Naive forecast: repeat the final training observation for every validation row.
vals = np.asarray(Train.Count)
y_hat = valid.copy()
y_hat['naive'] = vals[-1]

In [None]:
# First few values of the naive forecast column.
y_hat.naive.head()

In [None]:
# Rebuild the naive forecast (last training value repeated) and plot it against both splits.
dd = np.asarray(Train.Count)
y_hat = valid.copy()
y_hat['naive'] = dd[-1]

plt.figure(figsize=(12, 8))
curves = (
    (Train, 'Count', 'Train'),
    (valid, 'Count', 'Valid'),
    (y_hat, 'naive', 'Naive Forecast'),
)
for frame, column, curve_label in curves:
    plt.plot(frame.index, frame[column], label=curve_label)
plt.legend(loc='best')
plt.title("Naive Forecast")
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
# Root-mean-squared error of the naive forecast against the validation counts.
rms = sqrt(mean_squared_error(valid.Count, y_hat.naive))
print(rms)

## Moving Average

## avg of last 10 observations

In [None]:
y_hat_avg_10 = valid.copy()
# Flat forecast: the mean of the last 10 training observations, repeated for every validation row.
y_hat_avg_10['moving_avg_forecast'] = Train['Count'].rolling(10).mean().iloc[-1]

plt.figure(figsize=(15, 5))
curves = (
    (Train['Count'], 'Training'),
    (valid['Count'], 'Validation'),
    (y_hat_avg_10['moving_avg_forecast'], 'moving avg 10'),
)
for series, curve_label in curves:
    plt.plot(series, label=curve_label)
plt.legend(loc='best')
plt.show()

## avg of last 20 observations

In [None]:
y_hat_avg_20 = valid.copy()
# Flat forecast: the mean of the last 20 training observations, repeated for every validation row.
y_hat_avg_20['moving_avg_forecast'] = Train['Count'].rolling(20).mean().iloc[-1]

plt.figure(figsize=(15, 5))
curves = (
    (Train['Count'], 'Training'),
    (valid['Count'], 'Validation'),
    (y_hat_avg_20['moving_avg_forecast'], 'moving avg 20'),
)
for series, curve_label in curves:
    plt.plot(series, label=curve_label)
plt.legend(loc='best')
plt.show()

## avg of last 50 observations

In [None]:
y_hat_avg_50 = valid.copy()
# Flat forecast: the mean of the last 50 training observations, repeated for every validation row.
y_hat_avg_50['moving_avg_forecast'] = Train['Count'].rolling(50).mean().iloc[-1]

plt.figure(figsize=(15, 5))
curves = (
    (Train['Count'], 'Training'),
    (valid['Count'], 'Validation'),
    (y_hat_avg_50['moving_avg_forecast'], 'moving avg 50'),
)
for series, curve_label in curves:
    plt.plot(series, label=curve_label)
plt.legend(loc='best')
plt.show()

In [None]:
# Root-mean-squared error of the 50-observation moving-average forecast on the validation set.
rms = sqrt(mean_squared_error(valid.Count, y_hat_avg_50.moving_avg_forecast))
print(rms)

In [None]:
# Root-mean-squared error of the 10-observation moving-average forecast on the validation set.
rms = sqrt(mean_squared_error(valid.Count, y_hat_avg_10.moving_avg_forecast))
print(rms)

## Simple Exponential Smoothing

In [None]:
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt

In [None]:
y_hat_avg = valid.copy()

# Simple exponential smoothing with a fixed smoothing level of 0.6 (no parameter optimisation).
ses_model = SimpleExpSmoothing(np.asarray(Train['Count']))
fit2 = ses_model.fit(smoothing_level=0.6, optimized=False)
y_hat_avg['SES'] = fit2.forecast(len(valid))

plt.figure(figsize=(16, 8))
curves = (
    (Train['Count'], 'Train'),
    (valid['Count'], 'Valid'),
    (y_hat_avg['SES'], 'SES'),
)
for series, curve_label in curves:
    plt.plot(series, label=curve_label)
plt.legend(loc='best')
plt.show()

## Holt’s Linear Trend Model

In [None]:
import statsmodels.api as sm

# Decompose the training series and show the resulting component plot.
sm.tsa.seasonal_decompose(Train.Count).plot()
plt.show()

# Augmented Dickey-Fuller test on the same training split as the decomposition.
# The original ran it on `train`, which also contains the validation window, so
# the validation data influenced the diagnostics.
result = sm.tsa.stattools.adfuller(Train.Count)

In [None]:
# Show the value returned by the adfuller call above.
result

In [None]:
y_hat_avg = valid.copy()

# Holt's linear trend model with fixed level (0.3) and slope (0.1) smoothing parameters.
# NOTE(review): newer statsmodels releases appear to name this argument `smoothing_trend` — confirm against the installed version.
holt_model = Holt(np.asarray(Train['Count']))
fit1 = holt_model.fit(smoothing_level=0.3, smoothing_slope=0.1)
y_hat_avg['Holt_linear'] = fit1.forecast(len(valid))

plt.figure(figsize=(16, 8))
curves = (
    (Train['Count'], 'Train'),
    (valid['Count'], 'Valid'),
    (y_hat_avg['Holt_linear'], 'Holt_linear'),
)
for series, curve_label in curves:
    plt.plot(series, label=curve_label)
plt.legend(loc='best')
plt.show()

In [None]:
# Root-mean-squared error of the Holt linear forecast on the validation set.
rms = sqrt(mean_squared_error(valid.Count, y_hat_avg.Holt_linear))
print(rms)

## Holt-Winters Model

In [None]:
y_hat_avg_winter = valid.copy()

# Additive trend and additive seasonality with a period of 7 observations; parameters are left to .fit().
fit3 = ExponentialSmoothing(
    np.asarray(Train['Count']),
    seasonal_periods=7,
    trend='add',
    seasonal='add',
).fit()

In [None]:
# Show the fitted results object.
fit3

In [None]:
# Forecast one value per validation row with the fitted Holt-Winters model.
y_hat_avg_winter['Holt_winter'] = fit3.forecast(len(valid))
# Fixed: the original printed from `y_hat_winter`, a name that is never defined
# (NameError); the forecasts live in `y_hat_avg_winter`.
print(y_hat_avg_winter['Holt_winter'].tolist())

In [None]:
# Compare the Holt-Winters forecast with the training and validation series.
plt.figure(figsize=(15, 8))
curves = (
    (Train['Count'], 'Training'),
    (valid['Count'], 'Validation'),
    (y_hat_avg_winter['Holt_winter'], 'Holt winter'),
)
for series, curve_label in curves:
    plt.plot(series, label=curve_label)
plt.legend(loc='best')
plt.show()

In [None]:
# Root-mean-squared error of the Holt-Winters forecast on the validation set.
rms = sqrt(mean_squared_error(valid.Count, y_hat_avg_winter.Holt_winter))
print(rms)

In [None]:
# fit1 was fitted on Train, which ends the day before `valid` starts, and the test
# period begins right after `valid`. Forecast across both windows and discard the
# validation horizon so that each remaining value lines up with a test date.
# (The original took the first len(test) steps, which cover the validation dates.)
# NOTE(review): fit1 is the Holt linear fit — confirm it is the model intended for the final prediction.
predict = fit1.forecast(len(valid) + len(test))[len(valid):]

In [None]:
# Print the forecast values produced for the test period.
print(predict)

In [None]:
# Attach the forecasts to the test frame as a 'prediction' column.
test['prediction']=predict