# Homework №0
Goal - prediction by day for the week ahead


Choose the level of aggregation (day, week, month, year), as well as the aggregation function (average, sum, maximum).

## Settings

In [1]:
# Bibs
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn; seaborn.set()
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
import numpy as np
import matplotlib.dates as mdates
import seaborn as sns
import matplotlib

import plotly.express as px

from plotly.graph_objects import *
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)



# Use seaborn style defaults and set the default figure size
sns.set(rc={'figure.figsize':(15, 10)})
sns.set(font_scale=2)
sns.set_style("whitegrid")

## plot_test_vs_predictions

In [2]:
def plot_test_vs_predictions(data, true_col, pred_col):
    '''
    Function plots true data points vs predictions
    
    Parameters
    ----------
    data:  DataFrame
           input data frame
    true_col: str
              column of true data points
    pred_col: str
              columns of predictions
            
    '''
    true_line = Scatter(x = data.index, y = data[true_col],
                       name = 'True data points')
    pred_line = Scatter(x = data.index, y = data[pred_col],
                       name = 'Predictions')
    
    mae = np.round(mean_absolute_error(data[true_col], data[pred_col]),2)
    mse = np.round(mean_squared_error(data[true_col], data[pred_col]),2)
    R2 = np.round(r2_score(data[true_col], data[pred_col]),2)
    
    data = [true_line, pred_line]
    layout = Layout(title = f'MAE={mae} MSE={mse} R2={R2},')
    iplot(Figure(data = data,
                layout = layout ))

In [3]:
# Dataset
df_bike = pd.read_csv(r"C:\Users\user\Documents\DataAnalyse\TimeSeriasForecasting\Datasets\bike-sharing.csv")

    instant: record index
    dteday : date
    season : season (1:winter, 2:spring, 3:summer, 4:fall)
    yr : year (0: 2011, 1:2012)
    mnth : month ( 1 to 12)
    hr : hour (0 to 23)
    holiday : weather day is holiday or not
    weekday : day of the week
    workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
    weathersit :
    1: Clear, Few clouds, Partly cloudy, Partly cloudy
    2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
    3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
    4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
    temp : Normalized temperature in Celsius. The values are derived via (t-tmin)/(tmax-tmin), tmin=-8, t_max=+39 (only in hourly scale)
    atemp: Normalized feeling temperature in Celsius. The values are derived via (t-tmin)/(tmax-tmin), tmin=-16, t_max=+50 (only in hourly scale)
    hum: Normalized humidity. The values are divided to 100 (max)
    windspeed: Normalized wind speed. The values are divided to 67 (max)
    casual: count of casual users
    registered: count of registered users
    cnt: count of total rental bikes including both casual and registered

In [4]:
df_bike.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,9.84,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,9.02,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,9.02,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,9.84,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,9.84,0.2879,0.75,0.0,0,1,1


In [5]:
df = df_bike[['dteday','cnt']].copy()

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   dteday  17379 non-null  object
 1   cnt     17379 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 271.7+ KB


In [7]:
df['dteday'] = pd.to_datetime(df['dteday'])

In [8]:
df['dteday']

0       2011-01-01
1       2011-01-01
2       2011-01-01
3       2011-01-01
4       2011-01-01
           ...    
17374   2012-12-31
17375   2012-12-31
17376   2012-12-31
17377   2012-12-31
17378   2012-12-31
Name: dteday, Length: 17379, dtype: datetime64[ns]

In [9]:
df['dteday'].min(), df['dteday'].max()

(Timestamp('2011-01-01 00:00:00'), Timestamp('2012-12-31 00:00:00'))

In [10]:
df.set_index('dteday',inplace=True)

In [11]:
df_daily = df.resample('D').sum()
df_daily.shape

(731, 1)

In [12]:
df_weekly = df.resample('W').sum()
df_weekly.shape

(106, 1)

In [13]:
df_monthly = df.resample('M').sum()
df_monthly.shape

(24, 1)

In [14]:
fig = px.line(df_daily, x=df_daily.index, y='cnt')
fig.update_layout(title='Number of bikes per day')
fig.show()

In [15]:
fig = px.line(df_weekly, x=df_weekly.index, y='cnt')
fig.update_layout(title='Number of bikes per week')
fig.show()

In [16]:
fig = px.line(df_monthly, x=df_monthly.index, y='cnt')
fig.update_layout(title='Number of bikes per month')
fig.show()

# Splitting train, test

In [17]:
df_daily.shape

(731, 1)

In [18]:
df_daily

Unnamed: 0_level_0,cnt
dteday,Unnamed: 1_level_1
2011-01-01,985
2011-01-02,801
2011-01-03,1349
2011-01-04,1562
2011-01-05,1600
...,...
2012-12-27,2114
2012-12-28,3095
2012-12-29,1341
2012-12-30,1796


In [19]:
train, test = df_daily[:724], df_daily[-7:]

In [20]:
train.shape, test.shape

((724, 1), (7, 1))

# Pred1 .  Average value for the previous observation period

In [21]:
test = test.assign(pred1 = train['cnt'].mean())

In [22]:
test

Unnamed: 0_level_0,cnt,pred1
dteday,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-12-25,1013,4530.593923
2012-12-26,441,4530.593923
2012-12-27,2114,4530.593923
2012-12-28,3095,4530.593923
2012-12-29,1341,4530.593923
2012-12-30,1796,4530.593923
2012-12-31,2729,4530.593923


In [23]:
plot_test_vs_predictions(test, 'cnt','pred1')

# Pred2. Average value for the last month of observations

In [24]:
test = test.assign(pred2 = train[-30:]['cnt'].mean())

In [25]:
test

Unnamed: 0_level_0,cnt,pred1,pred2
dteday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-12-25,1013,4530.593923,4630.166667
2012-12-26,441,4530.593923,4630.166667
2012-12-27,2114,4530.593923,4630.166667
2012-12-28,3095,4530.593923,4630.166667
2012-12-29,1341,4530.593923,4630.166667
2012-12-30,1796,4530.593923,4630.166667
2012-12-31,2729,4530.593923,4630.166667


In [26]:
plot_test_vs_predictions(test, 'cnt','pred2')

# Pred3. Average for the last week of observations 

In [27]:
test = test.assign(pred3 = train[-7:]['cnt'].mean())

In [28]:
train[-7:]['cnt'].mean()

3290.1428571428573

In [29]:
test

Unnamed: 0_level_0,cnt,pred1,pred2,pred3
dteday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-12-25,1013,4530.593923,4630.166667,3290.142857
2012-12-26,441,4530.593923,4630.166667,3290.142857
2012-12-27,2114,4530.593923,4630.166667,3290.142857
2012-12-28,3095,4530.593923,4630.166667,3290.142857
2012-12-29,1341,4530.593923,4630.166667,3290.142857
2012-12-30,1796,4530.593923,4630.166667,3290.142857
2012-12-31,2729,4530.593923,4630.166667,3290.142857


In [30]:
plot_test_vs_predictions(test, 'cnt','pred3')

# Pred4. The last value of the observations

In [31]:
test = test.assign(pred4 = train['cnt'].values[-1])

In [32]:
test

Unnamed: 0_level_0,cnt,pred1,pred2,pred3,pred4
dteday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-12-25,1013,4530.593923,4630.166667,3290.142857,920
2012-12-26,441,4530.593923,4630.166667,3290.142857,920
2012-12-27,2114,4530.593923,4630.166667,3290.142857,920
2012-12-28,3095,4530.593923,4630.166667,3290.142857,920
2012-12-29,1341,4530.593923,4630.166667,3290.142857,920
2012-12-30,1796,4530.593923,4630.166667,3290.142857,920
2012-12-31,2729,4530.593923,4630.166667,3290.142857,920


In [33]:
plot_test_vs_predictions(test, 'cnt', 'pred4')

# Pred5. Shifting 1 Week

In [34]:
df_daily['pred5'] = df_daily['cnt'].shift()


In [35]:
train, test = df_daily[:724], df_daily[-7:]

In [36]:
test

Unnamed: 0_level_0,cnt,pred5
dteday,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-12-25,1013,920.0
2012-12-26,441,1013.0
2012-12-27,2114,441.0
2012-12-28,3095,2114.0
2012-12-29,1341,3095.0
2012-12-30,1796,1341.0
2012-12-31,2729,1796.0


In [37]:
plot_test_vs_predictions(test, 'cnt', 'pred5')

# Pred6. Shiftting 1 Year

In [38]:
df_daily['pred6'] = df_daily['cnt'].shift(365)

In [39]:
train, test = df_daily[:724], df_daily[-7:]

In [40]:
plot_test_vs_predictions(test, 'cnt', 'pred6')

# Pred7. Average value per Weekday

In [41]:
df = df_bike[['dteday','weekday','cnt']].copy()

In [42]:
df = df.groupby(['dteday','weekday'],as_index=False)['cnt'].sum()

In [43]:
train, test = df[:724], df[-7:]

In [44]:
test

Unnamed: 0,dteday,weekday,cnt
724,2012-12-25,2,1013
725,2012-12-26,3,441
726,2012-12-27,4,2114
727,2012-12-28,5,3095
728,2012-12-29,6,1341
729,2012-12-30,0,1796
730,2012-12-31,1,2729


In [45]:
avg_cnt = train.groupby('weekday')['cnt'].mean()
avg_cnt

weekday
0    4252.221154
1    4353.596154
2    4544.621359
3    4588.417476
4    4692.048544
5    4705.776699
6    4581.403846
Name: cnt, dtype: float64

In [46]:
test = test.assign(pred7 = test['weekday'].map(avg_cnt)) 

In [47]:
test

Unnamed: 0,dteday,weekday,cnt,pred7
724,2012-12-25,2,1013,4544.621359
725,2012-12-26,3,441,4588.417476
726,2012-12-27,4,2114,4692.048544
727,2012-12-28,5,3095,4705.776699
728,2012-12-29,6,1341,4581.403846
729,2012-12-30,0,1796,4252.221154
730,2012-12-31,1,2729,4353.596154


In [48]:
test

Unnamed: 0,dteday,weekday,cnt,pred7
724,2012-12-25,2,1013,4544.621359
725,2012-12-26,3,441,4588.417476
726,2012-12-27,4,2114,4692.048544
727,2012-12-28,5,3095,4705.776699
728,2012-12-29,6,1341,4581.403846
729,2012-12-30,0,1796,4252.221154
730,2012-12-31,1,2729,4353.596154


In [49]:
plot_test_vs_predictions(test,'cnt','pred7')

In [50]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))

In [51]:
test['scaled_true_val'] = scaler.fit_transform(test[['cnt']])
test['scaled_preds'] = scaler.fit_transform(test[['pred7']])

In [52]:
plot_test_vs_predictions(test, 
                        'scaled_true_val',
                        'scaled_preds')

# Pred8. The smoothing window

In [53]:
train.set_index('dteday', inplace=True)

In [54]:
roll_mean = train.rolling(5).mean()[-7:]
roll_mean

Unnamed: 0_level_0,weekday,cnt
dteday,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-12-18,2.8,4917.2
2012-12-19,2.4,4848.4
2012-12-20,2.0,4664.6
2012-12-21,3.0,4632.0
2012-12-22,4.0,4064.8
2012-12-23,3.6,3310.8
2012-12-24,3.2,2441.4


In [55]:
test = test.assign(pred8 = roll_mean['cnt'].values)

In [56]:
plot_test_vs_predictions(test, 'cnt','pred8')