## Define business goal 
We are trying to predict number of visitors for next year. We have data from the mall with unknown features and weather forecast
Data presented for each day, we will use regression model to predict the results

Import libraries

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder, StandardScaler, RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [2]:
def print_scores(y, y_pred):
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y,y_pred)
    mape = mean_absolute_percentage_error(y, y_pred)
    print('RMSE: ',rmse)
    print('R2: ',r2)
    print('MAPE: ',mape)

Plotting functions

In [3]:
plt.rcParams['figure.figsize'] = (14,6)
def plot(df, title):
    df.plot()
    plt.title(title)
    plt.ylabel('label')
    plt.show()

Load data

In [4]:
df=pd.read_csv('data/data_train.csv', index_col=0, parse_dates=True)

In [5]:
df.index = pd.to_datetime(df.index)

In [7]:
df_w=pd.read_csv('data/weather.csv', index_col=0, parse_dates=True)

In [8]:
df_w.index = pd.to_datetime(df_w.index)

In [9]:
df=df.join(df_w)

In [10]:
df = df.drop(['feature_8', 'feature_9', 'feature_3','feature_4', 'feature_5', 'air_temperature_daily_mean', 'air_temperature_daily_min'], axis = 1)

We will replace Nan values with mean values for the beginning, but its better to replace with data from another year for this time period

In [11]:
df.loc[df['air_humidity'].isna()==True, 'air_humidity']=df['air_humidity'].mean()
df.loc[df['air_temperature_daily_max'].isna()==True, 'air_temperature_daily_max']=df['air_temperature_daily_max'].mean()
df.loc[df['sunshine_hours'].isna()==True, 'sunshine_hours']=df['sunshine_hours'].mean()
df.loc[df['wind_speed_max'].isna()==True, 'wind_speed_max']=df['wind_speed_max'].mean()

Feature Engineering

In [12]:
df['year']=df.index.year
df['month']=df.index.month
df['week_day']=df.index.dayofweek
df['day']=df.index.day
df['step'] = list(range(len(df)))

In [13]:
df['date'] = df.index

In [14]:
df.columns

Index(['bank_holiday', 'feature_0', 'feature_1', 'feature_2', 'feature_6',
       'feature_7', 'school_holiday', 'feature_10', 'label', 'air_humidity',
       'air_temperature_daily_max', 'precipitation', 'snow_height',
       'sunshine_hours', 'wind_speed_max', 'year', 'month', 'week_day', 'day',
       'step', 'date'],
      dtype='object')

In [15]:
df = df.join(pd.get_dummies(df['month'], prefix='m', drop_first=True))

In [16]:
df = df.join(pd.get_dummies(df['week_day'], prefix='d', drop_first=True))

In [30]:
X = df.drop('label', axis = 1)
y = df['label']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

In [33]:
X_train_S = X_train[['step', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6', 'm_7', 'm_8', 'm_9',
       'm_10', 'm_11', 'm_12', 'd_1', 'd_2', 'd_3', 'd_4', 'd_5', 'd_6']]
X_test_S = X_test[['step', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6', 'm_7', 'm_8', 'm_9',
       'm_10', 'm_11', 'm_12', 'd_1', 'd_2', 'd_3', 'd_4', 'd_5', 'd_6']]

In [34]:
lg=LinearRegression()
lg.fit(X_train_S, y_train)
y_pred_test = lg.predict(X_test_S)
y_pred_train = lg.predict(X_train_S)

In [35]:
print('Test')
print_scores(y_test, y_pred_test)
print('Train')
print_scores(y_train, y_pred_train)

Test
RMSE:  442.7491836448369
R2:  0.5001580628846594
MAPE:  0.3546310045974737
Train
RMSE:  431.7782846339404
R2:  0.5040729023655375
MAPE:  0.3484233775228002


In [25]:
df.columns

Index(['bank_holiday', 'feature_0', 'feature_1', 'feature_2', 'feature_6',
       'feature_7', 'school_holiday', 'feature_10', 'label', 'air_humidity',
       'air_temperature_daily_max', 'precipitation', 'snow_height',
       'sunshine_hours', 'wind_speed_max', 'year', 'month', 'week_day', 'day',
       'step', 'date', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6', 'm_7', 'm_8', 'm_9',
       'm_10', 'm_11', 'm_12', 'd_1', 'd_2', 'd_3', 'd_4', 'd_5', 'd_6',
       'predict', 'remainder'],
      dtype='object')

In [36]:
X_train_R = X_train[['bank_holiday', 'feature_0', 'feature_1', 'feature_2', 'feature_6',
       'feature_7', 'school_holiday', 'feature_10', 'air_humidity',
       'air_temperature_daily_max', 'precipitation', 'snow_height',
       'sunshine_hours', 'wind_speed_max']]
X_test_R = X_test[['bank_holiday', 'feature_0', 'feature_1', 'feature_2', 'feature_6',
       'feature_7', 'school_holiday', 'feature_10', 'air_humidity',
       'air_temperature_daily_max', 'precipitation', 'snow_height',
       'sunshine_hours', 'wind_speed_max']]

In [37]:
y_train_R = y_train - y_pred_train
y_test_R = y_test - y_pred_test

In [38]:
rf=RandomForestRegressor(max_features = 5, max_depth = 12, n_estimators = 250)
rf.fit(X_train_R, y_train_R)
y_pred_test_R = rf.predict(X_test_R)
y_pred_train_R = rf.predict(X_train_R)

In [39]:
print('Test')
print_scores(y_test_R, y_pred_test_R)
print('Train')
print_scores(y_train_R, y_pred_train_R)

Test
RMSE:  350.8065676128561
R2:  0.3721984647181239
MAPE:  1.8177102432905312
Train
RMSE:  217.35346399718716
R2:  0.746597124507906
MAPE:  2.809160051021871


In [40]:
print(y_test_R, y_pred_test_R)

date
2005-10-08   -648.154089
2008-10-16    449.834414
2005-11-05    -67.799173
2006-11-24    205.398568
2007-12-19   -332.156408
                 ...    
2008-10-11   -289.317031
2007-03-15   -479.503361
2009-06-14   -994.584891
2005-04-28   -337.785731
2007-01-18   -602.744240
Name: label, Length: 436, dtype: float64 [-1.53896527e+02  2.46041441e+02 -1.63621967e+02 -1.40179861e+02
 -2.08330775e+02  2.65712683e+02 -1.41343656e+02 -1.93294849e+02
 -1.96687134e+02  3.28450289e+01 -1.19972192e+02  3.00920653e+01
  6.31182202e+01 -1.49815355e+00 -1.73689548e+02 -9.69549020e+01
 -1.75291263e+02  4.72347177e+02  1.98721077e+02 -4.21599650e+01
 -1.08618369e+02  1.19541072e+02 -1.98273522e+02  8.26812273e+01
 -6.23039580e+01  9.09075879e+01 -1.33566089e+02 -1.17278293e+02
 -1.73866613e+02 -1.10257162e+01  2.78785300e+01 -9.38453568e+01
  4.39513012e+01  4.63900305e+01 -5.44047484e+01  2.49059289e+02
 -1.44215299e+02  6.51569278e+02  1.51087731e+02 -9.96074300e+00
  4.54534076e+01 -1.66073440e

In [41]:
res_test = y_pred_test_R + y_pred_test
res_train = y_pred_train_R + y_pred_train

In [42]:
print('Test')
print_scores(y_test, res_test)
print('Train')
print_scores(y_train, res_train)

Test
RMSE:  350.8065676128561
R2:  0.6862002570061644
MAPE:  0.2621697583793379
Train
RMSE:  217.35346399718716
R2:  0.8743306474249788
MAPE:  0.17411555571850557
