# Predict on real data

In [1]:
import pandas as pd

In [2]:
# Import real data: read the CSV that is going to be scored into a DataFrame.
# The path is relative, so it is resolved against the notebook's working directory.
real_data = pd.read_csv('REAL_DATA.csv')

In [3]:
# Preview the first five rows of the freshly loaded data
real_data.head(n=5)

Unnamed: 0,index,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday
0,272371,415,7,01/03/2015,0,0,0,0,0
1,558468,27,7,29/12/2013,0,0,0,0,0
2,76950,404,3,19/03/2014,657,1,1,0,0
3,77556,683,2,29/01/2013,862,1,0,0,0
4,456344,920,3,19/03/2014,591,1,1,0,0


In [4]:
# Remove unnecessary columns (identifiers and holiday flags are not used below)
real_data = real_data.drop(
    columns=['index', 'store_ID', 'school_holiday', 'state_holiday']
)

In [5]:
# Confirm that the four unused columns are gone
real_data.head(n=5)

Unnamed: 0,day_of_week,date,nb_customers_on_day,open,promotion
0,7,01/03/2015,0,0,0
1,7,29/12/2013,0,0,0
2,3,19/03/2014,657,1,1
3,2,29/01/2013,862,1,0
4,3,19/03/2014,591,1,1


In [6]:
# Parse the day-first date strings once and keep the result for reuse
parsed_dates = pd.to_datetime(real_data['date'], dayfirst=True)

# Replace 'date' with its datetime version and append the calendar features
# (year, month, day of month, ISO week number) in a single step
real_data = real_data.assign(**{
    'date': parsed_dates,
    'year': parsed_dates.dt.year,
    'month': parsed_dates.dt.month,
    'day': parsed_dates.dt.day,
    'week of year': parsed_dates.dt.isocalendar().week,
})

In [7]:
# Check the parsed dates and the new year / month / day / week-of-year columns
real_data.head(n=5)

Unnamed: 0,day_of_week,date,nb_customers_on_day,open,promotion,year,month,day,week of year
0,7,2015-03-01,0,0,0,2015,3,1,9
1,7,2013-12-29,0,0,0,2013,12,29,52
2,3,2014-03-19,657,1,1,2014,3,19,12
3,2,2013-01-29,862,1,0,2013,1,29,5
4,3,2014-03-19,591,1,1,2014,3,19,12


In [8]:
# The calendar features have been extracted, so the datetime column itself is dropped
real_data = real_data.drop(columns='date')

In [9]:
# Inspect the frame after removing 'date'
real_data.head(n=5)

Unnamed: 0,day_of_week,nb_customers_on_day,open,promotion,year,month,day,week of year
0,7,0,0,0,2015,3,1,9
1,7,0,0,0,2013,12,29,52
2,3,657,1,1,2014,3,19,12
3,2,862,1,0,2013,1,29,5
4,3,591,1,1,2014,3,19,12


To predict the `sales` column, we use the best-performing of the previously tested models. In this case, that is the XGBoost Regressor, which achieved an R² score of 82%.

In [10]:
# Load saved model: restore the persisted regressor from disk.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code, so 'xgboost_model.pkl' must come from a trusted source.
import joblib
saved_xgboost_regressor_model = joblib.load('xgboost_model.pkl')

In [11]:
# Make predictions
# Feed the model every column except 'sales'. On the first run this is the
# whole frame, so the result is unchanged; on a re-run it keeps the previously
# written predictions from being passed back to the model as an extra feature,
# which would make this cell fail or mis-predict when executed twice.
feature_columns = [column for column in real_data.columns if column != 'sales']
real_data['sales'] = saved_xgboost_regressor_model.predict(real_data[feature_columns])

In [13]:
# Inspect the first ten rows together with the predicted 'sales' values
real_data.head(n=10)

Unnamed: 0,day_of_week,nb_customers_on_day,open,promotion,year,month,day,week of year,sales
0,7,0,0,0,2015,3,1,9,0.090951
1,7,0,0,0,2013,12,29,52,0.009818
2,3,657,1,1,2014,3,19,12,6554.337402
3,2,862,1,0,2013,1,29,5,6561.789062
4,3,591,1,1,2014,3,19,12,6074.740723
5,4,569,1,0,2014,6,26,26,5010.26709
6,1,321,1,1,2015,2,16,8,3903.546631
7,6,1367,1,0,2014,11,22,47,10057.952148
8,4,0,0,1,2015,6,4,23,7.125002
9,2,546,1,1,2015,1,13,3,6226.002441
