In [30]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import metrics
from sklearn.svm import SVC,SVR
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error as mae
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


In [6]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable


In [7]:
#reading the dataset
df = pd.read_csv('D:\\i126\\train.csv')
display(df.head())
display(df.tail())

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


Unnamed: 0,date,store,item,sales
912995,2017-12-27,10,50,63
912996,2017-12-28,10,50,59
912997,2017-12-29,10,50,74
912998,2017-12-30,10,50,62
912999,2017-12-31,10,50,82


In [8]:
#checking shape
df.shape

(913000, 4)

In [9]:
#checking datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   date    913000 non-null  object
 1   store   913000 non-null  int64 
 2   item    913000 non-null  int64 
 3   sales   913000 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 27.9+ MB


In [10]:
df.describe()

Unnamed: 0,store,item,sales
count,913000.0,913000.0,913000.0
mean,5.5,25.5,52.250287
std,2.872283,14.430878,28.801144
min,1.0,1.0,0.0
25%,3.0,13.0,30.0
50%,5.5,25.5,47.0
75%,8.0,38.0,70.0
max,10.0,50.0,231.0


In [11]:
#converting object-dtype to int-dtype
parts = df["date"].str.split("-", n = 3, expand = True)
df["year"]= parts[0].astype('int')
df["month"]= parts[1].astype('int')
df["day"]= parts[2].astype('int')
df.head()


Unnamed: 0,date,store,item,sales,year,month,day
0,2013-01-01,1,1,13,2013,1,1
1,2013-01-02,1,1,11,2013,1,2
2,2013-01-03,1,1,14,2013,1,3
3,2013-01-04,1,1,13,2013,1,4
4,2013-01-05,1,1,10,2013,1,5


In [12]:
#adding column to check if the day is weekend or not if weekend 1 else 0
from datetime import datetime
import calendar

def weekend_or_weekday(year,month,day):
    d = datetime(year,month,day)
    if d.weekday()>4:
        return 1
    else:
        return 0

df['weekend'] = df.apply(lambda x:weekend_or_weekday(x['year'], x['month'], x['day']), axis=1)
df.head()


Unnamed: 0,date,store,item,sales,year,month,day,weekend
0,2013-01-01,1,1,13,2013,1,1,0
1,2013-01-02,1,1,11,2013,1,2,0
2,2013-01-03,1,1,14,2013,1,3,0
3,2013-01-04,1,1,13,2013,1,4,0
4,2013-01-05,1,1,10,2013,1,5,1


In [13]:
#adding column for holiday 
from datetime import date
import holidays

def is_holiday(x):

    india_holidays = holidays.country_holidays('IN')

    if india_holidays.get(x):
        return 1
    else:
        return 0

df['holidays'] = df['date'].apply(is_holiday)
df.head()

Unnamed: 0,date,store,item,sales,year,month,day,weekend,holidays
0,2013-01-01,1,1,13,2013,1,1,0,0
1,2013-01-02,1,1,11,2013,1,2,0,0
2,2013-01-03,1,1,14,2013,1,3,0,0
3,2013-01-04,1,1,13,2013,1,4,0,0
4,2013-01-05,1,1,10,2013,1,5,1,0


In [14]:
!pip install holidays

Defaulting to user installation because normal site-packages is not writeable


In [15]:
#checking if it is a weekday using weekend function 
def which_day(year, month, day):

    d = datetime(year,month,day)
    return d.weekday()

df['weekday'] = df.apply(lambda x: which_day(x['year'],x['month'],x['day']),axis=1)
df.head()


Unnamed: 0,date,store,item,sales,year,month,day,weekend,holidays,weekday
0,2013-01-01,1,1,13,2013,1,1,0,0,1
1,2013-01-02,1,1,11,2013,1,2,0,0,2
2,2013-01-03,1,1,14,2013,1,3,0,0,3
3,2013-01-04,1,1,13,2013,1,4,0,0,4
4,2013-01-05,1,1,10,2013,1,5,1,0,5


In [16]:
#converting date to a periodic value so it can be considered
df['m1'] = np.sin(df['month'] * (2 * np.pi / 12))
df['m2'] = np.cos(df['month'] * (2 * np.pi / 12))
df.head()


Unnamed: 0,date,store,item,sales,year,month,day,weekend,holidays,weekday,m1,m2
0,2013-01-01,1,1,13,2013,1,1,0,0,1,0.5,0.866025
1,2013-01-02,1,1,11,2013,1,2,0,0,2,0.5,0.866025
2,2013-01-03,1,1,14,2013,1,3,0,0,3,0.5,0.866025
3,2013-01-04,1,1,13,2013,1,4,0,0,4,0.5,0.866025
4,2013-01-05,1,1,10,2013,1,5,1,0,5,0.5,0.866025


In [17]:
df.drop('date', axis=1, inplace=True)

In [18]:
max(df['sales'])

231

In [19]:
df['store'].nunique(), df['item'].nunique()


(10, 50)

In [20]:
features = ['store', 'year', 'month','weekday', 'weekend', 'holidays']

In [21]:
#reducing outliers
df = df[df['sales']<140]

In [22]:
features = df.drop(['sales', 'year'], axis=1)
target = df['sales'].values
#splitting the data for training and validation

X_train, X_val, Y_train, Y_val = train_test_split(features, target,test_size = 0.05,random_state=22)
X_train.shape, X_val.shape


((861170, 9), (45325, 9))

In [23]:
# Normalizing the features fast training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)


In [35]:
models = [LinearRegression(), XGBRegressor(), Lasso(), Ridge(), RandomForestRegressor
         ()]

for i in range(len(models)):
    models[i].fit(X_train, Y_train)
    train_preds = models[i].predict(X_train)
    print(f'{models[i]} : ')
    print('Training Accuracy : ', metrics.r2_score(Y_train,train_preds))
    val_preds = models[i].predict(X_val)
    print('Validation Accuracy : ', metrics.r2_score(Y_val, val_preds))
    print()
   
  

LinearRegression() : 
Training Accuracy :  0.13848576060106454
Validation Accuracy :  0.13771923295623167

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...) : 
Training Accuracy :  0.893443577091406
Validation Accuracy :  0.8934673030228729

Lasso() : 
Training Accuracy :  0.13247318262097763
Validatio

In [27]:
from sklearn.ensemble import RandomForestRegressor
mod=RandomForestRegressor()
mod.fit(X_train, Y_train)
train_preds = mod.predict(X_train)
val_preds = mod.predict(X_val)
metrics.r2_score(Y_train,train_preds),metrics.r2_score(Y_val, val_preds)

(0.9826982384366114, 0.8776328975581018)

In [36]:
from sklearn.ensemble import RandomForestRegressor
mod=XGBRegressor()
mod.fit(X_train, Y_train)
train_preds = mod.predict(X_train)
val_preds = mod.predict(X_val)
metrics.r2_score(Y_train,train_preds),metrics.r2_score(Y_val, val_preds)

(0.893443577091406, 0.8934673030228729)

In [50]:
import pickle
pickle.dump(mod , open('D:\\i126\\mod.pkl', 'wb'))
XGBRegressor(objective="reg:squarederror", missing=1, seed=42)
loaded_model = pickle.load(open('D:\\i126\\mod.pkl', 'rb'))

In [55]:
loaded_model.predict([[1,1,1,3,0,0,3,5.000000e-01,0.866025]])

array([46.77719], dtype=float32)

In [47]:
features

Unnamed: 0,store,item,month,day,weekend,holidays,weekday,m1,m2
0,1,1,1,1,0,0,1,5.000000e-01,0.866025
1,1,1,1,2,0,0,2,5.000000e-01,0.866025
2,1,1,1,3,0,0,3,5.000000e-01,0.866025
3,1,1,1,4,0,0,4,5.000000e-01,0.866025
4,1,1,1,5,1,0,5,5.000000e-01,0.866025
...,...,...,...,...,...,...,...,...,...
912995,10,50,12,27,0,0,2,-2.449294e-16,1.000000
912996,10,50,12,28,0,0,3,-2.449294e-16,1.000000
912997,10,50,12,29,0,0,4,-2.449294e-16,1.000000
912998,10,50,12,30,1,0,5,-2.449294e-16,1.000000


In [86]:
predict=['2013-01-01',1,1]
p={

        "date": predict[0],
        "store":predict[1],
        "item":predict[2]
    }

pre=pd.DataFrame([p])
pre
parts = pre["date"].str.split("-", n = 3, expand = True)
pre["year"]= parts[0].astype('int')
pre["month"]= parts[1].astype('int')
pre["day"]= parts[2].astype('int')
pre['weekend'] = pre.apply(lambda x:weekend_or_weekday(x['year'], x['month'], x['day']), axis=1)
pre['holidays'] = pre['date'].apply(is_holiday)
pre['weekday'] = pre.apply(lambda x: which_day(x['year'],x['month'],x['day']),axis=1)
pre['m1'] = np.sin(pre['month'] * (2 * np.pi / 12))
pre['m2'] = np.cos(pre['month'] * (2 * np.pi / 12))
list(pre.iloc[-1,:])
pre.drop(['date','year'], axis=1, inplace=True)
loaded_model.predict(pre.iloc[-1:,:])

array([43.089012], dtype=float32)