In [1]:
# importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load every CSV input in one pass; the file names are relative to the
# notebook's working directory and are read in the order listed.
train, test, submission, items, item_cats, shops = (
    pd.read_csv(csv_name)
    for csv_name in (
        'sales_train.csv',
        'test.csv',
        'sample_submission.csv',
        'items.csv',
        'item_categories.csv',
        'shops.csv',
    )
)

In [3]:
# Downcasts float64 columns to float32 and int64/int32 columns to the smallest
# of int16/int32 that can hold every value in the column.
# Original author's measurement on the sales data: roughly 134mb -> 61mb.
def downcast_dtypes(df):
    """Shrink the numeric columns of ``df`` in place and return ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose float64 and int64/int32 columns should be narrowed.
        Columns of any other dtype are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (its columns are reassigned).

    Notes
    -----
    * float64 -> float32 reduces precision to single precision.
    * Integer columns become int16 when every value fits in int16, otherwise
      int32 when every value fits in int32, otherwise they keep their dtype.
      A blind cast to int16 would silently wrap larger values.
    """
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]

    for col in float_cols:
        df[col] = df[col].astype(np.float32)

    for col in int_cols:
        column = df[col]
        for candidate in (np.int16, np.int32):
            limits = np.iinfo(candidate)
            # An empty column has no values that could overflow.
            fits = column.empty or (
                column.min() >= limits.min and column.max() <= limits.max
            )
            if fits:
                df[col] = column.astype(candidate)
                break
    return df

In [4]:
# Narrow the numeric columns of the training frame with downcast_dtypes;
# the returned frame is rebound to the same name.
train = downcast_dtypes(train)

In [5]:
# Manual feature engineering: aggregates built for visual exploration.

# Units sold per (month, shop): sum item_cnt_day within each pair.
month_group = (
    train
    .groupby(['date_block_num', 'shop_id'])['item_cnt_day']
    .sum()
    .reset_index()
)

# Attach each sale's item_category_id by joining the item table on item_id.
merged = train.merge(items[['item_id', 'item_category_id']], on='item_id')

# Units sold per (month, item category).
category_group = (
    merged
    .groupby(['date_block_num', 'item_category_id'])['item_cnt_day']
    .sum()
    .reset_index()
)

In [6]:
# Print the first five rows of each frame, one below the other.
print(month_group.head(5), merged.head(5), category_group.head(5), sep='\n')

   date_block_num  shop_id  item_cnt_day
0               0        0        5578.0
1               0        1        2947.0
2               0        2        1146.0
3               0        3         767.0
4               0        4        2114.0
         date  date_block_num  shop_id  item_id  item_price  item_cnt_day  \
0  02.01.2013               0       59    22154       999.0           1.0   
1  23.01.2013               0       24    22154       999.0           1.0   
2  20.01.2013               0       27    22154       999.0           1.0   
3  02.01.2013               0       25    22154       999.0           1.0   
4  03.01.2013               0       25    22154       999.0           1.0   

   item_category_id  
0                37  
1                37  
2                37  
3                37  
4                37  
   date_block_num  item_category_id  item_cnt_day
0               0                 0           1.0
1               0                 1           1.0
2        

In [7]:
# Aggregates used to train the model.

# Sales rows with the item's category joined in from the item table.
merged2 = train.merge(items[['item_id', 'item_category_id']], on='item_id')

# Units sold per (month, shop, category, item, price). Price is part of the key
# because specials (a different price) could result in higher sales.
month_group2 = (
    merged2
    .groupby(['date_block_num', 'shop_id', 'item_category_id', 'item_id', 'item_price'])['item_cnt_day']
    .sum()
    .reset_index()
)



In [8]:
# Display the first 50 rows of the training aggregate.
month_group2.head(50)

Unnamed: 0,date_block_num,shop_id,item_category_id,item_id,item_price,item_cnt_day
0,0,0,2,5572,1322.0,10.0
1,0,0,2,5573,560.0,1.0
2,0,0,2,5575,806.0,4.0
3,0,0,2,5576,2231.0,5.0
4,0,0,2,5609,2381.0,1.0
5,0,0,2,5612,3623.0,1.0
6,0,0,2,5623,294.0,1.0
7,0,0,2,5627,2060.0,2.0
8,0,0,2,5629,1925.0,9.0
9,0,0,2,5630,2060.0,1.0


In [9]:
# Nominal integer IDs carry no numeric meaning, so rewrite each one as a
# prefixed string label that can be one-hot encoded later.
month_group2['date_block_num'] = 'month ' + month_group2['date_block_num'].astype(str)
month_group2['shop_id'] = 'shop ' + month_group2['shop_id'].astype(str)
month_group2['item_category_id'] = 'item_category ' + month_group2['item_category_id'].astype(str)
month_group2['item_id'] = 'item ' + month_group2['item_id'].astype(str)

In [10]:
# Display the first 10 rows to inspect the relabelled ID columns.
month_group2.head(10)

Unnamed: 0,date_block_num,shop_id,item_category_id,item_id,item_price,item_cnt_day
0,month 0,shop 0,item_category 2,item 5572,1322.0,10.0
1,month 0,shop 0,item_category 2,item 5573,560.0,1.0
2,month 0,shop 0,item_category 2,item 5575,806.0,4.0
3,month 0,shop 0,item_category 2,item 5576,2231.0,5.0
4,month 0,shop 0,item_category 2,item 5609,2381.0,1.0
5,month 0,shop 0,item_category 2,item 5612,3623.0,1.0
6,month 0,shop 0,item_category 2,item 5623,294.0,1.0
7,month 0,shop 0,item_category 2,item 5627,2060.0,2.0
8,month 0,shop 0,item_category 2,item 5629,1925.0,9.0
9,month 0,shop 0,item_category 2,item 5630,2060.0,1.0


In [11]:
# X holds the inputs (every column except the last); Y holds the output
# (the last column of the aggregate).
X = month_group2.iloc[:, :-1].to_numpy()
Y = month_group2.iloc[:, -1].to_numpy()

In [12]:
# Encode the categorical inputs as indicator columns so they can be used in
# the regression equations.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns 0-3 are categorical and get one-hot encoded; the remaining column
# (item_price) is the only non-categorical input and is passed through as is.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0, 1, 2, 3])],
    remainder='passthrough',
)
# The result is kept exactly as fit_transform returns it (it may be a sparse
# matrix); it is deliberately not converted to a dense array.
X = ct.fit_transform(X)

# TODO: find a way to drop one dummy variable per feature to avoid the
# dummy-variable trap.

In [13]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [14]:
# Fit a multiple linear regression on the training split.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X=X_train, y=Y_train)

# TODO: predict on the testing data (X_test) and compare the results
# against Y_test.

LinearRegression()

In [15]:
#z = ['month 0', 'shop 0', 'item_category 2', 'item 5572', 1322]

# A simple program to see how the predictor would work

from scipy import stats
def auto_predictor():
    """Interactively predict the sales count for one month/shop/item.

    Prompts for a month index, a shop ID and an item ID, looks up the item's
    category in ``items`` and its most frequent price in ``train``, encodes the
    resulting row with the fitted ``ct`` transformer and returns the first
    prediction of the fitted ``regressor``.

    Relies on the notebook globals ``items``, ``train``, ``ct`` and
    ``regressor`` (read only).

    Returns:
        The first element of ``regressor.predict`` for the encoded row.

    Raises:
        ValueError: if an answer cannot be parsed as an integer.
        IndexError: if the item ID is absent from ``items`` or has no rows
            in ``train``.
        Any error raised by ``ct.transform`` or ``regressor.predict``
        propagates unchanged.
    """
    month = input('Please enter the month you want to predict. Enter the digit of the month, eg. Jan = 0 : ')
    shop = input('Please enter the shop ID, eg. 2 : ')
    item = input("Please enter the item ID you wish to predict, eg. for item 55, enter '55' : ")

    # Parsing through int() drops stray whitespace and leading zeros so the
    # labels built below have the plain 'month N' / 'shop N' / 'item N' form.
    month_num, shop_id, item_id = int(month), int(shop), int(item)

    category_matches = items.loc[items['item_id'] == item_id, 'item_category_id']
    if category_matches.empty:
        raise IndexError(f'item ID {item_id} was not found in the item table')
    item_cat = category_matches.iloc[0]

    item_prices = train.loc[train['item_id'] == item_id, 'item_price']
    if item_prices.empty:
        raise IndexError(f'item ID {item_id} has no sales rows to take a price from')
    # Most frequent price; Series.mode() is sorted, so ties resolve to the
    # smallest value. This avoids indexing scipy.stats.mode's result, whose
    # shape differs between SciPy versions.
    price = item_prices.mode().iloc[0]

    z = [
        'month ' + str(month_num),
        'shop ' + str(shop_id),
        'item_category ' + str(item_cat),
        'item ' + str(item_id),
        price,
    ]
    # A single row of mixed strings and a number, hence dtype=object.
    z = np.array(z, dtype=object).reshape(1, -1)
    z = ct.transform(z)
    z_pred = regressor.predict(z)

    return z_pred[0]

# Things to check!!!
# ***There are multiple prices available for the item in that specific shop, provide an option to choose price

In [16]:
# Run the interactive predictor; it prompts for the month, shop ID and item ID.
auto_predictor()

Please enter the month you want to predict. Enter the digit of the month, eg. Jan = 0 : 0
Please enter the shop ID, eg. 2 : 0
Please enter the item ID you wish to predict, eg. for item 55, enter '55' : 5629


2.841782529439241