In [1]:
# Notebook-level metadata (authorship, licence, version and contact details).
# These are plain module-style dunder constants; nothing else in the visible cells reads them.
__author__ = "konwar.m"
__copyright__ = "Copyright 2022, AI R&D"
__credits__ = ["konwar.m"]
__license__ = "Individual Ownership"
__version__ = "1.0.1"
__maintainer__ = "konwar.m"
__email__ = "rickykonwar@gmail.com"
__status__ = "Development"

### Importing Libraries

In [2]:
import os
import numpy as np
import pandas as pd

from datetime import timedelta
from sklearn import preprocessing

In [3]:
# Move up to the project root so the relative 'datasets' paths used below resolve.
# The guard makes the cell idempotent: an unconditional chdir('..') would climb one
# more directory every time the cell is re-run without restarting the kernel.
if not os.path.isdir('datasets'):
    os.chdir('..')
os.getcwd()

'c:\\Users\\manash.jyoti.konwar\\Documents\\AI_Random_Projects\\ML-Retail-Sales'

### Loading Data

In [4]:
# Read every input table from the project's data folder.
# Paths are built with os.path.join instead of a hard-coded '\' separator so the
# notebook also runs on Linux/macOS, where a backslash is part of the file name.
DATA_DIR = 'datasets'

sales_train = pd.read_csv(os.path.join(DATA_DIR, 'sales_train.csv'))
items = pd.read_csv(os.path.join(DATA_DIR, 'translated_items.csv'))
shops = pd.read_csv(os.path.join(DATA_DIR, 'translated_shops.csv'))
item_categories = pd.read_csv(os.path.join(DATA_DIR, 'translated_item_categories.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

### Aggregation of data

In [5]:
# Tag every daily sale with the Monday that starts its week, so rows can later be
# grouped by week instead of by day.
train_data = sales_train.copy()

# The dates are parsed as day.month.year. With an explicit format there is nothing
# to infer, so the deprecated infer_datetime_format flag is not passed.
train_data['date'] = pd.to_datetime(train_data['date'], format='%d.%m.%Y')

# weekday is 0 for Monday, so subtracting that many days lands on the week's Monday.
# Done as vectorised datetime arithmetic rather than a per-row Python lambda.
train_data['week_start_date'] = (
    train_data['date'] - pd.to_timedelta(train_data['date'].dt.weekday, unit='D')
)
train_data.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,week_start_date
0,2013-01-02,0,59,22154,999.0,1.0,2012-12-31
1,2013-01-03,0,25,2552,899.0,1.0,2012-12-31
2,2013-01-05,0,25,2552,899.0,-1.0,2012-12-31
3,2013-01-06,0,25,2554,1709.05,1.0,2012-12-31
4,2013-01-15,0,25,2555,1099.0,1.0,2013-01-14


In [6]:
# Attach each item's category id to the sales rows (left join keeps every sale,
# including any whose item_id has no match in the items table).
train_data = train_data.merge(
    items.loc[:, ['item_id', 'item_category_id']],
    on='item_id',
    how='left',
)
train_data.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,week_start_date,item_category_id
0,2013-01-02,0,59,22154,999.0,1.0,2012-12-31,37
1,2013-01-03,0,25,2552,899.0,1.0,2012-12-31,58
2,2013-01-05,0,25,2552,899.0,-1.0,2012-12-31,58
3,2013-01-06,0,25,2554,1709.05,1.0,2012-12-31,58
4,2013-01-15,0,25,2555,1099.0,1.0,2013-01-14,56


### Data Preprocessing

In [7]:
# Re-number the item categories with a LabelEncoder fitted on the translated category
# names, then swap the original category ids in the training data for the new ones.
lb = preprocessing.LabelEncoder()
l_cat = list(item_categories.translated_item_category_name)

# Add the encoded id and the translated name next to the original columns so the
# original id can still be used as the join key below.
item_categories = item_categories.assign(
    item_category_id_fix=lb.fit_transform(l_cat),
    item_category_name_fix=l_cat,
)

# Replace the original category id on every sales row with the encoded one.
# Chained, non-inplace calls avoid the `_ = df.drop(..., inplace=True)` pattern,
# which binds None to `_` and shadows IPython's last-output variable.
train_data = (
    train_data
    .merge(
        item_categories[['item_category_id', 'item_category_id_fix']],
        on='item_category_id',
        how='left',
    )
    .drop(columns=['item_category_id'])
    .rename(columns={'item_category_id_fix': 'item_category_id'})
)

# Reduce the lookup table to (encoded id, translated name), one row per category,
# ordered by the encoded id. reset_index(drop=True) already yields a clean 0..n-1
# index, so no separate manual index assignment is needed.
item_categories = (
    item_categories
    .drop(columns=['item_category_id', 'item_category_name', 'translated_item_category_name'])
    .rename(columns={
        'item_category_id_fix': 'item_category_id',
        'item_category_name_fix': 'item_category_name',
    })
    .drop_duplicates()
    .sort_values(by=['item_category_id'])
    .reset_index(drop=True)
)
item_categories

Unnamed: 0,item_category_id,item_category_name
0,0,Accessories - PS2.
1,1,Accessories - PS3.
2,2,Accessories - PS4.
3,3,Accessories - PSP.
4,4,Accessories - Xbox 360
...,...,...
79,79,Programs - training
80,80,Service
81,81,Service - Tickets
82,82,Tickets (digit)


In [8]:
# Drop categories that have too few rows (fewer than MIN_ROWS_PER_CATEGORY sales records).
MIN_ROWS_PER_CATEGORY = 10

# Count rows per category in a single pass instead of re-scanning the whole frame
# once for every category.
category_counts = train_data['item_category_id'].value_counts(dropna=False)
sparse_categories = category_counts.index[category_counts < MIN_ROWS_PER_CATEGORY]

# List the sparse categories in order of first appearance in the data, as plain
# Python values so the printed list does not depend on numpy's scalar repr.
categories_seen = pd.Series(train_data['item_category_id'].unique())
category_2_remove = categories_seen[categories_seen.isin(sparse_categories)].tolist()
print(category_2_remove)

# Keep only rows from the remaining categories, ordered by week and then category.
train_data = (
    train_data
    .loc[~train_data['item_category_id'].isin(category_2_remove)]
    .sort_values(by=['week_start_date', 'item_category_id'])
    .reset_index(drop=True)
)
train_data.head()

[63, 0, 14, 36, 19, 15, 12, 56, 20]


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,week_start_date,item_category_id
0,2013-01-02,0,25,5629,2390.0,1.0,2012-12-31,1
1,2013-01-05,0,25,5632,2390.0,1.0,2012-12-31,1
2,2013-01-02,0,25,5651,5490.0,-1.0,2012-12-31,1
3,2013-01-04,0,25,5627,2390.0,1.0,2012-12-31,1
4,2013-01-05,0,25,5575,550.01,1.0,2012-12-31,1


In [9]:
# Bound the daily item counts to the closed range [0, 20]; negative counts
# become 0 and anything above 20 becomes 20.
daily_counts = train_data['item_cnt_day']
train_data['item_cnt_day'] = daily_counts.clip(lower=0, upper=20)
train_data.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,week_start_date,item_category_id
0,2013-01-02,0,25,5629,2390.0,1.0,2012-12-31,1
1,2013-01-05,0,25,5632,2390.0,1.0,2012-12-31,1
2,2013-01-02,0,25,5651,5490.0,0.0,2012-12-31,1
3,2013-01-04,0,25,5627,2390.0,1.0,2012-12-31,1
4,2013-01-05,0,25,5575,550.01,1.0,2012-12-31,1
