In [1]:
# Notebook authorship / release metadata, kept as module-level dunder constants.
__author__ = "konwar.m"
__copyright__ = "Copyright 2022, AI R&D"
__credits__ = ["konwar.m"]
__license__ = "Individual Ownership"
__version__ = "1.0.1"
__maintainer__ = "konwar.m"
__email__ = "rickykonwar@gmail.com"
__status__ = "Development"

### Importing Libraries

In [2]:
import os
import copy
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
# Move the working directory one level up so the relative 'datasets' paths used
# by the following cells resolve from the parent folder.
# NOTE(review): '..' is relative to the *current* directory, so re-running this
# cell without restarting the kernel climbs one more level each time.
os.chdir('..')
os.getcwd()

'c:\\Users\\manash.jyoti.konwar\\Documents\\AI_Random_Projects\\ML-Retail-Sales'

### Loading Training Data and Features

In [4]:
# Containers for the per-category frames loaded from disk; filled by the
# loading cell further down (one entry per parent category).
finalized_train_dict = {}
finalized_test_dict = {}

In [5]:
# Directory holding the per-category train/test CSVs and the category map.
# Built with os.path.join (instead of a backslash literal) so the notebook also
# runs on non-Windows systems, and reused below so the location is defined once.
train_test_path = os.path.join('datasets', 'train_test_datasets_v02')

# os.listdir returns entries in arbitrary, platform-dependent order; sorting
# makes the processing order of the categories reproducible.
files = sorted(os.listdir(train_test_path))

training_files = [name for name in files if name.endswith('.csv') and name.startswith('train')]
testing_files = [name for name in files if name.endswith('.csv') and name.startswith('test')]

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file when it comes from a trusted source.
with open(os.path.join(train_test_path, 'parent_cat_map.pkl'), "rb") as fp:
    parent_category_map = pickle.load(fp)

# Reverse lookup (map value -> map key), used to translate the numeric id in a
# file name back to its parent category name.
parent_category_inv_map = {v: k for k, v in parent_category_map.items()}

In [6]:
# Display the inverted category map (values of parent_category_map as keys) for inspection.
parent_category_inv_map

{0: 'Accessories',
 1: 'Android Games',
 2: 'Books',
 3: 'Cinema',
 4: 'Clean Media (Piece)',
 5: 'Clean Media (Spire)',
 6: 'Delivery Of Goods',
 7: 'Film',
 8: 'Games',
 9: 'Gaming Consoles',
 10: 'Gifts',
 11: 'Mac Games',
 12: 'Music',
 13: 'Pc',
 14: 'Pc Games',
 15: 'Payment Cards (Cinema, Music, Games)',
 16: 'Payment Cards',
 17: 'Power Elements',
 18: 'Programs',
 19: 'Service',
 20: 'Tickets (Digit)',
 21: 'Игры'}

In [7]:
# Load one DataFrame per parent category, keyed by the category name.
# The second underscore-separated token of each file name is parsed as the
# numeric category id and translated to its name via parent_category_inv_map.
for train_file in tqdm(training_files, desc='Loading training files'):
    train_category = parent_category_inv_map.get(int(train_file.split('_')[1]))
    finalized_train_dict[train_category] = pd.read_csv(os.path.join(train_test_path, train_file))

# The key has to be derived from test_file itself: reusing the leftover
# train_file loop variable gives every test frame the same key, so each load
# overwrites the previous one and only a single test set survives.
for test_file in tqdm(testing_files, desc='Loading testing files'):
    test_category = parent_category_inv_map.get(int(test_file.split('_')[1]))
    finalized_test_dict[test_category] = pd.read_csv(os.path.join(train_test_path, test_file))

Loading training files: 100%|██████████| 21/21 [00:15<00:00,  1.39it/s]
Loading testing files: 100%|██████████| 21/21 [00:02<00:00,  7.91it/s]


### Preparing baseline data

In [8]:
def prepare_data(**kwargs):
    """Split one category's data into train/validation arrays plus a feature table.

    Keyword Args:
        train_test_data: DataFrame containing at least 'date_block_num',
            'priceratio_parent_category' and every column named in
            ``baseline_features``.
        baseline_features: list of column names whose LAST entry is the target
            column; all preceding entries are model inputs.
        group_info: dict with 'group_name' and 'group_id' used to label the
            returned feature table.

    Returns:
        Tuple ``(trainx, trainy, valx, valy, df_features)`` where the x/y values
        are integer numpy arrays and ``df_features`` has one row per input
        feature with columns 'feature_name', 'group_name', 'group_id'.
    """
    frame = kwargs.get('train_test_data')
    base_columns = kwargs.get('baseline_features')
    group_info = kwargs.get('group_info')
    group_name = group_info.get('group_name')
    group_id = group_info.get('group_id')

    # Every column positioned after 'priceratio_parent_category' in the frame is
    # treated as a category-specific (dynamic) feature.
    all_columns = list(frame.columns)
    dynamic_columns = all_columns[all_columns.index('priceratio_parent_category') + 1:]

    # Inputs first (baseline inputs, then dynamic ones), target column last.
    selected_columns = base_columns[:-1] + dynamic_columns + [base_columns[-1]]
    n_inputs = len(selected_columns) - 1

    # Training rows: date_block_num 0..32 inclusive; validation rows: date_block_num 33.
    is_train = (frame['date_block_num'] >= 0) & (frame['date_block_num'] <= 32)
    is_validation = frame['date_block_num'] == 33

    train_part = frame[is_train][selected_columns]
    validation_part = frame[is_validation][selected_columns]

    print('Extracting training data started')
    # NOTE(review): the cast to int drops any fractional part (e.g. prices or
    # ratios, if those columns hold floats) — confirm this is intended.
    train_values = train_part.values.astype(int)
    trainx, trainy = train_values[:, 0:n_inputs], train_values[:, n_inputs]
    print('Extracting training data ended')

    print('Extracting validation data started')
    validation_values = validation_part.values.astype(int)
    valx, valy = validation_values[:, 0:n_inputs], validation_values[:, n_inputs]
    print('Extracting validation data ended')

    print('Extracting features dataframe started')
    # One row per input feature, tagged with the group it belongs to.
    df_features = pd.DataFrame(selected_columns[:-1], columns=['feature_name'])
    df_features['group_name'] = group_name
    df_features['group_id'] = group_id
    print('Extracting features dataframe ended')

    return trainx, trainy, valx, valy, df_features

In [9]:
# Columns shared by every parent category. prepare_data treats the last entry
# ('item_cnt_day') as the target and everything before it as model inputs.
baseline_features = [
    'date_block_num', 'item_id', 'shop_id', 'item_price', 'item_category_id',
    'count_item_week_shop', 'price_mean_week_category',
    'price_lag_item_1', 'price_lag_item_4', 'price_lag_item_12', 'price_lag_item_24',
    'price_lag_item_shop_1', 'price_lag_item_shop_4', 'price_lag_item_shop_12', 'price_lag_item_shop_24',
    'week_block_num', 'month', 'seasonal_index', 'priceratio_parent_category',
    'item_cnt_day',
]

# parent category name -> dict of train/validation arrays and its feature table
finalized_modelling_dict = {}

for parent_cat in finalized_train_dict.keys():
    print(parent_cat)

    group_info = {'group_name': parent_cat, 'group_id': parent_category_map.get(parent_cat)}
    train_x_w_price, train_y_w_price, val_x_w_price, val_y_w_price, df_features = prepare_data(
        train_test_data=finalized_train_dict.get(parent_cat),
        baseline_features=baseline_features,
        group_info=group_info,
    )

    # Store the split only the first time a category is seen.
    finalized_modelling_dict.setdefault(
        parent_cat,
        {
            'train_x': train_x_w_price,
            'train_y': train_y_w_price,
            'val_x': val_x_w_price,
            'val_y': val_y_w_price,
            'features_df': df_features,
        },
    )

    # Shapes of: train inputs, train target, validation inputs, validation target, feature table.
    print(*(
        part.shape
        for part in (train_x_w_price, train_y_w_price, val_x_w_price, val_y_w_price, df_features)
    ))

Accessories
Extracting training data started
Extracting training data ended
Extracting validation data started
Extracting validation data ended
Extracting features dataframe started
Extracting features dataframe ended
(43576, 24) (43576,) (1121, 24) (1121,) (24, 3)
Gifts
Extracting training data started
Extracting training data ended
Extracting validation data started
Extracting validation data ended
Extracting features dataframe started
Extracting features dataframe ended
(155946, 26) (155946,) (5268, 26) (5268,) (26, 3)
Mac Games
Extracting training data started
Extracting training data ended
Extracting validation data started
Extracting validation data ended
Extracting features dataframe started
Extracting features dataframe ended
(7, 20) (7,) (1, 20) (1,) (20, 3)
Music
Extracting training data started
Extracting training data ended
Extracting validation data started
Extracting validation data ended
Extracting features dataframe started
Extracting features dataframe ended
(368381, 2

### Extracting features dataframe

#### Forming base features dataframe

In [10]:
# Stack the per-category feature tables into one frame.
# The tables are gathered first and concatenated once: concatenating inside the
# loop re-copies the accumulated frame on every iteration, and with a single
# category it would hand back the stored table itself rather than a copy.
df_features = pd.DataFrame()

if finalized_modelling_dict:
    feature_frames = [
        finalized_modelling_dict.get(parent_cat).get('features_df')
        for parent_cat in tqdm(finalized_modelling_dict.keys(), desc='Generating features dataframe')
    ]
    # ignore_index=True renumbers the rows 0..n-1 across all categories.
    df_features = pd.concat(feature_frames, ignore_index=True)

df_features

Generating features dataframe: 100%|██████████| 21/21 [00:00<00:00, 778.18it/s]


Unnamed: 0,feature_name,group_name,group_id
0,date_block_num,Accessories,0
1,item_id,Accessories,0
2,shop_id,Accessories,0
3,item_price,Accessories,0
4,item_category_id,Accessories,0
...,...,...,...
469,priceratio_item_category_39,Gaming Consoles,9
470,priceratio_item_category_40,Gaming Consoles,9
471,priceratio_item_category_41,Gaming Consoles,9
472,priceratio_item_category_42,Gaming Consoles,9


#### Adding feature types

In [11]:
def classify_feature_type(feature_name):
    """Return the coarse feature type for a feature name.

    The name is split on '_' and matched on whole tokens, first hit wins:
    'id' -> 'id'; 'price' together with 'lag' -> 'lag price'; 'price' -> 'price';
    'priceratio' -> 'price ratio'; 'seasonal' -> 'seasonality'; otherwise 'generic'.
    """
    tokens = feature_name.split('_')

    if 'id' in tokens:
        return 'id'
    if 'price' in tokens and 'lag' in tokens:
        return 'lag price'
    if 'price' in tokens:
        return 'price'
    if 'priceratio' in tokens:
        return 'price ratio'
    if 'seasonal' in tokens:
        return 'seasonality'
    return 'generic'


# Drop a previously computed column so the cell can be re-run; drop() returns a
# new frame, so the column added below never touches the per-category tables.
df_features = df_features.drop(columns=['feature_type'], errors='ignore')

if 'feature_name' in df_features.columns:
    # Vectorised mapping: avoids seeding a float NaN column and then writing
    # strings into it row by row through positional indexing.
    feature_types = df_features['feature_name'].map(classify_feature_type)
else:
    # Empty placeholder frame (no categories were prepared): nothing to classify.
    feature_types = pd.Series(index=df_features.index, dtype=object)

df_features['feature_type'] = feature_types

df_features

Unnamed: 0,feature_name,group_name,group_id,feature_type
0,date_block_num,Accessories,0,generic
1,item_id,Accessories,0,id
2,shop_id,Accessories,0,id
3,item_price,Accessories,0,price
4,item_category_id,Accessories,0,id
...,...,...,...,...
469,priceratio_item_category_39,Gaming Consoles,9,price ratio
470,priceratio_item_category_40,Gaming Consoles,9,price ratio
471,priceratio_item_category_41,Gaming Consoles,9,price ratio
472,priceratio_item_category_42,Gaming Consoles,9,price ratio
