In [1]:
__author__ = "konwar.m"
__copyright__ = "Copyright 2022, AI R&D"
__credits__ = ["konwar.m"]
__license__ = "Individual Ownership"
__version__ = "1.0.1"
__maintainer__ = "konwar.m"
__email__ = "rickykonwar@gmail.com"
__status__ = "Development"

In [2]:
# Importing Libraries
import os
import gc
import time
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb

from numpy import sqrt 
from sklearn.metrics import mean_squared_error

  from pandas import MultiIndex, Int64Index


### Importing Libraries

In [3]:
os.chdir('..')
os.getcwd()

'c:\\Users\\manash.jyoti.konwar\\Documents\\AI_Random_Projects\\ML-Retail-Sales'

### Loading Training Data and Features

In [4]:
train_test_set = pd.read_csv(r'datasets\training_datasets\trainset.csv')
with open(r'datasets\training_datasets\new_features.pkl', 'rb') as feature_file:
    new_features = pickle.load(feature_file)

In [5]:
print(train_test_set.shape)
print(len(new_features))

(11128050, 81)
75


In [6]:
train_test_set.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_price,item_cnt_month,item_category_id,prev_shopitem_sales_1,prev_shopitem_sales_2,prev_shopitem_sales_3,prev_shopitem_sales_4,...,item_name_tfidf_15,item_name_tfidf_16,item_name_tfidf_17,item_name_tfidf_18,item_name_tfidf_19,item_name_tfidf_20,item_name_tfidf_21,item_name_tfidf_22,item_name_tfidf_23,item_name_tfidf_24
0,59,22154,0,999.0,1,28,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,25,22154,0,999.0,5,28,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,24,22154,0,999.0,1,28,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,23,22154,0,702.9322,0,28,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,19,22154,0,702.9322,0,28,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Preparing baseline data

In [7]:
def prepare_data(**kwargs):
    train_test_data = kwargs.get('train_test_data')
    baseline_features = kwargs.get('baseline_features')
    
    # For training: rows having date_block_num from 0 to 31 is utilized for training the model
    train_time_range_lo = (train_test_data['date_block_num'] >= 0)
    train_time_range_hi =  (train_test_data['date_block_num'] <= 32)

    # For validation: rows having date_block_num as 33
    validation_time =  (train_test_data['date_block_num'] == 33)

    # For testing: rows having date_block_num from 34
    test_time =  (train_test_data['date_block_num'] == 34)

    # Retrieve rows for train set, val set, test set
    cv_trainset = train_test_data[train_time_range_lo & train_time_range_hi]
    cv_valset = train_test_data[validation_time]
    cv_trainset = cv_trainset[baseline_features]
    cv_valset = cv_valset[baseline_features]
    testset = train_test_data[test_time]
    testset = testset[baseline_features]

    print('Extracting training data started')
    # Prepare numpy arrays for training/val/test
    cv_trainset_vals = cv_trainset.values.astype(int)
    trainx = cv_trainset_vals[:, 0:len(baseline_features) - 1]
    trainy = cv_trainset_vals[:, len(baseline_features) - 1]
    print('Extracting training data ended')

    print('Extracting validation data started')
    cv_valset_vals = cv_valset.values.astype(int)
    valx = cv_valset_vals[:, 0:len(baseline_features) - 1]
    valy = cv_valset_vals[:, len(baseline_features) - 1]
    print('Extracting validation data ended')

    print('Extracting testing data started')
    testset_vals = testset.values.astype(int)
    testx = testset_vals[:, 0:len(baseline_features) - 1]
    print('Extracting testing data ended')

    return trainx, trainy, valx, valy, testx

### Training XG Boost Regressor

In [8]:
def train_xgboost(**kwargs):
    train_x = kwargs.get('train_x')
    train_y = kwargs.get('train_y')
    val_x = kwargs.get('val_x')
    val_y = kwargs.get('val_y')
    test_x = kwargs.get('test_x')
    perform_test_predictions = kwargs.get('perform_test_predictions')

    current = time.time()
    print('Fitting xg boost...')

    model = xgb.XGBRegressor(max_depth = 11, min_child_weight=0.5, subsample = 1, eta = 0.3, num_round = 1000, seed = 1, nthread = 4)
    model.fit(train_x, train_y, eval_metric='rmse')

    preds = model.predict(val_x)
    # Clipping to range 0-20
    preds = np.clip(preds, 0,20)
    print('val set rmse: ', sqrt(mean_squared_error(val_y, preds)))

    if perform_test_predictions:
        preds = model.predict(test_x)
        # Clipping to range 0-20
        preds = np.clip(preds, 0,20)
        df = pd.DataFrame(preds, columns = ['item_cnt_month'])
        df['ID'] = df.index
        df = df.set_index('ID')
        print('test predictions written to file')
        end = time.time()
        diff = end - current
        print('Took ' + str(int(diff)) + ' seconds to train and predict val, test set')
        return model, df
    else:
        end = time.time()
        diff = end - current
        print('Took ' + str(int(diff)) + ' seconds to train and predict val, test set')
        return model, pd.DataFrame(columns = ['item_cnt_month'])

In [9]:
baseline_features = ['shop_id', 'item_id', 'item_category_id', 'date_block_num'] +  new_features + ['item_cnt_month']
train_x_wo_price, train_y_wo_price, val_x_wo_price, val_y_wo_price, test_x_wo_price = prepare_data(
                                                                                        train_test_data=train_test_set,
                                                                                        baseline_features=baseline_features
                                                                                    )

Extracting training data started
Extracting training data ended
Extracting validation data started
Extracting validation data ended
Extracting testing data started
Extracting testing data ended


In [10]:
print(train_x_wo_price.shape)
train_x_wo_price

(10675678, 79)


array([[   59, 22154,    28, ...,     0,     0,     0],
       [   25, 22154,    28, ...,     0,     0,     0],
       [   24, 22154,    28, ...,     0,     0,     0],
       ...,
       [   34, 21937,    55, ...,     0,     0,     0],
       [   26, 21937,    55, ...,     0,     0,     0],
       [   27, 21937,    55, ...,     0,     0,     0]])

In [11]:
if not os.path.exists(os.path.join('models_outputs', 'model_trained_wo_price_xgb.pkl')):
    model_wo_price, test_predictions = train_xgboost(
                                        train_x = train_x_wo_price,
                                        train_y = train_y_wo_price,
                                        val_x = val_x_wo_price,
                                        val_y = val_y_wo_price,
                                        test_x = test_x_wo_price,
                                        perform_test_predictions = True
                                    )
    with open(os.path.join('models_outputs', 'model_trained_wo_price_xgb.pkl'), 'wb') as model_file:
        pickle.dump(model_wo_price, model_file)

Fitting xg boost...
Parameters: { "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


val set rmse:  1.079887443149056
test predictions written to file
Took 7728 seconds to train and predict val, test set


In [12]:
del [[train_x_wo_price, train_y_wo_price, val_x_wo_price, val_y_wo_price, test_x_wo_price]]
gc.collect()

25

In [13]:
baseline_features = ['shop_id', 'item_id', 'item_category_id', 'date_block_num', 'item_price'] +  new_features + ['item_cnt_month']
train_x_w_price, train_y_w_price, val_x_w_price, val_y_w_price, test_x_w_price = prepare_data(
                                                                                        train_test_data=train_test_set,
                                                                                        baseline_features=baseline_features
                                                                                    )

Extracting training data started
Extracting training data ended
Extracting validation data started
Extracting validation data ended
Extracting testing data started
Extracting testing data ended


In [14]:
print(train_x_w_price.shape)
train_x_w_price

(10675678, 80)


array([[   59, 22154,    28, ...,     0,     0,     0],
       [   25, 22154,    28, ...,     0,     0,     0],
       [   24, 22154,    28, ...,     0,     0,     0],
       ...,
       [   34, 21937,    55, ...,     0,     0,     0],
       [   26, 21937,    55, ...,     0,     0,     0],
       [   27, 21937,    55, ...,     0,     0,     0]])

In [15]:
if not os.path.exists(os.path.join('models_outputs', 'model_trained_w_price_xgb.pkl')):
    model_w_price, test_predictions = train_xgboost(
                                        train_x = train_x_w_price,
                                        train_y = train_y_w_price,
                                        val_x = val_x_w_price,
                                        val_y = val_y_w_price,
                                        test_x = test_x_w_price,
                                        perform_test_predictions = False
                                    )
    with open(os.path.join('models_outputs', 'model_trained_w_price_xgb.pkl'), 'wb') as model_file:
        pickle.dump(model_w_price, model_file)

Fitting xg boost...
Parameters: { "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


val set rmse:  1.0324293291666302
Took 6782 seconds to train and predict val, test set


In [16]:
del [[train_x_w_price, train_y_w_price, val_x_w_price, val_y_w_price, test_x_w_price]]
gc.collect()

94