### Importing Libraries

In [None]:
# Importing Libraries
import os
import time
from itertools import product
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from matplotlib import pyplot
from numpy import loadtxt
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from xgboost import plot_tree

In [None]:
# Step up one directory so the relative 'datasets' paths used by the loading
# cells resolve (presumably the notebook lives one level below the project
# root -- confirm). The guard keeps the cell idempotent: re-running it no
# longer climbs one directory higher each time.
if not os.path.isdir('datasets'):
    os.chdir('..')
os.getcwd()

### Loading Data

In [None]:
# Read the raw CSV tables from the 'datasets' folder.
# Forward slashes are used because they resolve on Windows, Linux and macOS,
# whereas a backslash is a literal filename character outside Windows.
sales_train = pd.read_csv('datasets/sales_train.csv')
items = pd.read_csv('datasets/translated_items.csv')
shops = pd.read_csv('datasets/translated_shops.csv')
item_categories = pd.read_csv('datasets/translated_item_categories.csv')
test = pd.read_csv('datasets/test.csv')
sample_submission = pd.read_csv('datasets/sample_submission.csv')

### Aggregation of data 

In [None]:
# Build the training grid: for every month (date_block_num) take the cross
# product of the shops and the items that appear in that month's sales, so that
# shop/item pairs without a sale in that month are still represented.
index_cols = ['shop_id', 'item_id', 'date_block_num']
monthly_grids = []
for block_num in sales_train['date_block_num'].unique():
    in_month = sales_train['date_block_num'] == block_num
    cur_shops = sales_train.loc[in_month, 'shop_id'].unique()
    cur_items = sales_train.loc[in_month, 'item_id'].unique()
    combos = list(product(cur_shops, cur_items, [block_num]))
    monthly_grids.append(np.array(combos, dtype='int32'))
# Stack the per-month blocks into a single frame with one row per combination.
grid = pd.DataFrame(np.vstack(monthly_grids), columns=index_cols, dtype=np.int32)
grid

In [None]:
# Roll the daily records up to one row per (shop, item, month):
# daily counts are clipped to [0, 20] and summed, prices are averaged, and the
# resulting monthly count is clipped to [0, 20] again.
sales_train['item_cnt_day'] = sales_train['item_cnt_day'].clip(lower=0, upper=20)
groups = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'])
trainset = (
    groups
    .agg({'item_cnt_day': 'sum', 'item_price': 'mean'})
    .reset_index()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)
trainset['item_cnt_month'] = trainset['item_cnt_month'].clip(lower=0, upper=20)
trainset

In [None]:
# Attach the monthly aggregates to the full grid. Grid rows that have no
# matching sales row get a monthly count of 0 (their price stays missing).
trainset = grid.merge(trainset, on=index_cols, how='left')
trainset['item_cnt_month'] = trainset['item_cnt_month'].fillna(0)
trainset

In [None]:
# Look up each item's category id from the items table.
item_to_category = items[['item_id', 'item_category_id']]
trainset = trainset.merge(item_to_category, on='item_id')
trainset

### Feature Engineering

In [None]:
# Fix the NumPy random seed and widen the pandas display limits.
np.random.seed(10)
for option_name, option_value in (
    ('display.max_rows', 231),
    ('display.max_columns', 100),
):
    pd.set_option(option_name, option_value)

In [None]:
# Names of the engineered feature columns; the feature cells append to this list.
new_features = []
# One on/off switch per feature idea (index = idea number).
enable_feature_idea = [True] * 10

# Lags, in months, used by the lag features (a shorter alternative: [1, 2, 3, 12]).
lookback_range = list(range(1, 13))

# Registers DataFrame.progress_apply so row-wise applies show a progress bar.
tqdm.pandas()

# Range of months to keep; currently every month present in the training set.
start_month_index = trainset.date_block_num.min()
end_month_index = trainset.date_block_num.max()

In [None]:
# Start of the timing window that the concat cell reports on.
current = time.time()

# Fix the column order and keep only the configured month range.
trainset = trainset[['shop_id', 'item_id', 'item_category_id', 'date_block_num', 'item_price', 'item_cnt_month']]
trainset = trainset[(trainset.date_block_num >= start_month_index) & (trainset.date_block_num <= end_month_index)]

print('Loading test set...')
# Columns 1 and 2 of test.csv are read as shop_id and item_id (column 0 is skipped).
# Forward slash keeps the path valid on non-Windows systems as well.
test_dataset = loadtxt('datasets/test.csv', delimiter=",", skiprows=1, usecols=(1, 2), dtype=int)
testset = pd.DataFrame(test_dataset, columns=['shop_id', 'item_id'])

print('Merging with other datasets...')
# Get item category id into the test set
testset = testset.merge(items[['item_id', 'item_category_id']], on='item_id', how='left')
# The test rows are placed in month 34.
testset['date_block_num'] = 34
# Placeholder target so the test set can be concatenated row-wise with the
# training set; -1 marks "unknown".
testset['item_cnt_month'] = -1
testset

In [None]:
# Stack train and test rows into one frame so features can be computed for both
# at once. testset has no item_price column, so its rows get NaN there.
train_test_set = pd.concat([trainset, testset], axis=0)

# Report the time elapsed since `current` was taken in the preparation cell.
# The message now describes what is actually being timed (data preparation,
# not model training / validation).
end = time.time()
diff = end - current
print('Took ' + str(int(diff)) + ' seconds to build the combined train/test set')

In [None]:
# Prepare a label encoder and the list of translated category names it will be
# fitted on in the next cell.
lb = preprocessing.LabelEncoder()
l_cat = list(item_categories['translated_item_category_name'])

In [None]:
# Re-encode categories: every distinct translated category name gets one integer
# code from the label encoder, and that code replaces item_category_id.
category_codes = lb.fit_transform(l_cat)

# Old id -> new code lookup, taken before item_categories is reshaped below.
category_map = item_categories[['item_category_id']].assign(item_category_id_fix=category_codes)

# Swap the old category id for the new code in the combined train/test frame.
train_test_set = (
    train_test_set
    .merge(category_map, on='item_category_id', how='left')
    .drop(columns=['item_category_id'])
    .rename(columns={'item_category_id_fix': 'item_category_id'})
)

# Rebuild item_categories around the new code and the translated name.
# Chained, non-inplace calls replace the former `_ = df.drop(..., inplace=True)`
# statements, which only bound None to `_`; the explicit index reassignment is
# gone because reset_index(drop=True) after the sort already renumbers the rows.
item_categories = (
    item_categories
    .assign(item_category_id_fix=category_codes, item_category_name_fix=l_cat)
    .drop(columns=['item_category_id', 'item_category_name', 'translated_item_category_name'])
    .rename(columns={
        'item_category_id_fix': 'item_category_id',
        'item_category_name_fix': 'item_category_name',
    })
    .drop_duplicates()
    .sort_values(by=['item_category_id'])
    .reset_index(drop=True)
)
item_categories

Idea 0: Add previous shop/item sales as feature (Lag feature)

In [None]:
if enable_feature_idea[0]:
    key_cols = ['shop_id', 'item_id', 'date_block_num']
    for diff in tqdm(lookback_range):
        feature_name = 'prev_shopitem_sales_' + str(diff)
        # Shift the month forward by `diff` so that, after the merge, each row
        # sees the sales of the same shop/item pair `diff` months earlier.
        # Only the keys and the target are taken, instead of copying the whole
        # (ever-widening) frame on every iteration.
        lagged = train_test_set[key_cols + ['item_cnt_month']]
        lagged = (
            lagged
            .assign(date_block_num=lagged['date_block_num'] + diff)
            .rename(columns={'item_cnt_month': feature_name})
        )
        train_test_set = train_test_set.merge(lagged, on=key_cols, how='left')
        # No matching row `diff` months back -> treated as zero sales.
        train_test_set[feature_name] = train_test_set[feature_name].fillna(0)
        new_features.append(feature_name)
train_test_set.head(3)

Idea 1: Add previous item sales as feature (Lag feature)

In [None]:
if enable_feature_idea[1]:
    key_cols = ['item_id', 'date_block_num']
    # Mean monthly sales per item and month. It does not depend on the lag, so
    # it is computed once instead of once per loop iteration.
    item_month_sales = (
        train_test_set
        .groupby(by=key_cols)
        .agg({'item_cnt_month': 'mean'})
        .reset_index()
    )
    for diff in tqdm(lookback_range):
        feature_name = 'prev_item_sales_' + str(diff)
        # Shift the month forward by `diff` so the merge picks up the value
        # from `diff` months earlier.
        result = (
            item_month_sales
            .assign(date_block_num=item_month_sales['date_block_num'] + diff)
            .rename(columns={'item_cnt_month': feature_name})
        )
        train_test_set = train_test_set.merge(result, on=key_cols, how='left')
        # No matching item/month `diff` months back -> treated as zero sales.
        train_test_set[feature_name] = train_test_set[feature_name].fillna(0)
        new_features.append(feature_name)
train_test_set.head(3)

Idea 2: Add previous shop/item price as feature (Lag feature)

In [None]:
if enable_feature_idea[2]:
    key_cols = ['shop_id', 'item_id', 'date_block_num']
    # Mean price per shop/item/month. Independent of the lag, so computed once
    # rather than on every loop iteration.
    shopitem_month_price = (
        train_test_set
        .groupby(by=key_cols)
        .agg({'item_price': 'mean'})
        .reset_index()
    )
    for diff in tqdm(lookback_range):
        feature_name = 'prev_shopitem_price_' + str(diff)
        # Shift the month forward by `diff` so the merge picks up the price
        # from `diff` months earlier.
        result = (
            shopitem_month_price
            .assign(date_block_num=shopitem_month_price['date_block_num'] + diff)
            .rename(columns={'item_price': feature_name})
        )
        # Missing prices are left as NaN here (no fillna, as before); the former
        # self-assignment of the column was a no-op and has been removed.
        train_test_set = train_test_set.merge(result, on=key_cols, how='left')
        new_features.append(feature_name)
train_test_set.head(3)

Idea 3: Add previous item price as feature (Lag feature)

In [None]:
if enable_feature_idea[3]:
    key_cols = ['item_id', 'date_block_num']
    # Mean price per item and month across shops. Independent of the lag, so
    # computed once rather than on every loop iteration.
    item_month_price = (
        train_test_set
        .groupby(by=key_cols)
        .agg({'item_price': 'mean'})
        .reset_index()
    )
    for diff in tqdm(lookback_range):
        feature_name = 'prev_item_price_' + str(diff)
        # Shift the month forward by `diff` so the merge picks up the price
        # from `diff` months earlier.
        result = (
            item_month_price
            .assign(date_block_num=item_month_price['date_block_num'] + diff)
            .rename(columns={'item_price': feature_name})
        )
        # Missing prices are left as NaN here (no fillna, as before); the former
        # self-assignment of the column was a no-op and has been removed.
        train_test_set = train_test_set.merge(result, on=key_cols, how='left')
        new_features.append(feature_name)
train_test_set.head(3)

Idea 4: Mean encodings for shop/item pairs (Mean encoding, doesn't work for me)

In [None]:
def create_mean_encodings(train_test_set, categorical_var_list, target,
                          val_month=33, test_month=34, feature_list=None):
    """Add a mean-encoding column of `target` grouped by `categorical_var_list`.

    Rows with date_block_num <= val_month receive the group mean computed from
    months strictly before `val_month`, so the validation month does not leak
    into its own encoding. Rows with date_block_num == test_month receive the
    group mean computed from all months up to and including `val_month`.
    Rows from any other month are not part of the result.

    Parameters
    ----------
    train_test_set : pandas.DataFrame
        Must contain `date_block_num`, `target` and every column named in
        `categorical_var_list`. It is not modified.
    categorical_var_list : list of str
        Columns to group by.
    target : str
        Column whose mean is encoded.
    val_month : int, default 33
        Validation month; excluded when encoding the training/validation rows.
    test_month : int, default 34
        Month holding the test rows.
    feature_list : list, optional
        List that receives the new column name. Defaults to the notebook-level
        `new_features` list.

    Returns
    -------
    pandas.DataFrame
        Training/validation rows followed by test rows, with the extra column
        "<vars joined by _>_<target>_mean" and a fresh 0..n-1 index.
    """
    feature_name = "_".join(categorical_var_list) + "_" + target + "_mean"

    # Drop a stale copy of the column so re-running the cell does not produce
    # suffixed "_x"/"_y" duplicates in the merges below.
    train_test_set = train_test_set.drop(columns=[feature_name], errors='ignore')

    def _group_mean(frame):
        # Mean of `target` per group, returned under the feature's column name.
        grouped = frame.groupby(categorical_var_list, as_index=False)[[target]].mean()
        return grouped.rename(columns={target: feature_name})

    months = train_test_set.date_block_num
    train_val_rows = train_test_set[months <= val_month]
    test_rows = train_test_set[months == test_month]

    # Encoding for train/validation rows: the validation month is excluded.
    mean_before_val = _group_mean(train_test_set[months <= val_month - 1])
    # Encoding for test rows: every month up to and including validation.
    mean_through_val = _group_mean(train_val_rows)

    train_val_rows = train_val_rows.merge(mean_before_val, on=categorical_var_list, how='left')
    test_rows = test_rows.merge(mean_through_val, on=categorical_var_list, how='left')

    # ignore_index avoids the duplicate index labels that stacking two freshly
    # merged (each 0-based) frames would otherwise create.
    encoded = pd.concat([train_val_rows, test_rows], axis=0, ignore_index=True)

    # Register the feature once, even when the function is called repeatedly.
    registry = new_features if feature_list is None else feature_list
    if feature_name not in registry:
        registry.append(feature_name)
    return encoded

In [None]:
# The function returns a new frame rather than modifying its argument, so the
# result must be assigned back. Previously it was discarded: the feature name was
# registered in new_features but the column never reached train_test_set.
train_test_set = create_mean_encodings(train_test_set, ['shop_id', 'item_id'], 'item_cnt_month')
train_test_set.head(3)

Idea 5: Mean encodings for items (Mean encoding, doesn't work for me)

In [None]:
# Mean-encode the monthly count per item and keep the returned frame.
train_test_set = create_mean_encodings(
    train_test_set=train_test_set,
    categorical_var_list=['item_id'],
    target='item_cnt_month',
)
train_test_set.head(3)

Idea 6: Number of months since the last sale of the shop/item pair (Use info from past)

In [None]:
def create_last_sale_shop_item(row):
    """Return the smallest lag (1..33) whose shop/item sales value is non-zero.

    `row` must be indexable by '_prev_shopitem_sales_<lag>'. Lags are checked
    in increasing order and the scan stops at the first non-zero value.
    Returns np.nan when every checked lag is zero.
    """
    lags_with_sales = (
        lag
        for lag in range(1, 34)
        if row['_prev_shopitem_sales_' + str(lag)] != 0.0
    )
    return next(lags_with_sales, np.nan)

# This idea looks back over 33 months instead of the 12 used by the lag features.
lookback_range = list(range(1, 33 + 1))
if enable_feature_idea[6]:
    key_cols = ['shop_id', 'item_id', 'date_block_num']
    for diff in tqdm(lookback_range):
        feature_name = '_prev_shopitem_sales_' + str(diff)
        # Shift the month forward by `diff` so the merge picks up the sales of
        # the same shop/item pair `diff` months earlier. Only the keys and the
        # target are taken, instead of copying the whole frame per iteration.
        lagged = train_test_set[key_cols + ['item_cnt_month']]
        lagged = (
            lagged
            .assign(date_block_num=lagged['date_block_num'] + diff)
            .rename(columns={'item_cnt_month': feature_name})
        )
        train_test_set = train_test_set.merge(lagged, on=key_cols, how='left')
        train_test_set[feature_name] = train_test_set[feature_name].fillna(0)
        # The '_prev_shopitem_sales_*' helper columns are intentionally not
        # registered in new_features; only the derived column below is.

    # Kept inside the guard: the row function reads the helper columns created
    # above, so running it with the idea disabled would raise a KeyError and
    # register a feature that does not exist.
    train_test_set['last_sale_shop_item'] = train_test_set.progress_apply(create_last_sale_shop_item, axis=1)
    new_features.append('last_sale_shop_item')

Idea 7: Number of months since the last sale of the item (Use info from past)

In [None]:
def create_last_sale_item(row):
    """Return the smallest lag (1..33) whose item sales value is non-zero.

    `row` must be indexable by '_prev_item_sales_<lag>'. Lags are checked in
    increasing order and the scan stops at the first non-zero value.
    Returns np.nan when every checked lag is zero.
    """
    lags_with_sales = (
        lag
        for lag in range(1, 34)
        if row['_prev_item_sales_' + str(lag)] != 0.0
    )
    return next(lags_with_sales, np.nan)
    
# This idea looks back over 33 months instead of the 12 used by the lag features.
lookback_range = list(range(1, 33 + 1))
if enable_feature_idea[7]:
    key_cols = ['item_id', 'date_block_num']
    # Mean monthly sales per item and month. Independent of the lag, so it is
    # computed once rather than on every loop iteration.
    item_month_sales = (
        train_test_set
        .groupby(by=key_cols)
        .agg({'item_cnt_month': 'mean'})
        .reset_index()
    )
    for diff in tqdm(lookback_range):
        feature_name = '_prev_item_sales_' + str(diff)
        # Shift the month forward by `diff` so the merge picks up the value
        # from `diff` months earlier.
        result = (
            item_month_sales
            .assign(date_block_num=item_month_sales['date_block_num'] + diff)
            .rename(columns={'item_cnt_month': feature_name})
        )
        train_test_set = train_test_set.merge(result, on=key_cols, how='left')
        train_test_set[feature_name] = train_test_set[feature_name].fillna(0)
        # As in the shop/item variant of this idea, the '_prev_item_sales_*'
        # helper columns are not registered as model features.

    # Kept inside the guard: the row function reads the helper columns created
    # above, so running it with the idea disabled would raise a KeyError.
    train_test_set['last_sale_item'] = train_test_set.progress_apply(create_last_sale_item, axis=1)
    # Register the derived feature. Previously it was computed but never added
    # to new_features, while the 33 helper columns were added instead.
    new_features.append('last_sale_item')

Idea 8: Item name (Tfidf text feature)

In [None]:
# TF-IDF features of the item name, limited to at most `feature_count` terms.
feature_count = 25
tfidf = TfidfVectorizer(max_features=feature_count)
tfidf_matrix = tfidf.fit_transform(items['item_name']).toarray()

# Name the columns after the number of terms actually produced: the vectorizer
# can return fewer than `feature_count` columns when the vocabulary is small,
# which previously caused an IndexError.
tfidf_cols = ['item_name_tfidf_' + str(i) for i in range(tfidf_matrix.shape[1])]
# Reuse the items index so the rows line up with item_id explicitly.
items_df_item_name_text_features = pd.DataFrame(tfidf_matrix, columns=tfidf_cols, index=items.index)

# Build a new frame (item_id + TF-IDF columns) instead of assigning into a
# slice of `items`, which raised SettingWithCopyWarning.
items_subset = pd.concat([items[['item_id']], items_df_item_name_text_features], axis=1)
new_features.extend(tfidf_cols)

train_test_set = train_test_set.merge(items_subset, on='item_id', how='left')
train_test_set.head()