In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

# load packages
import sys # access to system parameters https://docs.python.org/3/library/sys.html
print("Python version: {}". format(sys.version))

import numpy as np # scientific computing
print("NumPy version: {}". format(np.__version__))

import pandas as pd # data processing and analysis 
print("pandas version: {}". format(pd.__version__))

import matplotlib
import matplotlib.pyplot as plt # plotting
%matplotlib inline 
print("matplotlib version: {}". format(matplotlib.__version__))
 
import sklearn # machine learning algorithms
print("scikit-learn version: {}". format(sklearn.__version__))

from itertools import product
from xgboost import XGBRegressor
from xgboost import plot_importance
import time
from sklearn.preprocessing import LabelEncoder

print("----------")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file found below the Kaggle input directory.
for folder, _subdirs, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Data
Either original data files or the ones prepared in PFS-dataAnalysis&cleanup.


In [None]:
# --- original competition files ---
# (shops.csv and item_categories.csv are not read here; the cleaned versions below are used instead)
df_items = pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')
df_sales = pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv')
df_test = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')

# --- files prepared in the PFS-dataAnalysis&cleanup notebook ---
# The first column of both cleaned CSVs is discarded right after loading
# (presumably a saved index column - verify against the cleanup notebook).
df_shops = pd.read_csv('../input/pfs-dataanalysis-cleanup/df_shops_cleaned.csv')
df_shops = df_shops.drop(columns=df_shops.columns[0])

df_item_cats = pd.read_csv('../input/pfs-dataanalysis-cleanup/df_item_cat_cleaned.csv')
df_item_cats = df_item_cats.drop(columns=df_item_cats.columns[0])

month_info = pd.read_pickle('../input/pfs-dataanalysis-cleanup/month_info.pkl')
all_data = pd.read_pickle('../input/pfs-dataanalysis-cleanup/all_data_cleaned.pkl')
all_data.head()

# Mean Encodings
As in Programming Assignment week 3, the mean encodings are calculated only on the training part of the data, so this is done before concatenating train and test.

Unfortunately they still generate target leakage, so none of them is used in the final model. Any suggestions about what is wrong are very welcome.

In [None]:
# Item-level target (mean) encodings, using the code from Programming Assignment week 3.
# Five variants are computed on all_data: plain mean, K-fold, leave-one-out,
# smoothed, and expanding mean. The per-item averages of the last four are
# collected in `group` so they can be merged into the test set afterwards.
ts = time.time()

# KFold is only needed in this cell
from sklearn.model_selection import KFold

# simple item target encoding - ATTENTION do not use as feature due to leakage/overfitting
item_id_target_mean = all_data.groupby('item_id').target.mean()
all_data['item_target_enc'] = all_data['item_id'].map(item_id_target_mean)
global_mean = all_data['item_target_enc'].mean()
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the in-place form is a chained assignment and does not
# modify all_data when pandas Copy-on-Write is active.
# Items without a computable mean get the global mean.
all_data['item_target_enc'] = all_data['item_target_enc'].fillna(global_mean)

# k fold encoding: each validation fold is encoded with means from the other folds
kf = KFold(5, shuffle=False)
all_data['item_target_enc_kf'] = np.nan
kf_position = all_data.columns.get_loc('item_target_enc_kf')

for train_index, val_index in kf.split(all_data):
    X_train, X_val = all_data.iloc[train_index, :], all_data.iloc[val_index, :]
    fold_means = X_train.groupby('item_id').target.mean()
    all_data.iloc[val_index, kf_position] = X_val['item_id'].map(fold_means).values

# item_ids that are not present in the training folds get NaN, fill with global_mean
all_data['item_target_enc_kf'] = all_data['item_target_enc_kf'].fillna(global_mean)

# per-row sum and count of the target over all rows of the same item
summe = all_data.groupby('item_id')['target'].transform('sum')
n_objects = all_data.groupby('item_id')['target'].transform('count')
tgo = all_data['target']

# leave one out encoding: item mean without the row's own target
# (0/0 for items that occur only once -> NaN -> global_mean)
all_data['item_target_enc_loo'] = ((summe - tgo) / (n_objects - 1)).fillna(global_mean)

# smoothing: blend the item mean with the global mean, weight 100 for the global mean
meann = all_data.groupby('item_id').target.transform('mean')
all_data['item_target_enc_sm'] = (meann * n_objects + (global_mean * 100)) / (n_objects + 100)

# Expanding Mean Scheme: mean of the item's targets in the rows before the current one
# (0/0 for the first occurrence of an item -> NaN -> global_mean)
cumcnt = all_data.groupby('item_id').target.cumcount()
cumsumm = all_data.groupby('item_id').target.cumsum()
all_data['item_target_enc_ems'] = ((cumsumm - all_data['target']) / cumcnt).fillna(global_mean)

# downcast all columns that are needed later
col = ['item_target_enc_kf', 'item_target_enc_loo', 'item_target_enc_sm', 'item_target_enc_ems']
all_data[col] = all_data[col].astype(np.float32)
# drop no longer needed column
all_data = all_data.drop(columns='item_target_enc')

# get means of the mean encodings per item
group = all_data.groupby('item_id')[col].mean()
time.time() - ts

In [None]:
# Give df_test the same columns as all_data: the test month number, an empty
# target, and the per-item averages of the mean encodings computed above.
df_test = (
    df_test
    .assign(date_block_num=34, target=np.nan)
    .set_index("ID")
    .merge(group, on='item_id', how='left')
)
# items without encoding values get the global mean
df_test[col] = df_test[col].fillna(global_mean)
df_test.head(7)

Merge train and test data. Makes feature engineering more simple.

In [None]:
# Stack the test rows below the training rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# ignore_index=False keeps the index of df_test, so the index restarts for month 34
# (it is overwritten by the merges later anyway).
all_data = pd.concat([all_data, df_test], ignore_index=False)
all_data.head(7)

Add category ID from df_item_cat and info about monthly data from month_info.

In [None]:
# Attach the item category and the per-month information to every row.
# 'left' merges keep the row order of all_data (otherwise the data would be re-sorted by the merge key).
all_data = all_data.merge(df_items[['item_id', 'item_category_id']], on='item_id', how='left')
all_data = all_data.merge(month_info, on='date_block_num', how='left')

# set the calendar values for month 34 (test set) by hand
is_test_month = all_data['date_block_num'] == 34
all_data.loc[is_test_month, 'days_per_month'] = 30
all_data.loc[is_test_month, 'month'] = 11

# downcast for better memory usage (column by column, to avoid copying the whole frame)
small_dtypes = {
    'shop_id': np.int8,
    'item_id': np.int16,
    'item_category_id': np.int8,
    'days_per_month': np.int8,
    'month': np.int8,
}
for column_name, small_dtype in small_dtypes.items():
    all_data[column_name] = all_data[column_name].astype(small_dtype)

all_data.head()

# Label Encoding

In [None]:
# Turn the text columns for city, broad category and sub category into integer codes.
# One LabelEncoder instance is re-fitted for every column.
le = LabelEncoder()
encoding_plan = (
    (df_shops, 'city', 'city_code'),
    (df_item_cats, 'item_broad_cat', 'item_bcat_code'),
    (df_item_cats, 'item_sub_cat', 'item_scat_code'),
)
for frame, text_column, code_column in encoding_plan:
    frame[code_column] = le.fit_transform(frame[text_column])

In [None]:
# bring the city code (per shop) and the category codes (per item category) into all_data
all_data = all_data.merge(df_shops[['shop_id', 'city_code']], on='shop_id', how='left')
all_data = all_data.merge(
    df_item_cats[['item_category_id', 'item_bcat_code', 'item_scat_code']],
    on='item_category_id',
    how='left',
)

# downcast for better memory usage
for code_column in ('city_code', 'item_bcat_code', 'item_scat_code'):
    all_data[code_column] = all_data[code_column].astype(np.int8)

In [None]:
# free up memory: the lookup tables have been merged into all_data
del df_shops, df_item_cats, month_info

In [None]:
# show the last rows of all_data (the test rows were added at the end)
all_data.tail()

# Lag Features

Project Advice week 3:

You can get a rather good score after creating some lag-based features like in advice from previous week and feeding them into gradient boosted trees model.

Apart from item/shop pair lags you can try adding lagged values of total shop or total item sales (which are essentially mean-encodings). All of that is going to add some new information.

Let's do that.

In [None]:
# lag function, designed to work with all_data and different lags for just one column at the moment

def lag_feature(df, lag, column):
    """Append lagged copies of ``column`` to ``df``.

    For every number of months ``k`` in ``lag`` a new column
    ``<column>_lag_<k>`` is added. It holds the value ``column`` had for the
    same shop_id/item_id pair ``k`` months earlier (by date_block_num).

    Parameters
    ----------
    df : DataFrame with the columns shop_id, item_id, date_block_num and ``column``.
    lag : iterable of int, the lags in months.
    column : str, name of the column to lag.

    Returns
    -------
    DataFrame: ``df`` with one extra column per lag. Rows without a matching
    earlier entry get NaN. The input frame is not modified.
    """
    merge_keys = ['shop_id', 'item_id', 'date_block_num']
    # slim view of the original frame; every lag is derived from it
    base = df[merge_keys + [column]]
    for months in lag:
        lagged = base.rename(columns={column: f"{column}_lag_{months}"})
        # moving the month number forward makes month t line up with month t + months
        lagged = lagged.assign(date_block_num=lagged['date_block_num'] + months)
        # 'left' -> keep all rows (and their order) of the frame built so far
        df = df.merge(lagged, on=merge_keys, how='left')
    return df

# shop_id, item_id, date_block_num combinations that are not present in the month before are filled with Nan
# this means that in the first month(s) ALL lag values are Nan

Is it better to have the lag features for sum or mean? I used sum because the final project advice said "total shop or item sales". I tried mean as well: the validation RMSE (validation_1) and the public score were a bit higher (i.e. worse). See Version 13 compared to Version 14. Changed it back to sum.

In [None]:
ts = time.time()

# create new columns for total item sales (over all shops) and total shop sales (over all items) in each month
# month_item_sum -> how many times was an item_id sold over all shops in each month?
# month_shop_sum -> how many items were sold in each shop in each month?

group = all_data.groupby(['date_block_num', 'item_id']).agg({'target': 'sum'})
group.columns = ['month_item_sum']
group = group.reset_index()
all_data = pd.merge(all_data, group, on=['date_block_num', 'item_id'], how='left')

group = all_data.groupby(['date_block_num', 'shop_id']).agg({'target': 'sum'})
group.columns = ['month_shop_sum']
group = group.reset_index()
all_data = pd.merge(all_data, group, on=['date_block_num', 'shop_id'], how='left')

# add lag features for all the months given in lag_time
lag_time = [1, 2, 3, 4, 5, 12]

all_data = lag_feature(all_data, lag_time, 'target')  # shop-item lag
all_data = lag_feature(all_data, lag_time, 'month_item_sum')  # total item lag
all_data = lag_feature(all_data, lag_time, 'month_shop_sum')  # total shop lag
all_data = lag_feature(all_data, lag_time, 'month_sum')  # total month lag, no of items sold per month
all_data = lag_feature(all_data, lag_time, 'monthly_transactions')  # total month lag, no of transactions per month

# collect all columns containing 'lag'
columnliste = list(all_data.columns)
laglist = [element for element in columnliste if 'lag' in element]

# Fill NaN in the lag columns with 0 (no value means the item has not been sold
# in that earlier month) and downcast. The result is assigned back column by
# column: fillna(inplace=True) on a column selection is a chained assignment
# that does not update all_data under pandas Copy-on-Write, and the per-column
# loop avoids copying all lag columns at once.
for element in laglist:
    all_data[element] = all_data[element].fillna(0).astype(np.float32)

# drop the columns that were used to generate lag features, they are no longer needed
all_data = all_data.drop(columns=['month_item_sum', 'month_shop_sum', 'monthly_transactions', 'month_sum'])

time.time() - ts

In [None]:
# list all columns present after adding the lag features
all_data.columns

## Price Features
Experiment with adding (average) item price. Pure average price gave worse results. Use price trend instead.

In [None]:
# remove outliers from df_sales, as was done when generating all_data:
# keep only rows with a price below 100000 and a daily count below 1000
is_regular_sale = (df_sales['item_price'] < 100000) & (df_sales['item_cnt_day'] < 1000)
df_sales = df_sales[is_regular_sale]

In [None]:
# Price features. Two columns survive this cell:
#   price_trend   - relative price change between two months ago and last month
#   acc_price_lag - most recent available deviation of the monthly price from
#                   the item's overall average price
# All intermediate price columns are dropped at the end.
ts = time.time()

# get average price per item_id over whole time period
group = df_sales.groupby('item_id').agg({'item_price': 'mean'})
group.columns = ['avg_price']
all_data = pd.merge(all_data, group, on='item_id', how='left')
all_data['avg_price'] = all_data['avg_price'].astype(np.float32)

# get average price per item_id in each month
group = df_sales.groupby(['date_block_num', 'item_id']).agg({'item_price': 'mean'})
group.columns = ['month_avg_price']
all_data = all_data.merge(group, on=['date_block_num', 'item_id'], how='left')
all_data['month_avg_price'] = all_data['month_avg_price'].astype(np.float32)

# create lags for average item price in a month, i.e. how much did this item cost on average 1/2/3 months before?
# the lag columns contain NaN when an item has not been sold in that month
lag_time = [1, 2, 3]
all_data = lag_feature(all_data, lag_time, 'month_avg_price')

# by how much (as a fraction) was the price 1/2/3 months ago above/below the overall average price?
# 0 where the change could not be computed
delta_columns = ['delta_price_lag_' + str(i) for i in lag_time]
for i, delta_column in zip(lag_time, delta_columns):
    lagged_price = all_data['month_avg_price_lag_' + str(i)]
    all_data[delta_column] = ((lagged_price - all_data['avg_price']) / all_data['avg_price']).fillna(0)

# price drop/rise in the last month:
# (price last month - price two months ago) / price last month
# The current month's price cannot be used because it is unknown for month 34.
# 0 where the change could not be computed.
price_last_month = all_data['month_avg_price_lag_1']
price_two_months_ago = all_data['month_avg_price_lag_2']
all_data['price_trend'] = ((price_last_month - price_two_months_ago) / price_last_month).fillna(0)

# acc_price_lag: the first non-zero value of delta_price_lag_1/2/3, or 0 if all are zero.
# This is the vectorised equivalent of checking the three columns row by row with
# DataFrame.apply, which took very long on the full matrix.
# Note: new items in month 34 have no price, so all price features are 0 for them.
acc_price_lag = all_data[delta_columns[0]]
for delta_column in delta_columns[1:]:
    acc_price_lag = acc_price_lag.where(acc_price_lag != 0, all_data[delta_column])
all_data['acc_price_lag'] = acc_price_lag.astype(np.float32)

# drop the no longer needed helper columns (all average price and delta columns)
columns_to_drop = [
    element for element in all_data.columns
    if 'avg_price' in element or 'delta_price_lag' in element
]
all_data = all_data.drop(columns=columns_to_drop)

time.time() - ts

In [None]:
# the raw sales table is not used by any later cell; delete it to free memory
del df_sales
all_data.head()

# Item Name Features
I am not satisfied with the name features yet, but I am running out of time. The pure code from Gordon Henderson achieves better results; I don't understand why yet, as some of its lines seem strange. I didn't want to copy his code completely, and my version is close enough.

Not all of the item name features are used in the final model. Some made the score worse.

In [None]:
# code inspired by Gordon Henderson, adapted: everything after the first "[" goes to name_sqbrac,
# everything after the first "(" of the remaining text goes to name_brac.
# The rest of the item name is in name_cut.

# split item names by brackets
# expand=True returns the parts as columns; unpacking the .str accessor and passing n
# positionally are no longer supported by pandas 2.x.
# reindex guarantees that both columns exist even if no name contains the delimiter.
parts = df_items["item_name"].str.split("[", n=1, expand=True).reindex(columns=[0, 1])
df_items["name_cut"], df_items["name_sqbrac"] = parts[0], parts[1]
parts = df_items["name_cut"].str.split("(", n=1, expand=True).reindex(columns=[0, 1])
df_items["name_cut"], df_items["name_brac"] = parts[0], parts[1]

# replace special characters by a blank and turn to lower case
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a literal string by default
for name_column in ("name_sqbrac", "name_brac", "name_cut"):
    df_items[name_column] = (
        df_items[name_column]
        .str.replace('[^A-Za-z0-9А-Яа-я]+', " ", regex=True)
        .str.lower()
    )

# names without a bracket part have missing values, mark them with the string '0'
df_items = df_items.fillna('0')

In [None]:
# look at a hand-picked set of rows to check the result of the name split
df_items.iloc[[2,27,191,1358,3872,3864,4577,6389,7297, 7839 ,9196,10365,22083], :]

In [None]:
# Derive an item "type" from the text found in square brackets:
# take its first word, or the first 8 characters if that first word is "xbox".
df_items["type"] = df_items.name_sqbrac.apply(lambda x: x[0:8] if x.split(" ")[0] == "xbox" else x.split(" ")[0] )
# unify the different spellings of the xbox 360
df_items.loc[(df_items.type == "x360") | (df_items.type == "xbox360") | (df_items.type == "xbox 360") ,"type"] = "xbox 360"

# remove blanks inside the type, so "xbox 360" becomes "xbox360"
df_items.type = df_items.type.apply( lambda x: x.replace(" ", "") )
# NOTE(review): the literals compared here look alike but are presumably written with
# different (Latin / Cyrillic) characters - keep them exactly as they are and verify.
df_items.loc[ (df_items.type == 'pс' )| (df_items.type == 'pc') | (df_items.type == "pc"), "type" ] = "pc" # for reasons I don't understand pc seems to have different character encodings
df_items.loc[ df_items.type == 'рs3' , "type"] = "ps3"
# an empty type is labelled "mac"
df_items.loc[ df_items.type == "", "type"] = "mac"

In [None]:
# get first word and first+second word from the rest of the item name
# split name_cut on whitespace into at most three parts: first word, second word, remainder
df_temp_name = df_items['name_cut'].str.split(n=2, expand=True)
df_temp_name.columns = ['word1', 'word2', 'rest']
# names with fewer than three parts have missing values, mark them with 0
df_temp_name= df_temp_name.fillna(0)

def set_word12(row):
    """Return the first two words of a name joined by '-'.

    ``row`` needs the attributes ``word1`` and ``word2``. A ``word2`` equal to 0
    marks a missing second word; in that case ``word1`` is returned unchanged.
    """
    first, second = row.word1, row.word2
    return first if second == 0 else first + '-' + second
    
# Build word12 for every row. The result is assigned to the single column label:
# assigning a Series to the list key [['word12']] raises
# "Columns must be same length as key" on current pandas versions.
df_temp_name['word12'] = df_temp_name.apply(set_word12, axis=1)

# merge info about words back into df_items
df_items = pd.concat([df_items, df_temp_name[['word1', 'word12']]], axis=1)

In [None]:
# check number of unique values in each of the name feature columns (one number per line)
for name_feature in ('name_cut', 'name_sqbrac', 'name_brac', 'word1', 'word12', 'type'):
    print(df_items[name_feature].nunique())

In [None]:
# count how many distinct values in each of the columns occur fewer than 40 times
liste = ('name_cut', 'name_sqbrac', 'name_brac', 'word1', 'word12', 'type')
for name_column in liste:
    # number of items per distinct value of the column
    items_per_value = df_items.groupby(name_column)['item_id'].count()
    # one boolean filter instead of a separate .loc scan for every distinct value
    drop_cols = items_per_value[items_per_value < 40].index.tolist()

    print(name_column, ": no of drop_cols: ", len(drop_cols))

In [None]:
# show the same hand-picked rows again, now with the type/word1/word12 columns
df_items.iloc[[2,27,191,1358,3872,3864,4577,6389,7297, 7839 ,9196,10365,22083], :]

In [None]:
# Label Encoding of the item name features, re-using the LabelEncoder `le` created
# in the Label Encoding section; the codes are stored as int16 to save memory.
name_feature_columns = ['name_sqbrac', 'name_brac', 'word1', 'word12', 'type']
for name_feature in name_feature_columns:
    codes = le.fit_transform(df_items[name_feature].astype(str))
    df_items[name_feature] = codes.astype(np.int16)

# merge back into all data
all_data = all_data.merge(df_items[['item_id'] + name_feature_columns], on='item_id', how='left')

# the item table is no longer needed
del df_items

# Sanity Checks & Clipping
Let's do some sanity checks before saving the data.


In [None]:
# rows of the test month (date_block_num 34)
all_data[all_data.date_block_num == 34]

In [None]:
# per column: does it contain any missing value?
all_data.isna().any()

In [None]:
# column dtypes and memory usage
all_data.info()

In [None]:
# limit the target to the range 0..20
all_data['target'] = all_data['target'].clip(lower=0, upper=20)

In [None]:
# summary statistics of the clipped target
all_data.target.describe()

In [None]:
# save all_data to be used in next notebook
# (relative path, so the file is written to the current working directory)
all_data.to_pickle('all_data_withFeatures.pkl')

# unused code snippets

generating item type

df_items["type"] = df_items.name_sqbrac.apply(lambda x: x[0:8] if x.split(" ")[0] == "xbox" else x.split(" ")[0] )
df_items.loc[(df_items.type == "x360") | (df_items.type == "xbox360") | (df_items.type == "xbox 360") ,"type"] = "xbox 360"
df_items.loc[ df_items.type == "", "type"] = "mac"
df_items.type = df_items.type.apply( lambda x: x.replace(" ", "") )
df_items.loc[ (df_items.type == 'pс' )| (df_items.type == 'рс') | (df_items.type == "pc"), "type" ] = "pc" # for reasons I don't understand pc seems to have different character encodings
df_items.loc[ df_items.type == 'рs3' , "type"] = "ps3"

group = df_items.groupby(["type"]).agg({"item_id": "count"}) # count how many items there are for each type
group = group.reset_index()
drop_cols = []
for cat in group.type.unique():
    if group.loc[(group.type == cat), "item_id"].values[0] <40:
        drop_cols.append(cat)
df_items.type = df_items.type.apply( lambda x: "other" if (x in drop_cols) else x ) # is applied to name2 in original but why? 
df_items = df_items.drop(["type"], axis = 1)

Are there cities that sell more items than other cities? Divide total sales by the number of shops in the city; otherwise Moscow dominates everything else.
extract the total number of items sold by one shop over the whole time period
total_items_sold = all_data.groupby('shop_id').agg({'target': 'sum'})
total_items_sold.columns = ['total_items_sold']
merge this info into the shops table
df_shops = pd.merge(df_shops, total_items_sold, on= 'shop_id', how='left')
extract how many items have been sold per city. Use 'mean' because this gives the total number divided by the number of shops
city_sales_mean = df_shops.groupby('city_code').agg({'total_items_sold': 'mean'})
city_sales_mean.columns = ['mean_items_sold']
city_sales_total = df_shops.groupby('city_code').agg({'total_items_sold': 'sum'})
city_sales_total.columns = ['total_items_sold']
get number of shops per city into the city_sales df - not needed at the moment
#no_shops_per_city = df_shops.city_code.value_counts()
#no_shops_per_city = no_shops_per_city.to_frame().reset_index()
#no_shops_per_city = no_shops_per_city.rename(columns={'city_code' : 'no_shops_per_city', 'index' : 'city_code'}) 
#city_sales= pd.merge(city_sales, no_shops_per_city, on='city_code', how='left')
add average number of items sold per city = total number / number of shops in this city --- this is if you want to calculate 'mean' manually with a lot of code
#city_sales['avg_items_sold_city'] = city_sales.total_items_sold / city_sales.no_shops_per_city

visualize
fig = plt.figure() # create figure
ax0 = fig.add_subplot(1, 3, 1) # add subplot 1 (1 row, 2 columns, first plot)
ax1 = fig.add_subplot(1, 3, 2) 
ax2 = fig.add_subplot(1, 3, 3)
df_shops[['total_items_sold']].plot(figsize=(20, 3), ax=ax0)
city_sales_total.plot(figsize=(20, 3), ax=ax1)
city_sales_mean.plot(figsize=(20, 3), ax=ax2)
ax0.set_title('Total Number of Items Sold per shop_id')
ax0.set_xlabel('shop_id')
ax1.set_title('Total Number of Items Sold per city')
ax1.set_xlabel('city_code')
ax2.set_title('Mean Number of Items Sold per city') # =total city sales / no shops in this city
ax2.set_xlabel('city_code')
plt.show()

df_shops = pd.merge(df_shops, city_sales_mean, on='city_code', how='left')
df_shops = df_shops.rename(columns={'mean_items_sold':'mean_items_sold_city', 'total_items_sold':'total_items_sold_shop'})

create two new columns. ATTENTION: the thresholds used for the split need to be revisited because of the changed clipping strategy; this has not been done yet, so I will not use these columns in the model
representing if a shop belongs to a high/medium/low selling city (by mean items sold)
and if this is a high, medium or low selling shop

df_shops.loc[df_shops.mean_items_sold_city < 60000, 'mean_items_sold_city'] = 0 # low selling city
df_shops.loc[(df_shops.mean_items_sold_city >= 60000) & (df_shops.mean_items_sold_city < 100000), 'mean_items_sold_city'] = 1 # medium selling city
df_shops.loc[df_shops.mean_items_sold_city >=100000, 'mean_items_sold_city'] = 2 # high selling city
df_shops = df_shops.rename(columns={'mean_items_sold_city':'city_sales_code'})

df_shops.loc[df_shops.total_items_sold_shop < 80000, 'total_items_sold_shop'] = 0 # low selling shop
df_shops.loc[(df_shops.total_items_sold_shop >= 80000) & (df_shops.total_items_sold_shop < 150000), 'total_items_sold_shop'] = 1 # medium selling shop
df_shops.loc[df_shops.total_items_sold_shop >=150000, 'total_items_sold_shop'] = 2 # high selling shop
df_shops = df_shops.rename(columns={'total_items_sold_shop':'shop_sales_code'})

#df_shops.head()  

Code for getting average category price and put either this or average price in the average price column. 

#function for checking if there is an average price and returning either it or the average category price


def set_price (row):
    if np.isnan(row.avg_price):
        row.avg_price = row.avg_category_price
    return row.avg_price

#there are 363 new items for which no average price can be calculated, put the average price of the category instead

group = all_data.groupby(['item_category_id']).agg({'avg_price':'mean'})
group.columns = ['avg_category_price']
all_data = pd.merge(all_data, group, on= 'item_category_id', how = 'left')

col = all_data[all_data.date_block_num == 34].apply (lambda row: set_price(row), axis=1)

#assign values from col to month 34 from all_data
all_data[all_data.date_block_num == 34] = all_data[all_data.date_block_num == 34].assign(avg_price = col.values) 
all_data.drop(columns='avg_category_price', inplace=True) # no longer needed



# former execution log

Version 6:

Features used: columns = ["shop_id", "item_id", "date_block_num", "item_category_id", 'target_lag_1', 'target_lag_12']

XGBoost : 

Stopping. Best iteration:
[70]	validation_0-rmse:2.71430	validation_1-rmse:4.84891


Training time: 381

Public score: 1.13677

Version 9:

Features used: columns = ['shop_id', 'item_id', 'date_block_num', 'item_category_id',
       'month_item_sum', 'month_shop_sum', 
       'target_lag_1', 'target_lag_12',
       'month_item_sum_lag_1', 'month_item_sum_lag_12', 
       'month_shop_sum_lag_1', 'month_shop_sum_lag_12']

XGBoost : 

Stopping. Best iteration:
[128]	validation_0-rmse:2.11774	validation_1-rmse:4.48538



Training time: 932

Public score: 1.18514

Version 11:

Features used: columns = ['shop_id', 'item_id', 'date_block_num', 'item_category_id', **'month_item_sum', 'month_shop_sum',** 'target_lag_1', 'target_lag_12', 'month_item_sum_lag_1', 'month_item_sum_lag_12', 'month_shop_sum_lag_1', 'month_shop_sum_lag_12']

**used cleaned data**. i.e. shop names cleaned AND data clipped after building grid

XGBoost :
    
Stopping. Best iteration:
[150]	validation_0-rmse:0.64435	validation_1-rmse:0.73362

Training time: 1141

Public score: 1.23784

Version 13:

Features used: columns = ['shop_id', 'item_id', 'date_block_num', 'item_category_id', 'target_lag_1', 'target_lag_12', 'month_item_sum_lag_1', 'month_item_sum_lag_12', 'month_shop_sum_lag_1', 'month_shop_sum_lag_12']

**used cleaned data**. i.e. shop names cleaned AND data clipped after building grid

XGBoost :

Stopping. Best iteration:
[44]	validation_0-rmse:0.84370	validation_1-rmse:0.92533

Training time: 369

Public score: 0.95375



Version 14:

Features used: columns = ['shop_id', 'item_id', 'date_block_num', 'item_category_id',
       'target_lag_1', 'target_lag_12',
        'month_item_**mean**_lag_1', 'month_item_**mean**_lag_12', 
       'month_shop_**mean**_lag_1', 'month_shop_**mean**_lag_12']

used cleaned data. i.e. shop names cleaned AND data clipped after building grid

XGBoost :

Stopping. Best iteration:
[28]	validation_0-rmse:0.85613	validation_1-rmse:0.93336

Training time: 279

Public score: 0.95996

Version 19:

Features used: columns = ['shop_id', 'item_id', 'date_block_num', 'target', 'item_category_id',
           'target_lag_1', 'target_lag_12',
           'month_item_**sum**_lag_1', 'month_item_**sum**_lag_12', 
           'month_shop_**sum**_lag_1','month_shop_**sum**_lag_12', 
           '**city_code', 'shop_sales_code','city_sales_code'**]

used cleaned data. i.e. shop names cleaned AND data clipped after building grid

XGBoost :

[999]	validation_0-rmse:0.00564	validation_1-rmse:0.00656



Training time: 6284

Public score: I did not dare to submit. the min value of the prediction was 19.47. And I found my error. 

Version 20:

Features used: columns = ['shop_id', 'item_id', 'date_block_num', 'item_category_id', 'target_lag_1', 'target_lag_12', 'month_item_sum_lag_1', 'month_item_sum_lag_12', 'month_shop_sum_lag_1','month_shop_sum_lag_12', 'city_code', 'shop_sales_code','city_sales_code']

used cleaned data. i.e. shop names cleaned AND data clipped after building grid

XGBoost :

Stopping. Best iteration:
[45]	validation_0-rmse:0.84357	validation_1-rmse:0.92975

Training time: 404

Public score: 0.96130

Version 23:

Features used: columns = ['shop_id', 'item_id', 'date_block_num', 'item_category_id',
           'target_lag_1', 'target_lag_12',
           'month_item_sum_lag_1', 'month_item_sum_lag_12', 
           'month_shop_sum_lag_1','month_shop_sum_lag_12']  --- like V13
           
used new cleaned data. i.e. shop names cleaned AND data clipped before modelling

XGBoost :
Stopping. Best iteration:
[27]	validation_0-rmse:0.85564	validation_1-rmse:0.93935

Training time: 342

Public score: 0.96739

Version 24:

Features used: columns = ['shop_id', 'item_id', 'date_block_num', 'item_category_id',
           'target_lag_1', 'target_lag_12',
           'month_item_sum_lag_1',**'month_item_sum_lag_2','month_item_sum_lag_3**', 'month_item_sum_lag_12', 
           'month_shop_sum_lag_1',**'month_shop_sum_lag_2','month_shop_sum_lag_3'**,'month_shop_sum_lag_12'] 
           
used new cleaned data. i.e. shop names cleaned AND data clipped before modelling

XGBoost :
Stopping. Best iteration:
[26]	validation_0-rmse:0.84397	validation_1-rmse:0.93156

Training time: 432

Public score: 0.95726

Version 27:

Features used: columns = ['shop_id', 'item_id', 'date_block_num', 'item_category_id',
           'city_code', 'item_bcat_code', 
           'target_lag_1', 'target_lag_12',
           'month_item_sum_lag_1','month_item_sum_lag_2','month_item_sum_lag_3', 'month_item_sum_lag_12', 
           'month_shop_sum_lag_1','month_shop_sum_lag_2','month_shop_sum_lag_3','month_shop_sum_lag_12',] 
           
used new cleaned data. i.e. shop names cleaned AND data clipped before modelling

XGBoost :
Stopping. Best iteration:
[35]	validation_0-rmse:0.83877	validation_1-rmse:0.92985

Training time: 394

Public score: 0.95394

Version 28:

Features used: columns = ['shop_id', 'item_id', 'date_block_num', 'item_category_id',
           'city_code', 'item_bcat_code', 
           'target_lag_1', 'target_lag_12',
           'month_item_sum_lag_1','month_item_sum_lag_2','month_item_sum_lag_3', 'month_item_sum_lag_12', 
           'month_shop_sum_lag_1','month_shop_sum_lag_2','month_shop_sum_lag_3','month_shop_sum_lag_12',] 
           
used new cleaned data. i.e. shop names cleaned AND data clipped before modelling

XGBoost : **changed eta to 0.2 (from 0.3)**

Stopping. Best iteration:
[60]	validation_0-rmse:0.83157	validation_1-rmse:0.92357

Training time: 590

Public score: 0.94763

Version 29 (identical to v31; something went wrong while saving):

Features used: columns = ['shop_id', 'item_id', 'date_block_num', 'item_category_id',
       'city_code', 'item_bcat_code', 
       'days_per_month', 'month',
       'target_lag_1', 'target_lag_2', 'target_lag_3', 'target_lag_12',
       'month_item_sum_lag_1', 'month_item_sum_lag_2', 'month_item_sum_lag_3',
       'month_item_sum_lag_12', 'month_shop_sum_lag_1', 'month_shop_sum_lag_2',
       'month_shop_sum_lag_3', 'month_shop_sum_lag_12', 'month_sum_lag_1',
       'month_sum_lag_2', 'month_sum_lag_3', 'month_sum_lag_12',
       'monthly_transactions_lag_1', 'monthly_transactions_lag_2',
       'monthly_transactions_lag_3', 'monthly_transactions_lag_12']
           
used new cleaned data. i.e. shop names cleaned AND data clipped before modelling

XGBoost : **changed eta back to 0.3, because of faster training**

Stopping. Best iteration:
[5]	validation_0-rmse:0.87967	validation_1-rmse:0.94554


Training time: 239 (191 in v31)

Public score: 0.96909

Version 35:

Features used: = ['shop_id', 'item_id', 'date_block_num', 'item_category_id',
       'city_code', 'item_bcat_code', 
       'days_per_month', 'month', '**avg_price**',
       'target_lag_1', 'target_lag_2', 'target_lag_3', 'target_lag_12',
       'month_item_sum_lag_1', 'month_item_sum_lag_2', 'month_item_sum_lag_3',
       'month_item_sum_lag_12', 'month_shop_sum_lag_1', 'month_shop_sum_lag_2',
       'month_shop_sum_lag_3', 'month_shop_sum_lag_12', 'month_sum_lag_1',
       'month_sum_lag_2', 'month_sum_lag_3', 'month_sum_lag_12',
       'monthly_transactions_lag_1', 'monthly_transactions_lag_2',
       'monthly_transactions_lag_3', 'monthly_transactions_lag_12']
           
used new cleaned data. i.e. shop names cleaned AND data clipped before modelling

XGBoost : 

Stopping. Best iteration:
[16]	validation_0-rmse:0.83772	validation_1-rmse:0.94485


Training time: 321

Public score: 0.95793

Version 36:

Features used: columns = ['shop_id', 'item_id', 'date_block_num', 'item_category_id',
       'city_code', 'item_bcat_code', 
       'days_per_month', 'month', 'avg_price',
       'target_lag_1', 'target_lag_2', 'target_lag_3',
       'month_item_sum_lag_1', 'month_item_sum_lag_2', 'month_item_sum_lag_3',
        'month_shop_sum_lag_1', 'month_shop_sum_lag_2','month_shop_sum_lag_3', 
        'month_sum_lag_1','month_sum_lag_2', 'month_sum_lag_3',
       'monthly_transactions_lag_1', 'monthly_transactions_lag_2','monthly_transactions_lag_3']
           
used new cleaned data. i.e. shop names cleaned AND data clipped before modelling, removed 12er lags and more month training data

XGBoost : 

Stopping. Best iteration:
[16]	validation_0-rmse:0.86907	validation_1-rmse:0.95285


Training time: 539

Public score: 0.95022