# Build a DF relating item category name and shop name to shop ids and item ids

In [1]:
TARGET_ENCODING_NPERM = 8

### Load packages

In [2]:
import pandas as pd
import pickle
import numpy as np
import re
import gc
from sklearn import preprocessing

### Read in Data

In [3]:
df = pd.read_csv('../gen_data/train2.csv')

y_train_list = pickle.load(open('../gen_data/y_train--features1.ipynb--.pickle','rb'))
x_train_list = pickle.load(open('../gen_data/x_train--features3.ipynb--.pickle','rb'))
x_test_list = pickle.load(open('../gen_data/x_test--features3.ipynb--.pickle','rb'))

In [4]:
x_train_list[1].shape

(6425094, 70)

### Parse City data

In [5]:
cities_encoder = preprocessing.LabelEncoder()
cities_series = pd.Series([re.search('(.*?) ',n).group() for n in df.shop_name],index=df.index)
cities_encoder.fit(cities_series)
df['city'] = cities_encoder.transform(cities_series)
del cities_series

### Parse item category data

In [6]:
item_type_encoder = preprocessing.LabelEncoder()
item_type_series = df.item_category_name.map(lambda x: re.split('-',x)[0])
item_type_encoder.fit(item_type_series)
df['item_type'] = item_type_encoder.transform(item_type_series)
del item_type_series

### parse more item_category data

In [7]:
item_info_encoder = preprocessing.LabelEncoder()
item_info_series = df.item_category_name.map(lambda x: '-'.join(re.split('-',x)[1:]) if len(re.split('-',x))>1 else 'NAN')
item_info_encoder.fit(item_info_series)
df['item_info'] = item_info_encoder.transform(item_info_series)
del item_info_series

### Select relevant columns in df

In [8]:
df = df[['shop_id','item_id','item_category_id','city','item_type','item_info']].drop_duplicates()

### function to downcast data types to 32 bits

In [9]:
def downcast(df):
    float_cols = [col for col in df if df[col].dtype=='float64']
    int_cols = [col for col in df if df[col].dtype=='int64']

    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols] = df[int_cols].astype(np.int32)
    
    return(df)

### cleanup

In [10]:
df = downcast(df)
gc.collect()

7

### Merge data above into training and test sets

In [11]:
x_train_list = list(map(lambda x: x.merge(df,on=['item_id','item_category_id','shop_id'],how='left'),x_train_list))
x_test_list = list(map(lambda x: x.merge(df,on=['item_id','item_category_id','shop_id'],how='left'),x_test_list))

# Target Encoding

Target encoding is based on historical information, available for both train and test datasets, as such, we needn't worry about regularisation specifically for target encodings.

In [12]:
def target_encode(x_train,y_train,x_test,cols,nperms):
    x_train['target'] = y_train
    print('top: '+str(x_train.shape))
#for test data
    for col in cols:
        agged_sum = x_train.groupby(col).target.sum()
        x_test[col+'_encoded'] = x_test[col].map(agged_sum)
        x_test[col+'_encoded'].fillna(0,inplace=True)
            
#for train data, need to regularise
    for col in cols:
        print('Encoding '+col+':')
        encoding_df = pd.DataFrame()
        for perm_num in range(0,nperms):
            print('\tComputing encoding on permutation '+str(perm_num)+' of '+str(nperms))
            #set random state to perm num for reproducibility
            #make this faster by just changing index?
            perm_x_train = x_train.sample(frac=1,random_state=perm_num)
            print('mid, perm: '+str(perm_x_train.shape))
            cumsum   = perm_x_train.groupby(col).target.cumsum() - perm_x_train.target
            cumcount = perm_x_train.groupby(col).target.cumcount()
            encoding = cumsum/cumcount
            
            #encoding_df = pd.concat([],axis=1)
            encoding_df['perm_'+str(perm_num)+'_encoding'] = encoding
            
        encoding_df['colmeans'] = encoding_df.mean(axis=1)    
        x_train[col+'_encoded'] = encoding_df.colmeans
        x_train[col+'_encoded'].fillna(0,inplace=True)
        print('later: '+str(x_train.shape))
        del encoding_df
        gc.collect()
    print('returning'+str(x_train.shape))
    return x_train.drop('target',axis=1) , x_test

In [13]:
to_target_encode = ['shop_id','item_id','item_category_id','month','city','item_type','item_info']
to_target_encode=['shop_id']
jumbled_list = [target_encode(x_train,y_train,x_test,to_target_encode,TARGET_ENCODING_NPERM) for x_train , y_train, x_test in zip(x_train_list,y_train_list,x_test_list)]
x_train_list , x_test_list = list(map(list,zip(*jumbled_list)))

top: (6186922, 74)
Encoding shop_id:
	Computing encoding on permutation 0 of 8
mid, perm: (6186922, 74)
	Computing encoding on permutation 1 of 8
mid, perm: (6186922, 74)
	Computing encoding on permutation 2 of 8
mid, perm: (6186922, 74)
	Computing encoding on permutation 3 of 8
mid, perm: (6186922, 74)
	Computing encoding on permutation 4 of 8
mid, perm: (6186922, 74)
	Computing encoding on permutation 5 of 8
mid, perm: (6186922, 74)
	Computing encoding on permutation 6 of 8
mid, perm: (6186922, 74)
	Computing encoding on permutation 7 of 8
mid, perm: (6186922, 74)
later: (6186922, 75)
returning(6186922, 75)
top: (6425094, 74)
Encoding shop_id:
	Computing encoding on permutation 0 of 8
mid, perm: (6425094, 74)
	Computing encoding on permutation 1 of 8
mid, perm: (6425094, 74)
	Computing encoding on permutation 2 of 8
mid, perm: (6425094, 74)
	Computing encoding on permutation 3 of 8
mid, perm: (6425094, 74)
	Computing encoding on permutation 4 of 8
mid, perm: (6425094, 74)
	Computing 

In [14]:
x_train_list[1].shape

(6425094, 74)

In [15]:
x_train_list[0].shape

(6186922, 74)

In [16]:
x_test_list[0].shape

(214200, 74)

In [17]:
x_test_list[1].shape

(214200, 74)

In [22]:
# need to add bool: item never sold before
                    #shop never sold before
# and int: num item sold
# num shop sold
# FOR EACH MONTH

#this is target encoding filling nans with 0 except missing early data

#doing this early in pipeline would be easy

#going back for all time might be a bad idea, the nature of the data would change thru time, (summingup)
#instaed just use dateback range

In [23]:
#create variables to quantify how much we know about a data point
#For each shop_id and item_id create sum of recent history(total sales for dateback_range)
#create bool if 0 indicating new shop or item
def generate_recent_history_summary(df):
    df_items = df.filter(regex='sum_item_sales_back_\d+$')
    df['historical_item_sales'] = df_items.sum(axis=1)
    df['new_item'] = (df.historical_item_sales<0.5)*1
    
    df_shops = df.filter(regex='sum_shop_sales_back_\d+$')
    df['historical_shop_sales'] = df_shops.sum(axis=1)
    df['new_shop'] = (df.historical_shop_sales<0.5)*1
    
    df_item_cats = df.filter(regex='sum_item_cat_sales_back_\d+$')
    df['historical_item_cat_sales'] = df_item_cats.sum(axis=1)
    df['new_item_cat'] = (df.historical_item_cat_sales<0.5)*1
    return df
    

In [24]:
x_train_list = list(map(generate_recent_history_summary,x_train_list))
x_test_list = list(map(generate_recent_history_summary,x_test_list))

In [25]:
x_train_list[0][['item_id','historical_item_sales']]

Unnamed: 0,item_id,historical_item_sales
0,32,0.0
1,33,118.0
2,99,0.0
3,482,1097.0
4,485,818.0
5,804,144.0
6,839,307.0
7,1007,463.0
8,1406,16.0
9,1407,726.0


### Save

In [26]:
pickle.dump(x_train_list,open('../gen_data/x_train--features4.ipynb--.pickle','wb'))
pickle.dump(x_test_list,open('../gen_data/x_test--features4.ipynb--.pickle','wb'))

In [None]:
x_train_list[0].head()

In [None]:
df = x_train_list[0][['date_block_num','item_id','item_id_encoded','historical_item_sales']].copy()
df.columns = ['date_block_num','item_id','ORIGINAL_item_id_encoded','historical_item_sales']
df.head()

In [None]:
#work on this
#this isnt working properly, drop it, I have almost the same information from elsewhere
def old_target_encode(df,target,cols,fillna=0):
    maxmonth = df.date_block_num.max()
    print(maxmonth)
    df['target'] = target
    for col in cols:
        print('Encoding variable: '+col)
        #dont use current data, that way we can do this for both train and test sets
        agged_targ = df[df.date_block_num<maxmonth].groupby(col).target.sum()
        agged_targ = df[(df.date_block_num<maxmonth) & (df.date_block_num>=(maxmonth-12))].groupby(col).target.sum()
        #agged_targ = df[df.date_block_num>20].groupby(col).target.sum()
        
        
        
        df[col+'_encoded'] = df[col].map(agged_targ)
        if fillna=='mean':
            df[col].fillna(df(col).mean(),inplace=True)
        elif fillna=='median':
            df[col].fillna(df(col).median(),inplace=True)
        else:
            df[col].fillna(0,inplace=True)
    return df.drop('target',axis=1)

### Im counting back the months from max month, but historical is rolling for each date, historical is bad becuase it double counts, or does it?

In [None]:
targs = y_train_list[0]
target_encode(df,targs,['item_id'])
g = df[(df.date_block_num<32) & (df.date_block_num>=20)].groupby('item_id').target.sum()
df['item_id_encoded_manual'] = df.item_id.map(g)
df.head()

In [None]:
21 ... 31

In [None]:
df = x_train_list[1]
targs = y_train_list[1]
df['target'] = targs
df_small = df[['item_id','item_id_encoded','target','historical_item_sales']]

In [None]:
g = df.groupby('item_id').target.sum()
g.head(40)

In [None]:
df_small['item_id_encoded_manual'] = df.item_id.map(g)

In [None]:
df_small.head()

In [None]:
df_small.head(15)

In [None]:
df[(df.item_id==32)].target.sum()

In [None]:
#df[['item_id','item_idencoded']].sort_values('item_id').head(1000)

In [None]:
df100 = df.sample(100)
df100