# Build a DF relating item category name and shop name to shop ids and item ids

In [1]:
# NOTE(review): TARGET_ENCODING_NPERM is not referenced by any cell visible in
# this notebook; presumably a permutation count for an earlier encoding scheme.
TARGET_ENCODING_NPERM = 8
# Number of folds handed to kfold_target_encode as its `k` argument.
KFOLD_K = 5
# Smoothing strengths (alpha) to run the target encoding with. The encoding
# cell loops over this list, and str(KFOLD_ALPHAS) is embedded in the names of
# the pickles written at the end. Previously tried grids are kept for reference:
#KFOLD_ALPHAS=[1000,5000,10000]
#KFOLD_ALPHAS = [0,50,100]
#KFOLD_ALPHAS = [0,50,100,200,500,1000,2000]
KFOLD_ALPHAS = [0]

In [2]:
# Display the string form of KFOLD_ALPHAS; this exact text becomes part of the
# output file names used when the encoded frames are saved.
str(KFOLD_ALPHAS)

'[0]'

### Load packages

In [3]:
import pandas as pd
import pickle
import numpy as np
import re
import gc
from sklearn import preprocessing
from sklearn import model_selection

### Read in Data

In [4]:
# Lookup table that is later parsed for shop cities and item category parts.
df_item_cat = pd.read_csv('../gen_data/train2.csv')

# Targets and feature frames written by earlier notebooks. Each pickle holds a
# list; the lists are iterated in lockstep further down.
# `with` guarantees the file handles are closed (the bare open() calls leaked them).
# NOTE: pickle.load can execute arbitrary code - only load files produced by
# this project's own notebooks.
with open('../gen_data/y_train--features1.ipynb--.pickle','rb') as fh:
    y_train_list = pickle.load(fh)
with open('../gen_data/x_train--features3.ipynb--.pickle','rb') as fh:
    x_train_list = pickle.load(fh)
with open('../gen_data/x_test--features3.ipynb--.pickle','rb') as fh:
    x_test_list = pickle.load(fh)

# Sales records; used below to find the first month in which each item was sold.
sales = pd.read_csv('../gen_data/train1.csv')

In [5]:
# Independent copy used for the city parsing below, so that the column added
# there (and the later column selection) does not affect df_item_cat.
df_city = df_item_cat.copy()

# Generate new item variable

In [6]:
def new_item_var(df, sales_df=None):
    """Add ``first_sale_date_block`` and a 0/1 ``new_item`` flag to ``df``.

    The first ``date_block_num`` in which each ``item_id`` appears in the sales
    data is left-merged onto ``df``. Items that never appear in the sales data
    (e.g. items that only occur in the test set) get the sentinel 999, so they
    are always flagged as new.

    ``new_item`` is 1 when ``first_sale_date_block >= date_block_num``, i.e. the
    item had no recorded sale before the row's own month, and 0 otherwise.

    Args:
        df: frame with at least ``item_id`` and ``date_block_num`` columns.
        sales_df: sales records with ``item_id`` and ``date_block_num``
            columns. Defaults to the notebook-level ``sales`` frame.

    Returns:
        A new frame (the result of the merge) with the two extra columns; the
        input frame is not modified.
    """
    if sales_df is None:
        sales_df = sales

    # One row per item: the earliest month in which it was sold.
    first_sale = (
        sales_df.groupby('item_id').date_block_num.min()
        .rename('first_sale_date_block')
        .reset_index()
    )
    df = df.merge(first_sale,on='item_id',how='left')

    # Items unknown to the sales data come out of the merge as NaN. Assign the
    # filled column back explicitly: a chained ``fillna(inplace=True)`` on an
    # attribute access is deprecated and does nothing under Copy-on-Write.
    df['first_sale_date_block'] = df['first_sale_date_block'].fillna(999)
    df['new_item'] = (df.first_sale_date_block >= df.date_block_num)*1

    return df

In [7]:
# Add the new-item columns to every train and test frame.
x_train_list = [new_item_var(frame) for frame in x_train_list]
x_test_list = [new_item_var(frame) for frame in x_test_list]

# The sales frame is not needed after this point; drop it and collect garbage.
del sales
gc.collect()

216

### function to downcast data types to 32 bits

In [8]:
def downcast(df):
    """Convert 64-bit float/int columns of ``df`` to their 32-bit counterparts.

    ``float64`` columns become ``float32`` and ``int64`` columns become
    ``int32``; all other columns are left alone. The frame is modified in
    place and also returned.
    """
    for wide_dtype, narrow_dtype in (('float64', np.float32), ('int64', np.int32)):
        selected = [name for name in df if df[name].dtype == wide_dtype]
        df[selected] = df[selected].astype(narrow_dtype)

    return df

### Parse City data

In [9]:
def _city_from_shop_name(shop_name):
    """Return the start of ``shop_name`` up to and including its first space."""
    match = re.search('(.*?) ',shop_name)
    # A name without any space yields no match; use the whole name rather than
    # failing with AttributeError on ``None.group()``.
    return match.group() if match else shop_name


# The city is taken to be the first word of the shop name. The trailing space
# is kept as part of the value, exactly as before.
# NOTE(review): a name starting with '!' (e.g. '!Якутск ...') produces a city
# distinct from the same name without the '!'; confirm whether they should merge.
cities_series = df_city.shop_name.map(_city_from_shop_name)
df_city['city'] = cities_series
df_city = df_city[['shop_id','city']].drop_duplicates()
df_city = downcast(df_city)

# Attach the city to every train and test frame via shop_id.
x_train_list = [frame.merge(df_city,on=['shop_id'],how='left') for frame in x_train_list]
x_test_list = [frame.merge(df_city,on=['shop_id'],how='left') for frame in x_test_list]

### Parse item category data

In [10]:
# Split each category name at its first hyphen: the text before it is the item
# type, everything after it is the item info. Names without a hyphen get the
# placeholder info 'NAN'. Surrounding spaces are kept as they are.
category_parts = df_item_cat.item_category_name.map(lambda name: name.partition('-'))

item_type_series = category_parts.map(lambda parts: parts[0])
item_info_series = category_parts.map(lambda parts: parts[2] if parts[1] else 'NAN')

df_item_cat['item_type'] = item_type_series
df_item_cat['item_info'] = item_info_series

# One row per (category id, type, info) combination, with 32-bit numeric columns.
df_item_cat = downcast(
    df_item_cat[['item_category_id','item_type','item_info']].drop_duplicates()
)

# Attach the parsed category parts to every train and test frame.
x_train_list = [frame.merge(df_item_cat,on=['item_category_id'],how='left') for frame in x_train_list]
x_test_list = [frame.merge(df_item_cat,on=['item_category_id'],how='left') for frame in x_test_list]

In [11]:
# Run a garbage-collection pass (presumably to release the frames replaced by
# the merges above); the notebook echoes the value returned by gc.collect().
gc.collect()

91

# Some item_types have unreliable price data

In [12]:
# Item types whose price information is not trusted. Values must match the
# parsed item_type strings exactly, including any trailing space.
# NOTE(review): 'Служебные' is the only entry without a trailing space - confirm
# that this is intended.
item_types_with_unreliable_pricing = [
    'Игры PC ',
    'Кино ',
    'Игры ',
    'Подарки ',
    'Служебные',
]
def remove_unreliable_pricing(df):
    """Set ``prop_median_item_price`` to 1 for item types with unreliable prices.

    Rows whose ``item_type`` is listed in ``item_types_with_unreliable_pricing``
    are overwritten; no other column is touched. The frame is modified in
    place and also returned.
    """
    has_unreliable_price = df['item_type'].isin(item_types_with_unreliable_pricing)
    df.loc[has_unreliable_price, 'prop_median_item_price'] = 1
    return df

In [13]:
# Neutralise the price feature for the unreliable item types in every frame.
x_train_list = [remove_unreliable_pricing(frame) for frame in x_train_list]
x_test_list = [remove_unreliable_pricing(frame) for frame in x_test_list]

### Note: it is possible (this should be checked) that all of the `!Якутск` transactions happened early and were therefore not picked up. This should still be fixed, because datebackgen will change.

# Target Encoding

Target encoding is based on historical information that is available for both the train and test datasets; as such, we needn't worry about regularisation specifically for target encodings.

In [14]:
def kfold_target_encode(x_train,y_train,x_test,cols,k=5,alpha=0):
    """Add mean-target encodings of ``cols`` to the train and test frames.

    For every column ``c`` in ``cols`` a column named
    ``c + '_encoded_alpha_' + str(alpha)`` is created.

    * Test rows get the per-category target mean computed on all of
      ``x_train``; categories absent from ``x_train`` get the global mean.
    * Train rows are encoded out-of-fold: the data is split into ``k``
      shuffled folds (``random_state=0``) and each fold is encoded with
      ``(mean*count + globmean*alpha)/(alpha+count)`` computed on the other
      folds. Categories unseen in those folds get the global mean.

    Args:
        x_train: training features.
        y_train: training target, assigned to ``x_train`` as a column (a
            Series is aligned on the index, an array is taken positionally).
        x_test: test features. Modified in place and returned.
        cols: names of the columns to encode. May be empty.
        k: number of folds.
        alpha: smoothing strength towards the global mean (train rows only).

    Returns:
        ``(x_train_encoded, x_test)``. ``x_train_encoded`` is a new frame with
        a fresh ``0..n-1`` index and no ``target`` column; the frame passed in
        as ``x_train`` is left untouched.
    """
    # Work on a copy so the caller's frame never ends up carrying the target.
    # The fresh index lets fold positions be used directly further down.
    x_train = x_train.assign(target=y_train).reset_index(drop=True)
    globmean = x_train.target.mean()

    # Test data: plain per-category mean over the whole training set.
    # NOTE(review): alpha is not applied here although it is part of the column
    # name and is applied to the train rows - confirm this is intended.
    for col in cols:
        encoded_name = col+'_encoded_alpha_'+str(alpha)
        agged_mean = x_train.groupby(col).target.mean()
        # Assign the filled column directly; a chained fillna(inplace=True)
        # does not reliably write back to the frame.
        x_test[encoded_name] = x_test[col].map(agged_mean).fillna(globmean)

    # Train data: out-of-fold encoding so no row sees its own target.
    for col in cols:
        print('Encoding '+col)
        encoded_name = col+'_encoded_alpha_'+str(alpha)
        out_of_fold = np.full(len(x_train), np.nan)
        folds = model_selection.KFold(n_splits=k,shuffle=True,random_state=0)
        for compute_ind , map_ind in folds.split(x_train):
            fold_target = x_train.iloc[compute_ind].groupby(col).target
            mean = fold_target.mean()
            count = fold_target.count()
            encoding = (mean*count + globmean*alpha)/(alpha+count)
            out_of_fold[map_ind] = x_train[col].iloc[map_ind].map(encoding).to_numpy(dtype=float)
        # Fill the gaps once, after every fold has been written.
        x_train[encoded_name] = np.where(np.isnan(out_of_fold), globmean, out_of_fold)
    return x_train.drop('target',axis=1) , x_test

# HACK

In [15]:
# HACK: keep only the targets of the first split whose rows have
# date_block_num >= 25. The mask comes from x_train_list[0], so this assumes
# both objects still describe the same rows in the same order/index - TODO confirm.
# The threshold 25 is repeated inside the kfold_target_encode redefinition
# below; keep the two in sync.
y_train_list[0] = y_train_list[0][x_train_list[0].date_block_num>=25]

In [16]:
# Persist the filtered targets of the first split. The context manager closes
# (and therefore flushes) the file even if pickling fails part-way.
with open('../gen_data/saved_y_train_list[0].pickle','wb') as fh:
    pickle.dump(y_train_list[0],fh)

In [17]:
def kfold_target_encode(x_train,y_train,x_test,cols,k=5,alpha=0,min_date_block=25):
    """Add mean-target encodings of ``cols``, using only recent training rows.

    Training rows with ``date_block_num < min_date_block`` are dropped before
    anything is computed (HACK kept from the original, where the threshold was
    hard-coded to 25). For every column ``c`` in ``cols`` a column named
    ``c + '_encoded_alpha_' + str(alpha)`` is then created.

    * Test rows get the per-category target mean computed on the retained
      training rows; categories absent from them get the global mean.
    * Train rows are encoded out-of-fold: the data is split into ``k``
      shuffled folds (``random_state=0``) and each fold is encoded with
      ``(mean*count + globmean*alpha)/(alpha+count)`` computed on the other
      folds. Categories unseen in those folds get the global mean.

    Args:
        x_train: training features; needs a ``date_block_num`` column unless
            ``min_date_block`` is ``None``.
        y_train: training target, assigned to ``x_train`` as a column before
            filtering (a Series is aligned on the index, an array is taken
            positionally).
        x_test: test features. Modified in place and returned.
        cols: names of the columns to encode. May be empty.
        k: number of folds.
        alpha: smoothing strength towards the global mean (train rows only).
        min_date_block: smallest ``date_block_num`` to keep; ``None`` keeps
            every row.

    Returns:
        ``(x_train_encoded, x_test)``. ``x_train_encoded`` is a new frame
        holding only the retained rows, with a fresh ``0..n-1`` index and no
        ``target`` column; the frame passed in as ``x_train`` is left untouched.
    """
    # Attach the target on a copy so the caller's frame never carries it.
    x_train = x_train.assign(target=y_train)

    # HACK: restrict the encoding to recent months.
    if min_date_block is not None:
        x_train = x_train[x_train.date_block_num>=min_date_block]

    # The fresh index lets fold positions be used directly further down.
    x_train = x_train.reset_index(drop=True)
    globmean = x_train.target.mean()

    # Test data: plain per-category mean over the retained training rows.
    # NOTE(review): alpha is not applied here although it is part of the column
    # name and is applied to the train rows - confirm this is intended.
    for col in cols:
        encoded_name = col+'_encoded_alpha_'+str(alpha)
        agged_mean = x_train.groupby(col).target.mean()
        # Assign the filled column directly; a chained fillna(inplace=True)
        # does not reliably write back to the frame.
        x_test[encoded_name] = x_test[col].map(agged_mean).fillna(globmean)

    # Train data: out-of-fold encoding so no row sees its own target.
    for col in cols:
        print('Encoding '+col)
        encoded_name = col+'_encoded_alpha_'+str(alpha)
        out_of_fold = np.full(len(x_train), np.nan)
        folds = model_selection.KFold(n_splits=k,shuffle=True,random_state=0)
        for compute_ind , map_ind in folds.split(x_train):
            fold_target = x_train.iloc[compute_ind].groupby(col).target
            mean = fold_target.mean()
            count = fold_target.count()
            encoding = (mean*count + globmean*alpha)/(alpha+count)
            out_of_fold[map_ind] = x_train[col].iloc[map_ind].map(encoding).to_numpy(dtype=float)
        # Fill the gaps once, after every fold has been written.
        x_train[encoded_name] = np.where(np.isnan(out_of_fold), globmean, out_of_fold)
    return x_train.drop('target',axis=1) , x_test

In [18]:
#### Parallelise this

In [19]:
# Columns to encode. The wider set that was tried earlier is kept for reference:
#to_target_encode = ['shop_id','item_id','item_category_id','month','city','item_type','item_info']
to_target_encode = ['shop_id','item_id']

# Every alpha adds its own set of encoded columns on top of the frames produced
# by the previous alpha, because the lists are rebound at the end of each pass.
for alpha in KFOLD_ALPHAS:
    print('alpha='+str(alpha))
    # One (encoded train frame, encoded test frame) pair per split.
    jumbled_list = [
        kfold_target_encode(x_train,y_train,x_test,to_target_encode,KFOLD_K,alpha=alpha)
        for x_train , y_train, x_test in zip(x_train_list,y_train_list,x_test_list)
    ]
    # Transpose the list of pairs back into a train list and a test list.
    x_train_list , x_test_list = (list(group) for group in zip(*jumbled_list))

alpha=0
Encoding shop_id
Encoding item_id
Encoding shop_id
Encoding item_id


In [20]:
#x_train_list[0]

In [21]:
#y_train_list[0][y_train_list]

In [22]:
#y_train_list[0][x_train_list[0].date_block_num>=30]

### Save

In [23]:
# Write the encoded frames; the alpha list is part of the file name so runs
# with different settings do not overwrite each other. The context managers
# close (and therefore flush) each file even if pickling fails part-way.
with open('../gen_data/x_train_'+str(KFOLD_ALPHAS)+'_--features4.ipynb--.pickle','wb') as fh:
    pickle.dump(x_train_list,fh)
with open('../gen_data/x_test_'+str(KFOLD_ALPHAS)+'_--features4.ipynb--.pickle','wb') as fh:
    pickle.dump(x_test_list,fh)