# Build a DF relating item category name and shop name to shop ids and item ids

### Load packages

In [1]:
import pandas as pd
import pickle
import numpy as np
import re
import gc
from sklearn import preprocessing

### Read in Data

In [None]:
# Source table; later cells read its shop_name, item_category_name, shop_id,
# item_id and item_category_id columns.
df = pd.read_csv('../gen_data/train2.csv')

# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# this project generated itself.
# Each file is opened in a context manager so its handle is closed right after
# loading instead of lingering until garbage collection.
with open('../gen_data/y_train--features1.ipynb--.pickle', 'rb') as pickle_file:
    y_train_list = pickle.load(pickle_file)
with open('../gen_data/x_train--features3.ipynb--.pickle', 'rb') as pickle_file:
    x_train_list = pickle.load(pickle_file)
with open('../gen_data/x_test--features3.ipynb--.pickle', 'rb') as pickle_file:
    x_test_list = pickle.load(pickle_file)

### Parse City data

In [None]:
# Encode the leading part of every shop name as an integer `city` feature.
cities_encoder = preprocessing.LabelEncoder()
# The label is the text up to and including the first space of the shop name.
# NOTE(review): the trailing space is part of the label, and a shop name with
# no space at all makes re.search return None, so .group() would raise
# AttributeError - confirm every shop name contains a space.
city_labels = df['shop_name'].map(
    lambda shop_name: re.search('(.*?) ', shop_name).group()
)
cities_encoder.fit(city_labels)
df['city'] = cities_encoder.transform(city_labels)
del city_labels

### Parse item category data

In [None]:
# Encode the part of the category name before the first '-' as `item_type`.
item_type_encoder = preprocessing.LabelEncoder()
item_type_labels = df['item_category_name'].map(
    lambda category_name: category_name.split('-')[0]
)
item_type_encoder.fit(item_type_labels)
df['item_type'] = item_type_encoder.transform(item_type_labels)
del item_type_labels

### parse more item_category data

In [None]:
# Encode everything after the first '-' of the category name as `item_info`;
# category names without any '-' all share the placeholder label 'NAN'.
item_info_encoder = preprocessing.LabelEncoder()
item_info_labels = df['item_category_name'].map(
    lambda category_name: (
        category_name.partition('-')[2] if '-' in category_name else 'NAN'
    )
)
item_info_encoder.fit(item_info_labels)
df['item_info'] = item_info_encoder.transform(item_info_labels)
del item_info_labels

### Select relevant columns in df

In [None]:
# Keep the id columns plus the three encoded features, dropping repeated rows
# so that each remaining row is distinct.
df = df.loc[
    :, ['shop_id', 'item_id', 'item_category_id', 'city', 'item_type', 'item_info']
].drop_duplicates()

### function to downcast data types to 32 bits

In [None]:
def downcast(df):
    """Shrink the 64-bit numeric columns of ``df`` to 32 bits.

    Columns whose dtype is ``float64`` become ``float32`` and columns whose
    dtype is ``int64`` become ``int32``; every other column is left alone.
    The conversion is a plain ``astype``, so values are not range-checked.

    Args:
        df: DataFrame to convert. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Handle floats first, then ints, one (wide dtype, narrow dtype) pair at a time.
    for wide_dtype, narrow_dtype in (('float64', np.float32), ('int64', np.int32)):
        wide_cols = [name for name in df if df[name].dtype == wide_dtype]
        df[wide_cols] = df[wide_cols].astype(narrow_dtype)

    return df

### cleanup

In [None]:
# Convert the 64-bit numeric columns of the lookup table to 32 bits before it
# is merged into every train/test frame below.
df = downcast(df)
# Trigger a garbage-collection pass now (presumably to reclaim memory held by
# the temporaries deleted in the cells above).
gc.collect()

### Merge data above into training and test sets

In [None]:
# Left-join the lookup table onto every train and test frame, matching on
# item, category and shop ids, so each frame gains the city, item_type and
# item_info columns.
x_train_list = [
    frame.merge(df, on=['item_id', 'item_category_id', 'shop_id'], how='left')
    for frame in x_train_list
]
x_test_list = [
    frame.merge(df, on=['item_id', 'item_category_id', 'shop_id'], how='left')
    for frame in x_test_list
]

# Target Encoding

Target encoding is based on historical information that is available for both the train and test datasets. As such, we needn't worry about regularisation specifically for the target encodings.

In [None]:
def target_encode(df, target, cols, fillna=0):
    """Replace each column in ``cols`` by the summed target of its group.

    For every column, the rows of ``df`` are grouped by that column's values,
    ``target`` is summed within each group, and each value of the column is
    replaced by the sum belonging to its group.

    Side effects: ``df`` itself is modified. The encoded columns are
    overwritten and a ``'target'`` column is added to it. The returned frame
    is a new object without the ``'target'`` column.

    Args:
        df: DataFrame containing all columns named in ``cols``.
        target: Values assigned to ``df['target']`` before aggregating.
        cols: Iterable of column names (strings) to encode.
        fillna: How to fill entries that are still missing after the mapping.
            ``'mean'`` or ``'median'`` uses that statistic of the encoded
            column, a number is used as the fill value directly, and anything
            else falls back to 0.

    Returns:
        A DataFrame with the encoded columns and without ``'target'``.
    """
    df['target'] = target
    for col in cols:
        print('Encoding variable: '+col)
        # Alternative noted by the author: aggregate only the rows with
        # date_block_num > 20 instead of the full history.
        group_totals = df.groupby(col)['target'].sum()
        encoded = df[col].map(group_totals)

        if fillna == 'mean':
            fill_value = encoded.mean()
        elif fillna == 'median':
            fill_value = encoded.median()
        elif isinstance(fillna, (int, float, np.number)):
            # Honour an explicit numeric fill value (previously ignored).
            fill_value = fillna
        else:
            fill_value = 0

        # Assign the filled result back rather than filling in place on a
        # column selection, which is not guaranteed to update ``df``.
        df[col] = encoded.fillna(fill_value)
    return df.drop('target', axis=1)

In [None]:
# Target-encode the id, month and parsed-name columns of every frame.
# NOTE(review): every frame - including the test frames - is encoded against
# y_train_list[0]; confirm that this single target is the intended one for
# each frame and that it lines up with each frame's rows.
x_train_list = [
    target_encode(
        train,
        target=y_train_list[0],
        cols=['shop_id', 'item_id', 'item_category_id', 'month', 'city', 'item_type', 'item_info'],
        fillna=0,
    )
    for train in x_train_list
]
x_test_list = [
    target_encode(
        test,
        target=y_train_list[0],
        cols=['shop_id', 'item_id', 'item_category_id', 'month', 'city', 'item_type', 'item_info'],
        fillna=0,
    )
    for test in x_test_list
]

In [None]:
# TODO, for each month, need to add:
#   - bool: item never sold before
#   - bool: shop never sold before
#   - int: num item sold
#   - int: num shop sold

#this is target encoding filling nans with 0 except missing early data

#doing this early in pipeline would be easy

# Summing up over all time might be a bad idea: the nature of the data would change through time.
# Instead, just use the dateback range.

In [None]:
# Print the column names of the first training frame, one per line, without
# building a throwaway list of None values.
print(*x_train_list[0].columns, sep='\n')

In [None]:
# Alias (not a copy) of the first training frame, used for the ad-hoc
# inspection in the next cell.
dfout = x_train_list[0]

In [None]:
# Ad-hoc look at the columns named sum_item_sales_back_<number>.
# The pattern is a raw string so that \d reaches the regex engine instead of
# being parsed as an (invalid) string escape sequence.
df_items = dfout.filter(regex=r'sum_item_sales_back_\d+$')
# Row-wise total of those columns; the result is not stored.
df_items.sum(axis=1)
dfout.shape

In [None]:
#create variables to quantify how much we know about a data point
#For each shop_id and item_id create sum of recent history(total sales for dateback_range)
#create bool if 0 indicating new shop or item
def generate_recent_history_summary(df):
    """Add recent-history totals and "new" flags for item, shop and category.

    For each of the three column families ``sum_item_sales_back_<n>``,
    ``sum_shop_sales_back_<n>`` and ``sum_item_cat_sales_back_<n>`` this adds:

    * ``historical_<x>_sales``: row-wise sum over the family's columns;
    * ``new_<x>``: 1 where that sum is below 0.5, otherwise 0.

    Args:
        df: DataFrame holding the ``sum_*_sales_back_<n>`` columns. It is
            modified in place.

    Returns:
        The same DataFrame object, with the six columns added.
    """
    # (column-name pattern, total column, flag column). The patterns are raw
    # strings so that \d is passed to the regex engine rather than being
    # treated as an invalid string escape.
    summaries = (
        (r'sum_item_sales_back_\d+$', 'historical_item_sales', 'new_item'),
        (r'sum_shop_sales_back_\d+$', 'historical_shop_sales', 'new_shop'),
        (r'sum_item_cat_sales_back_\d+$', 'historical_item_cat_sales', 'new_item_cat'),
    )
    for pattern, total_col, flag_col in summaries:
        df[total_col] = df.filter(regex=pattern).sum(axis=1)
        # Multiplying the boolean mask by 1 turns it into a 0/1 integer flag.
        df[flag_col] = (df[total_col] < 0.5) * 1
    return df
    

In [None]:
# Add the historical totals and "new" flags to every train and test frame.
x_train_list = [generate_recent_history_summary(frame) for frame in x_train_list]
x_test_list = [generate_recent_history_summary(frame) for frame in x_test_list]

In [None]:
# Spot-check: show item_id next to the new historical_item_sales column for
# the first training frame.
x_train_list[0][['item_id','historical_item_sales']]

### Save

In [None]:
# Persist the augmented frames. Each file is opened in a context manager so it
# is flushed and closed as soon as its dump has finished.
with open('../gen_data/x_train--strings.ipynb--.pickle', 'wb') as train_file:
    pickle.dump(x_train_list, train_file)
with open('../gen_data/x_test--strings.ipynb--.pickle', 'wb') as test_file:
    pickle.dump(x_test_list, test_file)