# Build a DF relating item category name and shop name to shop ids and item ids

### Load packages

In [1]:
import pandas as pd
import pickle
import numpy as np
import re
import gc
from sklearn import preprocessing

### Read in Data

In [2]:
df = pd.read_csv('../gen_data/train2.csv')

y_train_list = pickle.load(open('../gen_data/y_train--features1.ipynb--.pickle','rb'))
x_train_list = pickle.load(open('../gen_data/x_train--features3.ipynb--.pickle','rb'))
x_test_list = pickle.load(open('../gen_data/x_test--features3.ipynb--.pickle','rb'))

### Parse City data

In [3]:
cities_encoder = preprocessing.LabelEncoder()
cities_series = pd.Series([re.search('(.*?) ',n).group() for n in df.shop_name],index=df.index)
cities_encoder.fit(cities_series)
df['city'] = cities_encoder.transform(cities_series)
del cities_series

### Parse item category data

In [4]:
item_type_encoder = preprocessing.LabelEncoder()
item_type_series = df.item_category_name.map(lambda x: re.split('-',x)[0])
item_type_encoder.fit(item_type_series)
df['item_type'] = item_type_encoder.transform(item_type_series)
del item_type_series

### parse more item_category data

In [5]:
item_info_encoder = preprocessing.LabelEncoder()
item_info_series = df.item_category_name.map(lambda x: '-'.join(re.split('-',x)[1:]) if len(re.split('-',x))>1 else 'NAN')
item_info_encoder.fit(item_info_series)
df['item_info'] = item_info_encoder.transform(item_info_series)
del item_info_series

### Select relevant columns in df

In [6]:
df = df[['shop_id','item_id','item_category_id','city','item_type','item_info']].drop_duplicates()

### function to downcast data types to 32 bits

In [7]:
def downcast(df):
    float_cols = [col for col in df if df[col].dtype=='float64']
    int_cols = [col for col in df if df[col].dtype=='int64']

    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols] = df[int_cols].astype(np.int32)
    
    return(df)

### cleanup

In [8]:
df = downcast(df)
gc.collect()

7

### Merge data above into training and test sets

In [9]:
x_train_list = list(map(lambda x: x.merge(df,on=['item_id','item_category_id','shop_id'],how='left'),x_train_list))
x_test_list = list(map(lambda x: x.merge(df,on=['item_id','item_category_id','shop_id'],how='left'),x_test_list))

# Target Encoding

Target encoding is based on historical information, available for both train and test datasets, as such, we needn't worry about regularisation specifically for target encodings.

# SHOULDNT BE USING Y_TRAIN_LIST[0]

In [92]:
x_train_list = [target_encode(x_train,y_train,['shop_id','item_id','item_category_id','month','city','item_type','item_info'],fillna=0) for x_train , y_train in zip(x_train_list,y_train_list)]
x_test_list = [target_encode(x_test,y_test,['shop_id','item_id','item_category_id','month','city','item_type','item_info'],fillna=0) for x_test , y_test in zip(x_test_list,y_test_list)]

Encoding variable: shop_id
Encoding variable: item_id
Encoding variable: item_category_id
Encoding variable: month
Encoding variable: city
Encoding variable: item_type
Encoding variable: item_info
Encoding variable: shop_id
Encoding variable: item_id
Encoding variable: item_category_id
Encoding variable: month
Encoding variable: city
Encoding variable: item_type
Encoding variable: item_info


In [12]:
# need to add bool: item never sold before
                    #shop never sold before
# and int: num item sold
# num shop sold
# FOR EACH MONTH

#this is target encoding filling nans with 0 except missing early data

#doing this early in pipeline would be easy

#going back for all time might be a bad idea, the nature of the data would change thru time, (summingup)
#instaed just use dateback range

In [13]:
[print(n) for n in x_train_list[0].columns]

date_block_num
shop_id
item_id
item_category_id
sum_item_sales_back_1
sum_shop_sales_back_1
item_cnt_month_back_1
sum_item_cat_sales_back_1
sum_item_cat_shop_sales_back_1
sum_item_sales_back_2
sum_shop_sales_back_2
item_cnt_month_back_2
sum_item_cat_sales_back_2
sum_item_cat_shop_sales_back_2
sum_item_sales_back_3
sum_shop_sales_back_3
item_cnt_month_back_3
sum_item_cat_sales_back_3
sum_item_cat_shop_sales_back_3
sum_item_sales_back_4
sum_shop_sales_back_4
item_cnt_month_back_4
sum_item_cat_sales_back_4
sum_item_cat_shop_sales_back_4
sum_item_sales_back_5
sum_shop_sales_back_5
item_cnt_month_back_5
sum_item_cat_sales_back_5
sum_item_cat_shop_sales_back_5
sum_item_sales_back_6
sum_shop_sales_back_6
item_cnt_month_back_6
sum_item_cat_sales_back_6
sum_item_cat_shop_sales_back_6
sum_item_sales_back_7
sum_shop_sales_back_7
item_cnt_month_back_7
sum_item_cat_sales_back_7
sum_item_cat_shop_sales_back_7
sum_item_sales_back_8
sum_shop_sales_back_8
item_cnt_month_back_8
sum_item_cat_sales_back_8

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [14]:
dfout = x_train_list[0]

In [15]:
df_items = dfout.filter(regex='sum_item_sales_back_\d+$')
df_items.sum(axis=1)
dfout.shape

(6186922, 80)

In [16]:
#create variables to quantify how much we know about a data point
#For each shop_id and item_id create sum of recent history(total sales for dateback_range)
#create bool if 0 indicating new shop or item
def generate_recent_history_summary(df):
    df_items = df.filter(regex='sum_item_sales_back_\d+$')
    df['historical_item_sales'] = df_items.sum(axis=1)
    df['new_item'] = (df.historical_item_sales<0.5)*1
    
    df_shops = df.filter(regex='sum_shop_sales_back_\d+$')
    df['historical_shop_sales'] = df_shops.sum(axis=1)
    df['new_shop'] = (df.historical_shop_sales<0.5)*1
    
    df_item_cats = df.filter(regex='sum_item_cat_sales_back_\d+$')
    df['historical_item_cat_sales'] = df_item_cats.sum(axis=1)
    df['new_item_cat'] = (df.historical_item_cat_sales<0.5)*1
    return df
    

In [17]:
x_train_list = list(map(generate_recent_history_summary,x_train_list))
x_test_list = list(map(generate_recent_history_summary,x_test_list))

In [18]:
x_train_list[0][['item_id','historical_item_sales']]

Unnamed: 0,item_id,historical_item_sales
0,32,0.0
1,33,118.0
2,99,0.0
3,482,1097.0
4,485,818.0
5,804,144.0
6,839,307.0
7,1007,463.0
8,1406,16.0
9,1407,726.0


### Save

In [19]:
pickle.dump(x_train_list,open('../gen_data/x_train--strings.ipynb--.pickle','wb'))
pickle.dump(x_test_list,open('../gen_data/x_test--strings.ipynb--.pickle','wb'))

In [95]:
df = x_train_list[0][['date_block_num','item_id','item_id_encoded','historical_item_sales']].copy()
df.columns = ['date_block_num','item_id','ORIGINAL_item_id_encoded','historical_item_sales']
df.head()

Unnamed: 0,date_block_num,item_id,ORIGINAL_item_id_encoded,historical_item_sales
0,12,32,697.0,0.0
1,12,33,402.0,118.0
2,12,99,71.0,0.0
3,12,482,1425.0,1097.0
4,12,485,1027.0,818.0


In [109]:
#work on this
#this isnt working properly, drop it, I have almost the same information from elsewhere
def target_encode(df,target,cols,fillna=0):
    maxmonth = df.date_block_num.max()
    print(maxmonth)
    df['target'] = target
    for col in cols:
        print('Encoding variable: '+col)
        #dont use current data, that way we can do this for both train and test sets
        agged_targ = df[df.date_block_num<maxmonth].groupby(col).target.sum()
        agged_targ = df[(df.date_block_num<maxmonth) & (df.date_block_num>=(maxmonth-12))].groupby(col).target.sum()
        #agged_targ = df[df.date_block_num>20].groupby(col).target.sum()
        
        
        
        df[col+'_encoded'] = df[col].map(agged_targ)
        if fillna=='mean':
            df[col].fillna(df(col).mean(),inplace=True)
        elif fillna=='median':
            df[col].fillna(df(col).median(),inplace=True)
        else:
            df[col].fillna(0,inplace=True)
    return df.drop('target',axis=1)

### Im counting back the months from max month, but historical is rolling for each date, historical is bad becuase it double counts, or does it?

In [115]:
targs = y_train_list[0]
target_encode(df,targs,['item_id'])
g = df[(df.date_block_num<32) & (df.date_block_num>=20)].groupby('item_id').target.sum()
df['item_id_encoded_manual'] = df.item_id.map(g)
df.head()

32
Encoding variable: item_id


Unnamed: 0,date_block_num,item_id,ORIGINAL_item_id_encoded,historical_item_sales,target,item_id_encoded,item_id_encoded_manual
0,12,32,697.0,0.0,1.0,374.0,374.0
1,12,33,402.0,118.0,1.0,198.0,198.0
2,12,99,71.0,0.0,1.0,4.0,4.0
3,12,482,1425.0,1097.0,2.0,735.0,735.0
4,12,485,1027.0,818.0,1.0,65.0,65.0


In [None]:
21 ... 31

In [45]:
df = x_train_list[1]
targs = y_train_list[1]
df['target'] = targs
df_small = df[['item_id','item_id_encoded','target','historical_item_sales']]

In [46]:
g = df.groupby('item_id').target.sum()
g.head(40)

item_id
0       1.0
1       6.0
2       2.0
3       2.0
4       1.0
5       1.0
6       1.0
7       1.0
8       2.0
9       1.0
10      1.0
11      1.0
13      1.0
14      1.0
15      1.0
16      1.0
17      1.0
18      1.0
20      1.0
21      1.0
22      1.0
23      1.0
24      1.0
25      1.0
26      1.0
27      6.0
28      1.0
29      1.0
30    301.0
31    342.0
32    719.0
33    418.0
34     13.0
35      2.0
36     25.0
37     26.0
38     67.0
39      3.0
40     35.0
41      1.0
Name: target, dtype: float32

In [47]:
df_small['item_id_encoded_manual'] = df.item_id.map(g)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [48]:
df_small.head()

Unnamed: 0,item_id,item_id_encoded,target,historical_item_sales,item_id_encoded_manual
0,32,697.0,1.0,0.0,719.0
1,33,402.0,1.0,118.0,418.0
2,99,71.0,1.0,0.0,71.0
3,482,1425.0,2.0,1097.0,1471.0
4,485,1027.0,1.0,818.0,1027.0


In [30]:
df_small.head(15)

Unnamed: 0,item_id,item_id_encoded,target
0,32,697.0,1.0
1,33,402.0,1.0
2,99,71.0,1.0
3,482,1425.0,2.0
4,485,1027.0,1.0
5,804,27.0,1.0
6,839,602.0,1.0
7,1007,507.0,1.0
8,1406,979.0,1.0
9,1407,229.0,0.0


In [36]:
df[(df.item_id==32)].target.sum()

719.0

In [28]:
#df[['item_id','item_idencoded']].sort_values('item_id').head(1000)

In [51]:
df100 = df.sample(100)
df100

Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,sum_item_sales_back_1,sum_shop_sales_back_1,item_cnt_month_back_1,sum_item_cat_sales_back_1,sum_item_cat_shop_sales_back_1,sum_item_sales_back_2,...,item_type_encoded,item_info_encoded,historical_item_sales,new_item,historical_shop_sales,new_shop,historical_item_cat_sales,new_item_cat,target,item_id_encoded_manual
6192161,33,2,11275,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0.0,1,0.0,1,0.0,1,0.0,7.0
3934805,23,59,3902,0.0,0.0,0.0,0.0,0.0,0.0,7.0,...,,,21.0,0,2156.0,0,20520.0,0,0.0,152.0
2080469,18,16,7821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0.0,1,0.0,1,0.0,1,0.0,73.0
1587007,16,46,17882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0.0,1,0.0,1,0.0,1,0.0,239.0
1692345,17,4,2446,0.0,0.0,0.0,0.0,0.0,0.0,59.0,...,,,644.0,0,8263.0,0,54742.0,0,0.0,653.0
519371,13,31,2178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,10.0,0,9042.0,0,11539.0,0,0.0,27.0
984101,14,55,19154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0.0,1,0.0,1,0.0,1,0.0,42.0
2578428,19,47,9065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0.0,1,0.0,1,0.0,1,0.0,352.0
2965790,21,2,16316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0.0,1,0.0,1,0.0,1,0.0,454.0
4767187,26,52,7794,29.0,0.0,0.0,0.0,0.0,0.0,0.0,...,307099.0,9100.0,716.0,0,8358.0,0,3626.0,0,0.0,830.0
