In [1]:
import numpy as np
import pandas as pd
import pickle
from itertools import product
import gc
import time
import re

In [2]:
# Number of previous months for which lagged copies of the aggregate
# features are generated (used as range(1, DATEBACK_DIST+1) further down).
DATEBACK_DIST = 2

### function to downcast data types to 32 bits

In [3]:
def downcast(df):
    """Shrink every 64-bit numeric column of ``df`` to its 32-bit counterpart.

    ``float64`` columns become ``float32`` and ``int64`` columns become
    ``int32``; columns of any other dtype are left alone. The frame is
    modified in place and also returned, so the function can be used with
    ``map``.
    """
    wide_floats = []
    wide_ints = []
    for name in df:
        kind = df[name].dtype
        if kind == 'float64':
            wide_floats.append(name)
        if kind == 'int64':
            wide_ints.append(name)

    df[wide_floats] = df[wide_floats].astype(np.float32)
    df[wide_ints] = df[wide_ints].astype(np.int32)

    return df

In [4]:
# Load the pickled train/test frame lists and the CSV lookup tables.
# Each pickle is opened in a `with` block so its file handle is closed
# right after loading.
# NOTE: pickle.load can execute arbitrary code; only load files that were
# generated by this project.
with open('../gen_data/x_train--simple_validation_split.ipynb--.pickle','rb') as fh:
    x_train_list = pickle.load(fh)
with open('../gen_data/y_train--simple_validation_split.ipynb--.pickle','rb') as fh:
    y_train_list = pickle.load(fh)
with open('../gen_data/test_data_enriched--enrich1.ipynb--.pickle','rb') as fh:
    x_test_list = pickle.load(fh)

items_data = pd.read_csv('../original_data/items.csv')
sales = pd.read_csv('../gen_data/train1.csv')
df_item_cat = pd.read_csv('../gen_data/train2.csv')

In [5]:
# Downcast every train and test frame to 32-bit numeric dtypes.
x_train_list = [downcast(frame) for frame in x_train_list]
x_test_list = [downcast(frame) for frame in x_test_list]

# Generate new item variable

In [6]:
# Start with the sales data, find the first month in which each item was sold,
# merge that onto the train/test frame, then turn it into a 1/0 flag.
def new_item_var(df, sales_df=None):
    """Add ``first_sale_date_block`` and a binary ``new_item`` column to ``df``.

    Parameters
    ----------
    df : DataFrame
        Must contain ``item_id`` and ``date_block_num``.
    sales_df : DataFrame, optional
        Sales records with ``item_id`` and ``date_block_num``. Defaults to
        the module-level ``sales`` table.

    Returns
    -------
    DataFrame
        A new frame (the result of a left merge, so the index is reset) with
        the two extra columns. ``new_item`` is 1 when the item's first
        recorded sale month is greater than or equal to the row's month.
    """
    if sales_df is None:
        sales_df = sales

    # One row per item: the earliest month in which it appears in the sales.
    first_sale = (
        sales_df.groupby('item_id', as_index=False)
        .date_block_num.min()
        .rename(columns={'date_block_num': 'first_sale_date_block'})
    )
    df = df.merge(first_sale, on='item_id', how='left')

    # Items absent from the sales table (e.g. test-only items) get a sentinel
    # month of 999 so they are always flagged as new. Assign the result back
    # instead of calling fillna(inplace=True) on the column accessor: the
    # chained in-place form does not update the frame under Copy-on-Write.
    df['first_sale_date_block'] = df['first_sale_date_block'].fillna(999)
    df['new_item'] = (df.first_sale_date_block >= df.date_block_num)*1

    return df

In [7]:
# Add the new-item flag to every train and test frame, then drop the raw
# sales table and run a garbage-collection pass.
x_train_list = [new_item_var(frame) for frame in x_train_list]
x_test_list = [new_item_var(frame) for frame in x_test_list]
del sales
gc.collect()

209

### Parse City data

In [8]:
# Work on an independent (deep) copy so adding columns to df_city does not
# touch df_item_cat.
df_city = df_item_cat.copy(deep=True)

In [9]:
# City label per shop: the text matched by '(.*?) ' in shop_name, i.e. the
# leading text through the first space (the trailing space is kept).
cities_series = df_city.shop_name.map(lambda shop: re.search('(.*?) ',shop).group())
df_city['city'] = cities_series

# Reduce to one row per (shop_id, city) pair and downcast the numeric id.
df_city = downcast(df_city[['shop_id','city']].drop_duplicates())

# Attach the city to every train and test frame.
x_train_list = [frame.merge(df_city,on=['shop_id'],how='left') for frame in x_train_list]
x_test_list = [frame.merge(df_city,on=['shop_id'],how='left') for frame in x_test_list]

### Parse item category data

In [10]:
# item_type: everything before the first '-' of the category name.
item_type_series = df_item_cat.item_category_name.map(lambda name: name.split('-')[0])
df_item_cat['item_type'] = item_type_series

# item_info: everything after the first '-', or 'NAN' when there is no '-'.
item_info_series = df_item_cat.item_category_name.map(
    lambda name: name.split('-', 1)[1] if '-' in name else 'NAN'
)
df_item_cat['item_info'] = item_info_series

# Reduce to one row per category and downcast the numeric id.
df_item_cat = downcast(
    df_item_cat[['item_category_id','item_type','item_info']].drop_duplicates()
)

# Attach item_type / item_info to every train and test frame.
x_train_list = [frame.merge(df_item_cat,on=['item_category_id'],how='left') for frame in x_train_list]
x_test_list = [frame.merge(df_item_cat,on=['item_category_id'],how='left') for frame in x_test_list]

In [11]:
# Force a garbage-collection pass; as the last expression of the cell, the
# return value of gc.collect() is what gets displayed.
gc.collect()

91

# Some item_types have unreliable price data

In [12]:
# Item types whose price information is treated as unreliable. Most labels
# end in a space, so they are compared exactly as written.
item_types_with_unreliable_pricing = [
    'Игры PC ',
    'Кино ',
    'Игры ',
    'Подарки ',
    'Служебные',
]


def remove_unreliable_pricing(df):
    """Set ``prop_median_item_price`` to 1 for rows with an unreliable item type.

    Rows whose ``item_type`` is listed in ``item_types_with_unreliable_pricing``
    are overwritten in place; the same frame is returned.
    """
    unreliable_rows = df['item_type'].isin(item_types_with_unreliable_pricing)
    df.loc[unreliable_rows, 'prop_median_item_price'] = 1
    return df

In [13]:
# Neutralise the unreliable price feature in every train and test frame.
x_train_list = [remove_unreliable_pricing(frame) for frame in x_train_list]
x_test_list = [remove_unreliable_pricing(frame) for frame in x_test_list]

In [14]:
# Remove the free-text name columns.
def drop_names(df):
    """Return a copy of ``df`` without the item, shop and category name columns."""
    name_columns = ['item_name','shop_name','item_category_name']
    return df.drop(columns=name_columns)

In [15]:
# Drop the name columns from every train and test frame.
x_train_list = [drop_names(frame) for frame in x_train_list]
x_test_list = [drop_names(frame) for frame in x_test_list]

### Create Historical Vars

In [16]:
# Attach each target series to its training frame as the column
# `item_cnt_month` (the frames are modified in place).
# NOTE(review): assigning a Series aligns on index — presumably every y_train
# shares the index of its frame; confirm.
# NOTE: the loop variable `train` stays bound at module scope after the loop,
# so any later code that reads a global named `train` sees the last frame of
# x_train_list.
for train , y_train in zip(x_train_list,y_train_list):
    train['item_cnt_month'] = y_train.copy()
    

In [17]:
def merge_data(df,variables,newvarname):
    """Add the per-group sum of ``item_cnt_month`` to ``df`` as a new column.

    Parameters
    ----------
    df : DataFrame
        Frame containing ``item_cnt_month`` and every column in ``variables``.
    variables : list of str
        Columns to group by (and to merge back on).
    newvarname : str
        Name of the column that receives the group sum.

    Returns
    -------
    DataFrame
        ``df`` left-merged with the aggregated sums.
    """
    # Aggregate the frame that was passed in. Reading a module-level `train`
    # here would silently use whichever frame was last bound to that name.
    var_sales = (
        df.groupby(variables, as_index=False)
        .item_cnt_month.sum()
        .rename(columns={'item_cnt_month': newvarname})
    )
    # Name the join keys explicitly instead of relying on "all shared columns".
    return df.merge(var_sales, on=variables, how='left')

In [18]:
def create_cat_vars_2(train):
    """Add twelve same-month sales aggregates (``*_back_0`` columns) to ``train``.

    Each aggregate is produced by ``merge_data`` using ``date_block_num``
    plus the listed grouping columns; the frame returned by the last call is
    the result.
    """
    aggregate_specs = [
        (['item_id'], 'sum_item_sales_back_0'),
        (['shop_id'], 'sum_shop_sales_back_0'),
        (['shop_id','item_id'], 'item_cnt_month_back_0'),
        (['item_category_id'], 'sum_item_cat_sales_back_0'),
        (['shop_id','item_category_id'], 'sum_item_cat_shop_sales_back_0'),
        (['city'], 'sum_city_back_0'),
        (['item_type'], 'sum_itemtype_back_0'),
        (['item_info'], 'sum_iteminfo_back_0'),
        (['city','item_id'], 'sum_city_item_back_0'),
        (['city','item_category_id'], 'sum_city_item_cat_back_0'),
        (['item_type','shop_id'], 'sum_itemtype_shop_back_0'),
        (['item_type','city'], 'sum_itemtype_city_back_0'),
    ]
    for group_keys, feature_name in aggregate_specs:
        train = merge_data(train, ['date_block_num'] + group_keys, feature_name)
    return train

In [20]:
# Add the same-month aggregate columns to every training frame.
x_train_list = [create_cat_vars_2(frame) for frame in x_train_list]

### Generate 0 Entries


We need to include entries for item/shop pairs that had 0 sales in a month.
To keep this from getting out of hand, we only generate the item/shop pairs built from the shops and items that had sales in that month;
this is what the Coursera course did, see outside/Programming_assignment_week_4.ipynb for more info.

In [21]:
def gen_zeros(train):
    """Expand ``train`` to the full monthly shop x item grid, zero-filling gaps.

    For every month between the smallest and largest ``date_block_num``, all
    combinations of the shops and items present in that month are generated.
    The grid is left-merged with ``train``, ``item_category_id`` is re-read
    from the module-level ``items_data`` table and moved to the fourth
    column, and every remaining NaN is replaced by 0.

    NOTE(review): the final ``fillna(0)`` covers the whole frame, so for
    generated rows every column taken from ``train`` (not only
    ``item_cnt_month``) ends up as 0 — confirm this is intended.
    """
    key_cols = ['date_block_num','shop_id','item_id']
    first_month = train.date_block_num.min()
    last_month = train.date_block_num.max()

    grid_rows = []
    print('Computing for month:',end=' ')
    for month in range(first_month, last_month + 1):
        print(month,end=', ')
        month_rows = train[train.date_block_num == month]
        shops = month_rows.shop_id.unique()
        items = month_rows.item_id.unique()
        # Shop-major ordering, exactly as itertools.product yields it.
        grid_rows.append([(month, shop, item) for shop, item in product(shops, items)])

    train_filled = pd.DataFrame(np.vstack(grid_rows), columns=key_cols)
    print(train_filled.shape)

    train_filled = train_filled.merge(train, how='left', on=key_cols)
    print(train_filled.shape)

    # Generated rows have no category yet: drop the partially filled column
    # and look it up again from the items table.
    category_lookup = items_data[['item_id','item_category_id']]
    train_filled = train_filled.drop('item_category_id',axis=1).merge(
        category_lookup, on='item_id', how='left'
    )

    # Move the re-merged category (now the last column) right behind the keys.
    cols = train_filled.columns.tolist()
    train_filled = train_filled[cols[:3] + cols[-1:] + cols[3:-1]]

    # Rows that exist only in the generated grid are NaN at this point.
    return train_filled.fillna(0)

In [22]:
# Expand every training frame to the full monthly shop/item grid.
x_train_list = [gen_zeros(frame) for frame in x_train_list]

Computing for month: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, (10675678, 3)
(10675678, 23)
Computing for month: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, (10913850, 3)
(10913850, 23)


### Create Lag Values

In [23]:
# Same-month aggregate columns for which lagged copies are generated.
newvars = [
    'sum_item_sales_back_0',
    'sum_shop_sales_back_0',
    'item_cnt_month_back_0',
    'sum_item_cat_sales_back_0',
    'sum_item_cat_shop_sales_back_0',
    'sum_city_back_0',
    'sum_itemtype_back_0',
    'sum_iteminfo_back_0',
    'sum_city_item_back_0',
    'sum_city_item_cat_back_0',
    'sum_itemtype_shop_back_0',
    'sum_itemtype_city_back_0',
]

In [24]:
# TODO: could this be made more efficient?
def create_lag_train(train_filled):
    """Append lagged copies of the ``newvars`` columns to a training frame.

    For each ``dateback`` in ``1..DATEBACK_DIST`` the key columns and
    ``newvars`` are copied, the copy's ``date_block_num`` is moved forward by
    ``dateback`` months, the trailing character of every feature name is
    replaced by ``dateback`` and the copy is left-merged back on
    (date_block_num, shop_id, item_id). NaNs left by the merge are set to 0.
    Rows with ``date_block_num < DATEBACK_DIST`` are removed at the end.

    The input frame is downcast in place before the lags are built.
    """
    key_cols = ['date_block_num','shop_id','item_id']

    lag_train_filled = downcast(train_filled)
    del train_filled
    print('dateback=',end=' ')
    gc.collect()

    for dateback in range(1, DATEBACK_DIST+1):
        shifted = lag_train_filled[key_cols + newvars].copy()
        # A value observed in month m becomes the "back_<dateback>" value of
        # month m + dateback.
        shifted['date_block_num'] = shifted.date_block_num + dateback
        shifted.columns = key_cols + [name[0:-1]+str(dateback) for name in newvars]
        print(dateback,end=', ')
        lag_train_filled = lag_train_filled.merge(shifted,on=key_cols,how='left').fillna(0)
        del shifted
        gc.collect()

    # Drop the first DATEBACK_DIST month numbers.
    # NOTE(review): presumably months start at 0 so these are the months
    # without a complete lag history — confirm.
    has_full_history = lag_train_filled.date_block_num >= DATEBACK_DIST
    return lag_train_filled[has_full_history]

In [25]:
# Build the lag features for every training frame and report the wall-clock
# time in minutes.
start = time.time()
x_train_list = [create_lag_train(frame) for frame in x_train_list]
end = time.time()
print('Runtime: ' + str((end - start) / 60))

dateback= 1, 2, dateback= 1, 2, Runtime: 3.1014426271120707


In [26]:
# Shape (rows, columns) of the first training frame after the lag features
# were added.
x_train_list[0].shape

(9934775, 47)

In [27]:
# Minutes elapsed since `start` was recorded in the timing cell above.
(time.time() - start)/60

3.1019689122835796

In [28]:
# Expected row count: 6425094. The column count depends on DATEBACK_DIST:
# each lag month adds len(newvars) = 12 columns, so the 167 columns quoted
# originally presumably correspond to 12 lag months.
x_train_list[1][x_train_list[1].date_block_num>=12].shape
# This should have 6425094 rows to mimic the course script (except that this
# notebook uses all of the shops).

(6425094, 47)

In [29]:
def create_lag_test(tup):
    """Attach lagged ``newvars`` features from a training frame to a test frame.

    ``tup`` is a ``(train, test)`` pair. The reference month is the mean of
    ``test.date_block_num``. For each ``dateback`` in ``1..DATEBACK_DIST``
    the training rows of month ``reference - dateback`` are relabelled with
    the reference month, their feature names get ``dateback`` in place of the
    trailing character, and they are left-merged into the test frame on the
    columns both frames share. NaNs left by the merge are set to 0.

    NOTE(review): using the mean as the reference month presumably relies on
    the test frame holding a single month — confirm.
    """
    train, test = tup[0], tup[1]
    test = downcast(test)
    ref_date_block = test.date_block_num.mean()
    key_cols = ['date_block_num','shop_id','item_id']

    print('Getting information from month:')
    for dateback in range(1, DATEBACK_DIST+1):
        source_month = ref_date_block - dateback
        print(str(source_month),end=', ')
        hist_data = train[train.date_block_num==source_month][key_cols + newvars]
        # Relabel the historical rows so they line up with the test month.
        hist_data.date_block_num = ref_date_block
        hist_data.columns = key_cols + [name[0:-1]+str(dateback) for name in newvars]
        test = test.merge(hist_data,how='left').fillna(0)
    return test

In [30]:
# Pair each test frame with its training frame and add the lag features.
x_test_list = [create_lag_test(pair) for pair in zip(x_train_list, x_test_list)]

Getting information from month:
32.0, 31.0, Getting information from month:
33.0, 32.0, 

In [31]:
def get_y_train(x_train):
    """Return an independent copy of the ``item_cnt_month`` column of ``x_train``."""
    target = x_train['item_cnt_month']
    return target.copy()

In [32]:
# Re-extract the targets from the expanded training frames.
y_train_list = [get_y_train(frame) for frame in x_train_list]

In [33]:
def clear_train(train):
    """Return ``train`` without the target and the same-month aggregates.

    Drops ``item_cnt_month`` and every column listed in the module-level
    ``newvars``; the input frame itself is not modified.
    """
    columns_to_drop = ['item_cnt_month'] + newvars
    return train.drop(columns=columns_to_drop)

In [34]:
# Strip the target and same-month aggregates from every training frame.
x_train_list = [clear_train(frame) for frame in x_train_list]

### Realign y_train with x_train

In [35]:
def reset_train_indicies(series):
    """Return ``series`` with a fresh 0..n-1 index.

    Values and the series name are kept; the old index is discarded. Using
    ``drop=True`` avoids the frame round-trip and also works when the old
    index carries a name.
    """
    return series.reset_index(drop=True)

In [36]:
# Give every target series a fresh positional index.
y_train_list = [reset_train_indicies(target) for target in y_train_list]

### Save

In [None]:
# Persist the engineered train/test frames and the targets. Each file is
# opened in a `with` block so it is flushed and closed as soon as it has been
# written.
print('saving...')
with open('../gen_data/x_train--features1.1.ipynb--.pickle','wb') as fh:
    pickle.dump(x_train_list,fh)
print('saving...')
with open('../gen_data/x_test--features1.1.ipynb--.pickle','wb') as fh:
    pickle.dump(x_test_list,fh)
print('saving...')
with open('../gen_data/y_train--features1.1.ipynb--.pickle','wb') as fh:
    pickle.dump(y_train_list,fh)