In [1]:
import numpy as np
import pandas as pd
import pickle
from itertools import product
import gc
import time

In [2]:
# Number of past months for which lagged copies of each aggregate are generated;
# the same number of leading months is removed from the training frames afterwards.
DATEBACK_DIST = 12

In [3]:
#function to downcast data types to 32 bits
def downcast(df):
    """Shrink 64-bit numeric columns of ``df`` to their 32-bit counterparts.

    Every ``float64`` column becomes ``float32`` and every ``int64`` column
    becomes ``int32``; columns of any other dtype are left untouched.

    The frame is modified in place and the same object is returned.
    """
    # Decide which columns to touch before converting anything.
    conversions = [
        ([name for name in df if df[name].dtype == 'float64'], np.float32),
        ([name for name in df if df[name].dtype == 'int64'], np.int32),
    ]
    for wide_cols, narrow_type in conversions:
        df[wide_cols] = df[wide_cols].astype(narrow_type)

    return df

In [4]:
# Load the splits produced by the upstream notebooks. Each file is opened in a
# `with` block so the handle is closed as soon as the object has been read.
# NOTE(security): pickle.load can execute arbitrary code -- only load files that
# were generated by this project's own notebooks.
with open('../gen_data/x_train--simple_validation_split.ipynb--.pickle', 'rb') as fh:
    x_train_list = pickle.load(fh)
with open('../gen_data/y_train--simple_validation_split.ipynb--.pickle', 'rb') as fh:
    y_train_list = pickle.load(fh)
with open('../gen_data/test_data_enriched--enrich1.ipynb--.pickle', 'rb') as fh:
    x_test_list = pickle.load(fh)

# Item catalogue; used later to look up item_category_id by item_id.
items_data = pd.read_csv('../original_data/items.csv')

In [5]:
#drop name vars
def drop_names(df):
    """Return a copy of ``df`` without the three free-text name columns.

    Raises ``KeyError`` if any of ``item_name``, ``shop_name`` or
    ``item_category_name`` is absent.
    """
    name_columns = ['item_name', 'shop_name', 'item_category_name']
    return df.drop(columns=name_columns)

In [6]:
# Strip the name columns from every train and test frame.
x_train_list = [drop_names(frame) for frame in x_train_list]
x_test_list = [drop_names(frame) for frame in x_test_list]

It can be seen that this data is missing many entries with 0 item sales.

### Create Historical Vars

In [7]:
# Put each split's target back onto its feature frame (in place) so that the
# historical aggregates can be computed from it.
for features, target in zip(x_train_list, y_train_list):
    features['item_cnt_month'] = target.copy()
    

In [8]:
def create_cat_vars(train):
    """Add same-month sales totals at several grouping levels.

    For each grouping below, ``item_cnt_month`` is summed per group and the
    total is merged back onto every row of that group (left join on the
    grouping keys, stated explicitly so the join never depends on which other
    columns happen to be shared):

    ====================================================  ==============================
    grouping keys                                         new column
    ====================================================  ==============================
    date_block_num, item_id                               sum_item_sales_back_0
    date_block_num, shop_id                               sum_shop_sales_back_0
    date_block_num, shop_id, item_id                      item_cnt_month_back_0
    date_block_num, item_category_id                      sum_item_cat_sales_back_0
    date_block_num, shop_id, item_category_id             sum_item_cat_shop_sales_back_0
    ====================================================  ==============================

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the key columns above and ``item_cnt_month``.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input columns followed by the five new columns in the
        order listed. The input frame is not modified.
    """
    # TODO (original author): these totals are derived from the target of the
    # same month; consider out-of-fold or permutation schemes for the training
    # rows to avoid overfitting. Only the basic version is done for now.
    aggregate_specs = [
        (['date_block_num', 'item_id'], 'sum_item_sales_back_0'),
        (['date_block_num', 'shop_id'], 'sum_shop_sales_back_0'),
        (['date_block_num', 'shop_id', 'item_id'], 'item_cnt_month_back_0'),
        (['date_block_num', 'item_category_id'], 'sum_item_cat_sales_back_0'),
        (['date_block_num', 'shop_id', 'item_category_id'], 'sum_item_cat_shop_sales_back_0'),
    ]

    for keys, feature_name in aggregate_specs:
        totals = (
            train.groupby(keys, as_index=False)['item_cnt_month']
            .sum()
            .rename(columns={'item_cnt_month': feature_name})
        )
        train = train.merge(totals, how='left', on=keys)

    return train

In [9]:
# Add the same-month aggregate columns to every training split.
x_train_list = [create_cat_vars(frame) for frame in x_train_list]

### Generate 0 Entries


We need to include entries for item/shop pairs that had 0 sales in a month.
To keep the data size manageable, only the item/shop pairs formed from the shops and items that had sales in that month are generated.
This is what the Coursera course did; see outside/Programming_assignment_week_4.ipynb for more info.

In [10]:
def gen_zeros(train, items=None):
    """Expand ``train`` to the full shop x item grid of every month.

    For each month between the smallest and largest ``date_block_num``, every
    shop seen that month is paired with every item seen that month. Pairs that
    have no row in ``train`` are added with ``item_cnt_month`` (and any other
    pair-level column) set to 0.

    Group-level totals (``sum_item_sales_back_0``, ``sum_shop_sales_back_0``,
    ``sum_item_cat_sales_back_0``, ``sum_item_cat_shop_sales_back_0``), when
    present, are looked up by their own grouping keys for the added rows. A
    blanket zero-fill would wrongly report, for example, an item-level monthly
    total of 0 for an item that did sell in other shops that month.

    Parameters
    ----------
    train : pandas.DataFrame
        Needs ``date_block_num``, ``shop_id``, ``item_id`` and
        ``item_category_id``; ``date_block_num`` must be integer-valued.
    items : pandas.DataFrame, optional
        Lookup table with ``item_id`` and ``item_category_id``. Defaults to the
        notebook-level ``items_data``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date_block_num, shop_id, item_id, item_category_id`` followed
        by the remaining columns of ``train`` in their original order, with all
        remaining missing values replaced by 0.
    """
    key_cols = ['date_block_num', 'shop_id', 'item_id']
    if items is None:
        # Fall back to the item catalogue loaded at the top of the notebook.
        items = items_data

    months = range(train.date_block_num.min(), train.date_block_num.max() + 1)
    grids = []
    print('Computing for month:', end=' ')
    for month in months:
        print(month, end=', ')
        in_month = train[train.date_block_num == month]
        if in_month.empty:
            # Nothing was sold this month, so there are no pairs to generate
            # (an empty block would also make np.vstack fail below).
            continue
        month_shops = in_month.shop_id.unique()
        month_items = in_month.item_id.unique()
        # Cartesian product (shops outer, items inner) built directly in numpy
        # rather than through a Python list of tuples.
        grids.append(np.column_stack([
            np.full(len(month_shops) * len(month_items), month),
            np.repeat(month_shops, len(month_items)),
            np.tile(month_items, len(month_shops)),
        ]))

    train_filled = pd.DataFrame(np.vstack(grids), columns=key_cols)
    train_filled = train_filled.merge(train, how='left', on=key_cols)

    # item_category_id is missing on the generated rows: re-derive it for all
    # rows from the item catalogue and put it back right after the key columns.
    train_filled = train_filled.drop('item_category_id', axis=1).merge(
        items[['item_id', 'item_category_id']], on='item_id', how='left')
    leading = key_cols + ['item_category_id']
    trailing = [name for name in train_filled.columns if name not in leading]
    train_filled = train_filled[leading + trailing]

    # Totals that describe a whole group (not one shop/item pair) are known for
    # the generated rows too: copy them over from any existing row of the group.
    group_level_features = [
        (['date_block_num', 'item_id'], 'sum_item_sales_back_0'),
        (['date_block_num', 'shop_id'], 'sum_shop_sales_back_0'),
        (['date_block_num', 'item_category_id'], 'sum_item_cat_sales_back_0'),
        (['date_block_num', 'shop_id', 'item_category_id'], 'sum_item_cat_shop_sales_back_0'),
    ]
    for keys, feature in group_level_features:
        if feature not in train.columns:
            continue
        lookup = train[keys + [feature]].drop_duplicates(subset=keys)
        # Left join on unique keys keeps row order and count, so the result
        # lines up with train_filled row for row.
        looked_up = train_filled[keys].merge(lookup, on=keys, how='left')[feature]
        looked_up.index = train_filled.index
        train_filled[feature] = train_filled[feature].fillna(looked_up)

    # Whatever is still missing really is a zero: pair-level counts of generated
    # rows, and group totals of groups that had no sales that month.
    train_filled = train_filled.fillna(0)

    return train_filled

In [11]:
# Add the zero-sales rows to every training split.
x_train_list = [gen_zeros(frame) for frame in x_train_list]

Computing for month: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, Computing for month: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 

### Create Lag Values

In [12]:
#make this more efficient?
def create_lag_train(train_filled):
    """Add lagged copies of the ``*_back_0`` aggregates for 1..DATEBACK_DIST months.

    For every lag ``k`` the five ``*_back_0`` columns are copied, their
    ``date_block_num`` is shifted forward by ``k`` and they are merged back on
    ``(date_block_num, shop_id, item_id)`` as ``*_back_k``. Rows without a
    match ``k`` months earlier get 0.

    The input frame is downcast to 32-bit dtypes in place (see ``downcast``).

    Returns
    -------
    pandas.DataFrame
        Rows with ``date_block_num >= DATEBACK_DIST`` only (earlier months
        cannot have a full lag history), with a fresh 0..n-1 index so that the
        frame and a target series extracted from it and re-indexed later share
        the same index.
    """
    key_cols = ['date_block_num', 'shop_id', 'item_id']
    lag_prefixes = [
        'sum_item_sales_back_',
        'sum_shop_sales_back_',
        'item_cnt_month_back_',
        'sum_item_cat_sales_back_',
        'sum_item_cat_shop_sales_back_',
    ]
    current_cols = [prefix + '0' for prefix in lag_prefixes]

    lag_train_filled = downcast(train_filled)
    del train_filled
    print('dateback=', end=' ')
    gc.collect()

    # Iteratively copy the key + current-month columns, move the copy forward
    # in time, rename its value columns and merge it back in.
    for dateback in range(1, DATEBACK_DIST + 1):
        to_shift = lag_train_filled[key_cols + current_cols].copy()
        to_shift['date_block_num'] = to_shift.date_block_num + dateback
        to_shift.columns = key_cols + [prefix + str(dateback) for prefix in lag_prefixes]
        print(dateback, end=', ')
        lag_train_filled = lag_train_filled.merge(to_shift, on=key_cols, how='left').fillna(0)
        # The shifted copy is as long as the frame itself; release it right away.
        del to_shift
        gc.collect()

    # Remove the first DATEBACK_DIST months and renumber the remaining rows.
    keep = lag_train_filled.date_block_num >= DATEBACK_DIST
    lag_train_filled = lag_train_filled[keep].reset_index(drop=True)
    return lag_train_filled

In [13]:
# Build the lag features for every training split.
x_train_list = [create_lag_train(frame) for frame in x_train_list]

dateback= 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, dateback= 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 

In [14]:
# Sanity check: this should have 6425094 rows to mimic the course script
# (except that this notebook uses all of the shops).
x_train_list[1].loc[lambda frame: frame.date_block_num >= 12].shape

(6425094, 70)

In [15]:
def create_lag_test(tup):
    """Attach lag features to a test frame from its matching training frame.

    Parameters
    ----------
    tup : tuple
        ``(train, test)``. ``train`` must still carry the ``*_back_0`` columns;
        ``test`` must contain rows of exactly one month.

    Returns
    -------
    pandas.DataFrame
        ``test`` (downcast to 32-bit dtypes, in place) with, for every lag
        ``k`` in 1..DATEBACK_DIST, the five ``*_back_k`` columns taken from the
        ``*_back_0`` values of the same shop/item pair ``k`` months before the
        test month. Missing values are replaced by 0.

    Raises
    ------
    ValueError
        If ``test`` does not contain exactly one distinct ``date_block_num``.
    """
    train = tup[0]
    test = tup[1]
    test = downcast(test)

    # The lags are relative to a single reference month. Averaging the column
    # would hide a multi-month frame and yield a float key, so check instead.
    test_months = test.date_block_num.unique()
    if len(test_months) != 1:
        raise ValueError(
            'create_lag_test expects a test frame covering exactly one month, '
            'found %d distinct date_block_num values' % len(test_months))
    ref_date_block = int(test_months[0])

    pair_cols = ['shop_id', 'item_id']
    lag_prefixes = [
        'sum_item_sales_back_',
        'sum_shop_sales_back_',
        'item_cnt_month_back_',
        'sum_item_cat_sales_back_',
        'sum_item_cat_shop_sales_back_',
    ]
    current_cols = [prefix + '0' for prefix in lag_prefixes]

    print('Getting information from month:')
    for dateback in range(1, DATEBACK_DIST + 1):
        source_month = ref_date_block - dateback
        print(str(source_month), end=', ')
        # Every test row belongs to ref_date_block, so joining on the shop/item
        # pair alone is enough and keeps test.date_block_num's dtype untouched.
        hist_data = train.loc[train.date_block_num == source_month, pair_cols + current_cols].copy()
        hist_data.columns = pair_cols + [prefix + str(dateback) for prefix in lag_prefixes]
        test = test.merge(hist_data, how='left', on=pair_cols).fillna(0)
    return test

In [16]:
# Pair each test frame with its training frame and build the test-side lags.
x_test_list = [create_lag_test(pair) for pair in zip(x_train_list, x_test_list)]

Getting information from month:
32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, Getting information from month:
33.0, 32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 

# Check


In [17]:
def get_y_train(x_train):
    """Return an independent copy of the ``item_cnt_month`` column of ``x_train``."""
    target = x_train['item_cnt_month']
    return target.copy()

In [18]:
# Extract the target of every (lag-enriched, zero-filled) training split.
y_train_list = [get_y_train(frame) for frame in x_train_list]

In [19]:
def clear_train(train):
    """Return ``train`` without the target and the same-month ``*_back_0`` columns.

    The dropped columns are ``item_cnt_month`` and the five ``*_back_0``
    aggregates. The input frame is left unchanged; a ``KeyError`` is raised if
    any of the columns is missing.
    """
    current_month_cols = [
        'sum_item_sales_back_0',
        'sum_shop_sales_back_0',
        'item_cnt_month_back_0',
        'item_cnt_month',
        'sum_item_cat_sales_back_0',
        'sum_item_cat_shop_sales_back_0',
    ]
    return train.drop(current_month_cols, axis=1)

In [20]:
# Remove the target and the same-month aggregates from every training split.
x_train_list = [clear_train(frame) for frame in x_train_list]

### Realign y_train with x_train

In [21]:
def reset_train_indicies(series):
    """Return ``series`` with its index replaced by 0..n-1.

    Values, order and the series name are preserved. The old index is
    discarded directly, so this also works when the index carries a name.
    """
    return series.reset_index(drop=True)

In [22]:
# Give every target series a fresh 0..n-1 index.
y_train_list = [reset_train_indicies(target) for target in y_train_list]

### Save

In [23]:
# Persist the results for the downstream notebooks. Each file is written inside
# a `with` block so it is flushed and closed before the next step starts.
print('saving...')
with open('../gen_data/x_train--features1.ipynb--.pickle', 'wb') as fh:
    pickle.dump(x_train_list, fh)
print('saving...')
with open('../gen_data/x_test--features1.ipynb--.pickle', 'wb') as fh:
    pickle.dump(x_test_list, fh)
print('saving...')
with open('../gen_data/y_train--features1.ipynb--.pickle', 'wb') as fh:
    pickle.dump(y_train_list, fh)

saving...
saving...
saving...
