In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Load the feature frames saved by the previous feature-engineering step
# (presumably features2.ipynb, going by the file names).
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles
# that this project generated itself.
with open('../gen_data/x_train--features2.ipynb--.pickle','rb') as train_file:
    x_train_list = pickle.load(train_file)
with open('../gen_data/x_test--features2.ipynb--.pickle','rb') as test_file:
    x_test_list = pickle.load(test_file)

# Sales table; the cells below use its date_block_num, shop_id, item_id and
# item_price columns.
sales = pd.read_csv('../original_data/sales_train.csv.gz')

# TODO: downcast dtypes here (or, better, just import an already-downcasted copy of the sales data)

### Generate time-specific market price data

In [3]:
# Group item prices once per (month, item) and derive each statistic from the
# same grouped selection.
item_prices = sales.groupby(['date_block_num','item_id']).item_price
mean_price = item_prices.mean()
std_price = item_prices.std(ddof=1)
median_price = item_prices.median()

# Put the three statistics side by side, naming the columns as they are
# concatenated, and turn the (month, item) index back into ordinary columns.
price_data = pd.concat(
    [mean_price,std_price,median_price],
    axis=1,
    keys=['mean_prevmonth_item_price','std_prevmonth_item_price','median_prevmonth_item_price'],
).reset_index()

# Shift every row one month forward so that a merge on date_block_num attaches
# the statistics of the preceding month.
price_data['date_block_num'] = price_data['date_block_num'] + 1

### Generate shop-time price data

In [4]:
# The mean is taken per row of `sales` rather than per transaction (a single
# row could include many transactions); that approximation is accepted here.
shop_item_data = (
    sales
    .groupby(['date_block_num','shop_id','item_id'],as_index=False)
    .item_price
    .mean()
    .rename(columns={'item_price':'mean_prevmonth_shop_item_price'})
)
# Shift one month forward so a merge on date_block_num sees the preceding
# month's shop-level price.
shop_item_data['date_block_num'] = shop_item_data['date_block_num'] + 1
shop_item_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,mean_prevmonth_shop_item_price
0,1,0,32,221.0
1,1,0,33,347.0
2,1,0,35,247.0
3,1,0,43,221.0
4,1,0,51,128.5


In [5]:
def merge_price_data(df, market_prices=None, shop_prices=None):
    """Left-join the previous-month price features onto a feature frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``date_block_num``, ``shop_id`` and ``item_id`` columns.
    market_prices : pandas.DataFrame, optional
        Table keyed by (``date_block_num``, ``item_id``). Defaults to the
        notebook-level ``price_data``.
    shop_prices : pandas.DataFrame, optional
        Table keyed by (``date_block_num``, ``shop_id``, ``item_id``).
        Defaults to the notebook-level ``shop_item_data``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows as ``df`` (in the same order) plus the
        non-key columns of both tables; rows without a match get NaN.

    Raises
    ------
    pandas.errors.MergeError
        If either price table has duplicate keys, which would otherwise
        silently duplicate rows of ``df``.
    """
    if market_prices is None:
        market_prices = price_data
    if shop_prices is None:
        shop_prices = shop_item_data

    # Name the join keys explicitly: without `on`, merge joins on every column
    # name the two frames happen to share, so an unrelated overlapping column
    # would silently change the join.
    with_market = df.merge(
        market_prices,
        how='left',
        on=['date_block_num','item_id'],
        validate='many_to_one',
    )
    return with_market.merge(
        shop_prices,
        how='left',
        on=['date_block_num','shop_id','item_id'],
        validate='many_to_one',
    )

In [6]:
# Attach the price features to every frame in both collections.
new_x_train_list = [merge_price_data(frame) for frame in x_train_list]
new_x_test_list = [merge_price_data(frame) for frame in x_test_list]

In [7]:
def insert_sd_above_market(df):
    """Add ``std_above_market``: the shop price's z-score against the market.

    The new column is
    ``(mean_prevmonth_shop_item_price - mean_prevmonth_item_price)
    / std_prevmonth_item_price``.

    Rows whose market std is zero or NaN get NaN rather than +/-inf, so a
    later ``fillna`` can treat "no spread information" uniformly.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three price columns named above.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    price_gap = df['mean_prevmonth_shop_item_price'] - df['mean_prevmonth_item_price']
    market_std = df['std_prevmonth_item_price']
    # A zero std would turn any non-zero gap (even float round-off) into
    # +/-inf, which fillna() does not catch; mask it to NaN before dividing.
    df['std_above_market'] = price_gap / market_std.mask(market_std == 0)
    return df

In [8]:
def cleanup(df):
    """Fill the gaps in the two std-based columns and return the frame.

    * ``std_prevmonth_item_price``: missing values become the NaN-ignoring
      mean of that column within ``df``.
    * ``std_above_market``: missing values become 0.

    ``df`` is modified in place and the same object is returned.
    """
    market_std = df.std_prevmonth_item_price
    df['std_prevmonth_item_price'] = market_std.fillna(np.nanmean(market_std))

    z_score = df.std_above_market
    df['std_above_market'] = z_score.fillna(0)
    return df

In [9]:
# Add the std_above_market column to every train and test frame.
new_x_train_list = [insert_sd_above_market(frame) for frame in new_x_train_list]
new_x_test_list = [insert_sd_above_market(frame) for frame in new_x_test_list]

In [10]:
# Fill the remaining NaNs in the std-based columns of every frame.
new_x_train_list = [cleanup(frame) for frame in new_x_train_list]
new_x_test_list = [cleanup(frame) for frame in new_x_test_list]

In [12]:
# Spot-check the merged features on one training frame; show only the first
# rows instead of rendering the whole DataFrame.
new_x_train_list[1].head(10)

Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,sum_item_sales_back_1,sum_shop_sales_back_1,item_cnt_month_back_1,sum_item_cat_sales_back_1,sum_item_cat_shop_sales_back_1,sum_item_sales_back_2,...,item_cnt_month_back_12,sum_item_cat_sales_back_12,sum_item_cat_shop_sales_back_12,month,numdays,mean_prevmonth_item_price,std_prevmonth_item_price,median_prevmonth_item_price,mean_prevmonth_shop_item_price,std_above_market
0,12,2,32,40,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,jan,31,147.994444,8.253484,149.00,,0.000000
1,12,2,33,37,42.0,1255.0,1.0,9906.0,55.0,15.0,...,1.0,5982.0,52.0,jan,31,195.900000,13.684729,199.00,199.000,0.226530
2,12,2,99,37,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,jan,31,124.167083,25.367019,124.17,,0.000000
3,12,2,482,73,88.0,1255.0,1.0,412.0,3.0,85.0,...,1.0,664.0,7.0,jan,31,3233.461538,237.704536,3300.00,3300.000,0.279921
4,12,2,485,73,111.0,1255.0,1.0,412.0,3.0,0.0,...,0.0,0.0,0.0,jan,31,298.222222,17.688666,300.00,300.000,0.100504
5,12,2,804,49,26.0,1255.0,1.0,842.0,7.0,0.0,...,1.0,915.0,8.0,jan,31,236.200000,19.000000,240.00,240.000,0.200000
6,12,2,839,73,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,664.0,7.0,jan,31,3250.256410,310.648618,3300.00,,0.000000
7,12,2,1007,67,247.0,1255.0,1.0,3576.0,10.0,0.0,...,3.0,2202.0,28.0,jan,31,593.667277,32.302652,599.00,599.000,0.165086
8,12,2,1406,28,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,jan,31,935.535514,159.340390,999.00,,0.000000
9,12,2,1407,19,466.0,1255.0,13.0,14490.0,278.0,6.0,...,0.0,0.0,0.0,jan,31,1254.957967,142.609768,1299.00,1299.000,0.308829


### Save

In [11]:
# Write through context managers so each file is flushed and closed before
# anything else reads the pickles.
with open('../gen_data/x_train--features3.ipynb--.pickle','wb') as train_file:
    pickle.dump(new_x_train_list,train_file)
with open('../gen_data/x_test--features3.ipynb--.pickle','wb') as test_file:
    pickle.dump(new_x_test_list,test_file)