In [4]:
import pandas as pd
import numpy as np
import pickle

In [5]:
# When False, the market price table is recomputed from the raw sales and
# written to CSV; when True, the previously written CSV is loaded instead.
SAVE_TIME = False

In [6]:
# Load the feature sets written by the previous notebook. Each pickle holds a
# list of objects that are later passed to `DataFrame.merge` (presumably one
# DataFrame per fold -- confirm against features2.ipynb).
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open('../gen_data/x_train--features2.ipynb--.pickle', 'rb') as train_file:
    x_train_list = pickle.load(train_file)
with open('../gen_data/x_test--features2.ipynb--.pickle', 'rb') as test_file:
    x_test_list = pickle.load(test_file)

# Raw sales records used to derive the price features below.
sales = pd.read_csv('../original_data/sales_train.csv.gz')

### Generate time specific market price data

In [7]:
if not SAVE_TIME:
    # Market-wide (all shops pooled) price statistics per item, lagged by one
    # month: a row labelled `date_block_num == m` describes sales of month m-1.
    to_df = []
    print('Generating variables related to month: ')
    # Walk over every month that has sales and label its statistics with the
    # FOLLOWING month. This also emits a row set for `max(date_block_num) + 1`,
    # so the last month of sales is available as "previous month" data, in
    # line with how the per-shop price table shifts `date_block_num` by +1.
    for prev_month, dfm in sales.groupby('date_block_num'):
        month = prev_month + 1
        print(month, end=', ')
        # sort=False keeps items in order of first appearance within the month.
        for item, dfmi in dfm.groupby('item_id', sort=False):
            # Units sold; can be <= 0 when returns (negative counts) dominate.
            count = dfmi.item_cnt_day.sum()

            if count > 0:
                # Quantity-weighted mean unit price. Each row's price applies
                # to `item_cnt_day` units, so it has to be weighted by that
                # count before dividing by the total number of units.
                meanprice = (dfmi.item_price * dfmi.item_cnt_day).sum() / count
            else:
                meanprice = np.nan

            if count > 1:
                # Quantity-weighted sample standard deviation around the mean.
                ss = (dfmi.item_cnt_day * (dfmi.item_price - meanprice) ** 2).sum()
                # Negative counts can make the weighted sum of squares
                # negative; report that as missing rather than sqrt(<0).
                sdprice = np.sqrt(ss / (count - 1)) if ss >= 0 else np.nan
            else:
                sdprice = np.nan
            to_df.append([month, item, meanprice, sdprice])

    # dtype=float keeps every column numeric (keys are cast to int later) and,
    # unlike np.vstack, also copes with an empty `to_df`.
    price_data = pd.DataFrame(
        to_df,
        columns=['date_block_num', 'item_id', 'mean_prevmonth_item_price', 'sd_prevmonth_item_price'],
        dtype=np.float64,
    )

Generating variables related to month: 
0, 1, 2, 3, 4, 5, 



6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 

### TODO: add median price (it would be an approximation, because some rows record more than one sale); for now, preview the mean/sd price data

In [8]:
# Preview the first five rows of the market price table.
price_data.head(n=5)

Unnamed: 0,date_block_num,item_id,mean_prevmonth_item_price,sd_prevmonth_item_price
0,1.0,22154.0,999.0,0.0
1,1.0,2552.0,,
2,1.0,2554.0,1709.05,
3,1.0,2555.0,1098.85,0.212132
4,1.0,2564.0,327.6,47.851855


In [6]:
# Cache hand-off for the market price table: a fresh computation is written
# to disk, otherwise the table is restored from the earlier CSV.
if not SAVE_TIME:
    price_data.to_csv('../gen_data/price_data_for_features3.csv', index=False)
else:
    price_data = pd.read_csv('../gen_data/price_data_for_features3.csv')

In [7]:
# The key columns are displayed as floats above; cast them to int32.
for key_col in ('date_block_num', 'item_id'):
    price_data[key_col] = price_data[key_col].astype(np.int32)
price_data.head()

Unnamed: 0,date_block_num,item_id,mean_prevmonth_item_price,sd_prevmonth_item_price
0,1,22154,999.0,0.0
1,1,2552,,
2,1,2554,1709.05,
3,1,2555,1098.85,0.212132
4,1,2564,327.6,47.851855


### Generate shop-time price data

In [8]:
# Per-shop price feature. The mean is taken over rows, not over individual
# transactions (a single row could include many transactions); that
# approximation is accepted here.
shop_item_keys = ['date_block_num', 'shop_id', 'item_id']
shop_item_data = (
    sales.groupby(shop_item_keys, as_index=False)['item_price']
    .mean()
    .rename(columns={'item_price': 'mean_prevmonth_shop_item_price'})
)
# Shift by one month so each row describes the month before its label.
shop_item_data['date_block_num'] = shop_item_data['date_block_num'] + 1
shop_item_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,mean_prevmonth_shop_item_price
0,1,0,32,221.0
1,1,0,33,347.0
2,1,0,35,247.0
3,1,0,43,221.0
4,1,0,51,128.5


In [9]:
def merge_price_data(df):
    """Left-join the market and per-shop price tables onto ``df``.

    No join keys are passed, so each merge uses whatever column names ``df``
    shares with the module-level ``price_data`` and ``shop_item_data`` frames.
    Rows of ``df`` without a match keep NaN in the added columns.
    """
    with_market_prices = df.merge(price_data, how='left')
    with_shop_prices = with_market_prices.merge(shop_item_data, how='left')
    return with_shop_prices

In [10]:
# Attach the price features to every frame of the train and test lists.
new_x_train_list = [merge_price_data(frame) for frame in x_train_list]
new_x_test_list = [merge_price_data(frame) for frame in x_test_list]

In [11]:
def insert_sd_above_market(df):
    """Add ``sd_above_market``: the shop price's distance from the market mean,
    measured in market standard deviations.

    The column is computed as
    ``(mean_prevmonth_shop_item_price - mean_prevmonth_item_price)
    / sd_prevmonth_item_price``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three columns named above. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``sd_above_market`` column added.
        The value is NaN wherever an input is missing or the standard
        deviation is zero.
    """
    # A zero standard deviation would turn the ratio into +/-inf, which a
    # later fillna() cannot remove; treat it as "not available" instead.
    market_sd = df.sd_prevmonth_item_price.where(df.sd_prevmonth_item_price != 0)
    price_gap = df.mean_prevmonth_shop_item_price - df.mean_prevmonth_item_price
    df['sd_above_market'] = price_gap / market_sd
    return df

In [12]:
def cleanup(df):
    """Fill the missing values of the two sd-based columns of ``df`` in place.

    ``sd_prevmonth_item_price`` gaps are replaced by the NaN-ignoring mean of
    that column, and ``sd_above_market`` gaps are replaced by 0. The same
    ``df`` object is returned.
    """
    sd_column = df['sd_prevmonth_item_price']
    fallback_sd = np.nanmean(sd_column)
    df['sd_prevmonth_item_price'] = sd_column.fillna(fallback_sd)
    df['sd_above_market'] = df['sd_above_market'].fillna(0)
    return df

In [13]:
# Add the sd_above_market column to every train and test frame.
new_x_train_list = [insert_sd_above_market(frame) for frame in new_x_train_list]
new_x_test_list = [insert_sd_above_market(frame) for frame in new_x_test_list]

In [14]:
# Fill the remaining gaps in the sd-based columns of every frame.
new_x_train_list = [cleanup(frame) for frame in new_x_train_list]
new_x_test_list = [cleanup(frame) for frame in new_x_test_list]

  


### Save

In [15]:
# Persist the enriched feature lists. The context managers make sure each
# file is flushed and closed as soon as its dump has finished.
with open('../gen_data/x_train--features3.ipynb--.pickle', 'wb') as train_file:
    pickle.dump(new_x_train_list, train_file)
with open('../gen_data/x_test--features3.ipynb--.pickle', 'wb') as test_file:
    pickle.dump(new_x_test_list, test_file)