In [4]:
def add_features(train, test):
    # Making flag is it is the first(release) month for (shop_id, item_id)
    first_month = train.groupby(['shop_id', 'item_id'])['date_block_num'].min()

    train['new_item'] = train.apply(lambda x: x['date_block_num'] == first_month[(x['shop_id'], x['item_id'])], axis='columns')
    test['new_item'] = test.apply(lambda x: not ((x['shop_id'], x['item_id']) in first_month.index), axis='columns')
    
    

    # Add prev month sales for (shop_id, item_id)
    shop_item_db = train.groupby(['shop_id', 'item_id', 'date_block_num'])['item_cnt_day'].mean()

    def make_prev_month_sales(x):
        if (x['shop_id'], x['item_id'], x['date_block_num'] - 1) in shop_item_db.index:
            return shop_item_db[(x['shop_id'], x['item_id'], x['date_block_num'] - 1)]
        else:
            return 0

    train['prev_month_sales'] = train.apply(make_prev_month_sales, axis='columns')
    test['prev_month_sales'] = test.apply(make_prev_month_sales, axis='columns')
    
    
    # Add item category id
    items = pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')
    train['cat_id'] = train['item_id'].map(lambda x: items.loc[x, 'item_category_id'])
    test['cat_id'] = test['item_id'].map(lambda x: items.loc[x, 'item_category_id'])


    # Shop categories by location
    def make_shop_loc(x):
        # Yakutsk
        if x in [0, 1, 57, 58]: 
            return 'Yakutsk 4'
        # Moscow area
        elif x in [3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 54]:
            return 'Moscow 16'
        # Voronej
        elif x in [6, 7, 8]:
            return 'Voronej 3'
        # Online
        elif x in [9, 12, 55]:
            return 'Online 3'
        # Jukovski
        elif x in [10, 11]:
            return 'Jukovski 2'
        # Kazan
        elif x in [13, 14]:
            return 'Kazan 2'
        # Krasnoyarsk
        elif x in [17, 18]:
            return 'Krasnoyarsk 2'
        # NNovgorod
        elif x in [35, 36]:
            return 'NNovgorod 2'
        # Novosib
        elif x in [36, 37]:
            return 'Novosib 2'
        # Rostov
        elif x in [39, 40, 41]:
            return 'Rostov 2'
        # Spb
        elif x in [42, 43]:
            return 'Spb 2'
        # Samara
        elif x in [44, 45]:
            return 'Samara 2'
        # Tumen
        elif x in [49, 50, 51]:
            return 'Tumen 2'
        # Ufa
        elif x in [52, 53]:
            return 'Ufa 2'
        else:
            return 'no_group'

    train['shop_loc'] = train['shop_id'].map(make_shop_loc)
    test['shop_loc'] = test['shop_id'].map(make_shop_loc)
    
    
    # Adding flag if shop could not be meaningfully grouped
    train['no_loc_group'] = (train['shop_loc'] == 'no_group')
    test['no_loc_group'] = (test['shop_loc'] == 'no_group')
    
    
    # adding seasons
    def make_season(x):
        if x in [6, 7, 8]:
            return 'summer'
        elif x in [9, 10, 11]:
            return 'autumn'
        elif x in [3, 4, 5]:
            return 'spring'
        else:
            return 'winter'

    train['season'] = train['month'].map(make_season)
    test['season'] = test['month'].map(make_season)
    
    
    # seasonal sales for (shop_id, item_id)
    seasonal_sales = train.groupby(['shop_id', 'item_id', 'season'])['item_cnt_day'].sum()

    def make_seasonal_sales(x):
        if (x['shop_id'], x['item_id'], x['season']) in seasonal_sales.index:
            return seasonal_sales[(x['shop_id'], x['item_id'], x['season'])] / 9
        else:
            return 0

    train['seasonal_sales_shop_id'] = train.apply(make_seasonal_sales, axis='columns')
    test['seasonal_sales_shop_id'] = test.apply(make_seasonal_sales, axis='columns')
    
    
    # add prev month sales for (shop_loc, item_id)
    loc_item_db = train.groupby(['shop_loc', 'item_id', 'date_block_num'])['item_cnt_day'].mean()

    def make_prev_month_sales_loc(x):
        if x['shop_loc'] == 'no_group':
            return x['prev_month_sales']
        elif (x['shop_loc'], x['item_id'], x['date_block_num'] - 1) in loc_item_db.index:
            num = x['shop_loc'].split()
            num = int(num[1])
            return loc_item_db[(x['shop_loc'], x['item_id'], x['date_block_num'] - 1)] / num
        else:
            return 0

    train['prev_month_sales_loc'] = train.apply(make_prev_month_sales_loc, axis='columns')
    test['prev_month_sales_loc'] = test.apply(make_prev_month_sales_loc, axis='columns')
    
    
    # add seasonaladd_features() sales for (shop_loc, item_id)
    loc_item_seasonal = train.groupby(['shop_loc', 'item_id', 'season'])['item_cnt_day'].sum()

    def make_loc_item_seasonal(x):
        if x['shop_loc'] == 'no_group':
            return x['seasonal_sales_shop_id']
        if (x['shop_loc'], x['item_id'], x['season']) in loc_item_seasonal.index:
            num = x['shop_loc'].split()
            num = int(num[1])
            return loc_item_seasonal[(x['shop_loc'], x['item_id'], x['season'])] / num
        else:
            return 0

    train['loc_item_seasonal'] = train.apply(make_prev_month_sales, axis='columns')
    test['loc_item_seasonal'] = test.apply(make_prev_month_sales, axis='columns')
    
    # add pair (shop_id, item_id)
    train['shop_id_item_id'] = train['shop_id'].astype(str) + '->' + train['item_id'].astype(str)
    test['shop_id_item_id'] = test['shop_id'].astype(str) + '->' + test['item_id'].astype(str)

In [5]:
%%time
add_features(train, test)

CPU times: user 11min 57s, sys: 4.35 s, total: 12min 2s
Wall time: 12min 2s


In [12]:
train.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_day,month,year,new_item,prev_month_sales,cat_id,shop_loc,no_loc_group,season,seasonal_sales_shop_id,prev_month_sales_loc,loc_item_seasonal,shop_id_item_id
0,0,30,1,20.0,2,2013,True,0.0,40,Yakutsk 4,False,winter,2.222222,0.0,0.0,0->30
1,0,31,1,11.0,2,2013,True,0.0,37,Yakutsk 4,False,winter,1.222222,0.0,0.0,0->31
3,0,32,1,10.0,2,2013,True,0.0,40,Yakutsk 4,False,winter,1.111111,0.0,0.0,0->32
5,0,33,1,3.0,2,2013,True,0.0,37,Yakutsk 4,False,winter,0.333333,0.0,0.0,0->33
7,0,35,1,14.0,2,2013,True,0.0,40,Yakutsk 4,False,winter,1.555556,0.0,0.0,0->35


In [7]:
test.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,month,year,new_item,prev_month_sales,cat_id,shop_loc,no_loc_group,season,seasonal_sales_shop_id,prev_month_sales_loc,loc_item_seasonal
0,0,5,5037,34,11,2015,False,0.0,19,no_group,True,autumn,0.333333,0.0,0.0
1,1,5,5320,34,11,2015,True,0.0,55,no_group,True,autumn,0.0,0.0,0.0
2,2,5,5233,34,11,2015,False,1.0,19,no_group,True,autumn,0.444444,1.0,1.0
3,3,5,5232,34,11,2015,False,0.0,23,no_group,True,autumn,0.0,0.0,0.0
4,4,5,5268,34,11,2015,True,0.0,20,no_group,True,autumn,0.0,0.0,0.0
