# Settings

In [1]:
!pip install category_encoders



# Load data

In [55]:
import pandas as pd
import numpy as np
import math
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder


In [56]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-10.0.1-cp38-cp38-win_amd64.whl (20.3 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-10.0.1


In [25]:
PREDICT_MONTH = 34

In [37]:
data = pd.read_csv('./competitive-data-science-predict-future-sales/transformed_data/work_df_after_eda.csv')

In [38]:
data 

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_month,item_price,shop_name,city,item_name,item_category_id,item_category_name,cat,sub_cat,ln_price,month
0,0,0,0,0,58.0,"Якутск Орджоникидзе, 56 фран",Якутск,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,Кино - DVD,Кино,DVD,4.110874,1
1,0,0,1,0,58.0,"Якутск Орджоникидзе, 56 фран",Якутск,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,Кино - DVD,Кино,DVD,4.110874,2
2,0,0,2,0,58.0,"Якутск Орджоникидзе, 56 фран",Якутск,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,Кино - DVD,Кино,DVD,4.110874,3
3,0,0,3,0,58.0,"Якутск Орджоникидзе, 56 фран",Якутск,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,Кино - DVD,Кино,DVD,4.110874,4
4,0,0,4,0,58.0,"Якутск Орджоникидзе, 56 фран",Якутск,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,Кино - DVD,Кино,DVD,4.110874,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45226795,22099,59,29,0,23.0,"Ярославль ТЦ ""Альтаир""",Ярославль,Элемент питания СТАРТ ZT 15A (LR6) - Блистер 2 шт,83,Элементы питания,Элементы питания,Элементы питания,3.258097,6
45226796,22099,59,30,0,23.0,"Ярославль ТЦ ""Альтаир""",Ярославль,Элемент питания СТАРТ ZT 15A (LR6) - Блистер 2 шт,83,Элементы питания,Элементы питания,Элементы питания,3.258097,7
45226797,22099,59,31,0,23.0,"Ярославль ТЦ ""Альтаир""",Ярославль,Элемент питания СТАРТ ZT 15A (LR6) - Блистер 2 шт,83,Элементы питания,Элементы питания,Элементы питания,3.258097,8
45226798,22099,59,32,0,23.0,"Ярославль ТЦ ""Альтаир""",Ярославль,Элемент питания СТАРТ ZT 15A (LR6) - Блистер 2 шт,83,Элементы питания,Элементы питания,Элементы питания,3.258097,9


# Rolling features

In [39]:
from category_encoders import TargetEncoder

In [40]:
class RollingFeatureCreator():
    def __init__(self, params):
        self.params = params

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        for agg_name, window, func in self.params:
            col_name = agg_name
            if agg_name.endswith('_id'):
                col_name = agg_name[:-3]
            func_name = func
            if func == 'mean':
                func_name = 'avg'
            col_name = f'rol_{func_name}_{col_name}{window}'
            sample = X.groupby([agg_name, 'date_block_num'])[['item_cnt_month']].sum().reset_index([0,1])
            sample[col_name] = sample.groupby(agg_name)['item_cnt_month'].shift(1)
            sample[col_name] = sample.groupby(agg_name)[col_name].rolling(window).agg({col_name: func}).reset_index(0, drop=True)
            sample[col_name] = sample[col_name].fillna(0)
            X = merge(X, 
                      sample[[agg_name, 'date_block_num', col_name]], 
                      on=[agg_name, 'date_block_num'], 
                      how = 'left',
                      check_left=True)
            sample = 0
            if func in ['mean', 'median', 'std']:
                X[col_name] = X[col_name].astype(np.float32)
            else:
                X[col_name] = X[col_name].astype(np.int32)
        return X

In [41]:
class LagFeatureCreator():
    def __init__(self, params):
        self.params = params

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        for window, target, fillna, new_type in self.params:
            col_name = f'lag_item_shop{window}'
            X[col_name] = X.groupby(['item_id', 'shop_id'])[target].shift(window).fillna(fillna)
            X[col_name] = X[col_name].astype(new_type)
        return X

In [42]:
class DifFeatureCreator():
    def __init__(self, params):
        self.params = params

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        for name, x, y in self.params:
            X[name] = X[x] - X[y]
        return X

In [43]:
class CatTargetEncoder():
    def __init__(self, months, params):
        self.months = months[1:]
        self.params = params

    def fit(self, X, y=None):
        return self

    def transform(self, data, y=None):
        for col, encoded_name, target in self.params:
            for month in self.months:
                print(month)
                encoder = TargetEncoder()
                encoder.fit(data[data.date_block_num < month][col].astype(str), data[data.date_block_num < month][target])
                res = encoder.transform(data[data.date_block_num == month][col].astype(str))
                data.loc[data.date_block_num == month, encoded_name] = res.iloc[:,0]
        return data

In [44]:
class CatLabelEncoder():
    def __init__(self, col_names):
        self.col_names = col_names

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        for col in self.col_names:
            encoder = LabelEncoder()
            X[col] = encoder.fit_transform(X[col])
            X[col] = X[col].astype(np.int8)
        return X



In [45]:
rol_lst1 = [ 
    ('city', 2, 'mean'),
    ('city', 6, 'mean'),
    ('sub_cat', 2, 'sum'),
    ('sub_cat', 4, 'mean'),
    ('sub_cat', 4, 'std'),
    ('sub_cat', 6, 'mean'),
    ('item_id', 4, 'median'),
    ('cat', 6, 'mean'),
    ('cat', 2, 'sum'),
    ('cat', 4, 'mean'),
    ('cat', 4, 'std'),
    ('shop_id', 2, 'mean'),
    ('shop_id', 4, 'mean'),
    ('shop_id', 6, 'median')
    ]
rol_lst2 = [
    ('item_id', 12, 'mean'),
    ('shop_id', 12, 'mean'),
    ('cat', 12, 'mean'),
    ('sub_cat', 12, 'mean')
]
rol_lst3 = [
    ('item_id', 2, 'mean'),
    ('item_id', 4, 'max'),
    ('item_id', 6, 'mean'),
    ('item_id', 6, 'std'),
]
rol_lst4 = [
    ('sub_cat', 5, 'min'),
    ('cat', 5, 'min'),
    ('city', 6, 'std'),
    ('shop_id', 5, 'min'),
    ('sub_cat', 5, 'median'),
    ('cat', 5, 'median'),
    ('cat', 1, 'sum'),
    ('sub_cat', 1, 'sum'),
    ('shop_id', 6, 'std'),
    ('item_id', 1, 'sum'),
    ('shop_id', 1, 'sum'),
    ('item_id', 8, 'mean'),
    ('shop_id', 8, 'mean'),
    ('cat', 8, 'mean'),
    ('sub_cat', 8, 'mean')
]
lag_lst = [
    (1, 'item_cnt_month', 0, np.int16),
    (2, 'item_cnt_month', 0, np.int16),
    (3, 'item_cnt_month', 0, np.int16), 
    (4, 'item_cnt_month', 0, np.int16), 
    (12, 'item_cnt_month', 0, np.int16)]
dif_lst = [
    ('dif2_1', 'lag_item_shop2', 'lag_item_shop1'),
    ('dif4_1', 'lag_item_shop4', 'lag_item_shop1'),
    ('dif12_1', 'lag_item_shop12', 'lag_item_shop1'),
    ('difitem6_2', 'rol_avg_item6', 'rol_avg_item2'),
]
cat_label_lst = ['cat', 'sub_cat', 'city']
cat_target_lst = [
   
]
cat_target_lst1 = [
    ('city', 'city_target', 'item_cnt_month'),
    ('cat', 'cat_target', 'item_cnt_month'),
    ('sub_cat', 'sub_cat_target', 'item_cnt_month'),
    ('shop_id', 'shop_target', 'item_cnt_month'),
    ('item_id', 'item_target', 'item_cnt_month'),
]
cat_target_lst2 = [
    (['city', 'cat'], 'city_cat_target', 'item_cnt_month'),
    (['city', 'sub_cat'], 'city_subcat_target', 'item_cnt_month'),
    (['item_id','shop_id'], 'item_shop_target', 'item_cnt_month'),
]

In [46]:
feature_creating_pipeline = Pipeline(steps=[
                                            ('lag',LagFeatureCreator(params=lag_lst)),
                                            ('rol', RollingFeatureCreator(params=rol_lst3)),
                                            ('dif',DifFeatureCreator(params=dif_lst))])
feature_creating_pipeline.fit(data)
data = feature_creating_pipeline.transform(data)

# Other fet

In [47]:
data['month'] = data['date_block_num'] % 12 + 1

In [48]:
temp = data.groupby(['date_block_num'])[['item_cnt_month']].sum().shift(1).fillna(0).reset_index([0])
temp.columns = ['date_block_num', 'lag_month1']
data = merge(data, temp, on='date_block_num', how='left', check_left=True)

In [49]:
temp = data[data.item_cnt_month > 0].groupby(['city', 'date_block_num'])[['shop_id']].nunique()
temp.columns = ['shop_cnt']
temp = temp.reset_index([0,1])

In [50]:
data = merge(data, temp, on=['city', 'date_block_num'], how='left', check_left=True, fillna=0)

In [51]:
data.lag_month1 = data.lag_month1.astype(np.int32)
data.shop_cnt = data.shop_cnt.astype(np.int8)

In [52]:
data.sample()

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_month,item_price,shop_name,city,item_name,item_category_id,item_category_name,...,rol_avg_item2,rol_max_item4,rol_avg_item6,rol_std_item6,dif2_1,dif4_1,dif12_1,difitem6_2,lag_month1,shop_cnt
7515993,17871,18,21,0,399.0,"Красноярск ТЦ ""Июнь""",Красноярск,РИО 2,40,Кино - DVD,...,152.5,344,108.166664,142.63858,1,0,0,-44.333336,99427,2


# Save to .parquet

In [57]:
test = data[data.date_block_num == PREDICT_MONTH]
test.to_parquet('./competitive-data-science-predict-future-sales/transformed_data/work_df_after_feature_engineering__test.parquet')


In [59]:
data[(data.date_block_num < PREDICT_MONTH)].to_parquet('./competitive-data-science-predict-future-sales/transformed_data/work_df_after_feature_engineering__train.parquet')

MemoryError: Unable to allocate 43.1 MiB for an array with shape (45226800,) and data type bool