In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tqdm
sns.set()

In [2]:
def change_type(df, col, type_):
    df[col] = df[col].astype(type_)
    return df

In [3]:
train_data = pd.read_csv('train_data.csv')

In [4]:
train_data.shape

(27956445, 17)

In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27956445 entries, 0 to 27956444
Data columns (total 17 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   ID                          object 
 1   geoCluster                  int64  
 2   SKU                         int64  
 3   date                        object 
 4   sales                       float64
 5   month                       int64  
 6   day                         int64  
 7   weekday                     int64  
 8   week_num                    int64  
 9   lagerUnitType_caption       object 
 10  commodity_group             int64  
 11  productCategory_caption_RU  object 
 12  cityId                      int64  
 13  price                       float64
 14  lagerUnitQuantity           float64
 15  lagerUnitTypeId             int64  
 16  revanue                     float64
dtypes: float64(4), int64(9), object(4)
memory usage: 3.5+ GB


In [6]:
train_data.commodity_group.max()

198

In [7]:
train_data.cityId.max()

25

In [8]:
train_data.lagerUnitTypeId.max()

6

In [9]:
train_data.SKU.max()

873803

In [10]:
train_data.geoCluster.max()

3230

In [11]:
train_data = change_type(train_data, 'geoCluster', 'uint16')
train_data = change_type(train_data, 'SKU', 'uint32')
train_data = change_type(train_data, 'commodity_group', 'uint8')
train_data = change_type(train_data, 'cityId', 'uint8')
train_data = change_type(train_data, 'lagerUnitTypeId', 'uint8')

In [12]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27956445 entries, 0 to 27956444
Data columns (total 17 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   ID                          object 
 1   geoCluster                  uint16 
 2   SKU                         uint32 
 3   date                        object 
 4   sales                       float64
 5   month                       int64  
 6   day                         int64  
 7   weekday                     int64  
 8   week_num                    int64  
 9   lagerUnitType_caption       object 
 10  commodity_group             uint8  
 11  productCategory_caption_RU  object 
 12  cityId                      uint8  
 13  price                       float64
 14  lagerUnitQuantity           float64
 15  lagerUnitTypeId             uint8  
 16  revanue                     float64
dtypes: float64(4), int64(4), object(4), uint16(1), uint32(1), uint8(3)
memory usage: 2.7+ GB


In [13]:
sku_meta = pd.read_csv('../sku_final.csv')

In [14]:
geo_params = pd.read_csv('../geo_params.csv')

In [15]:
test_data = pd.read_csv('../test_data.csv')

In [16]:
test_data.head()

Unnamed: 0,ID,geoCluster,SKU,date,price
0,RR1666030,21,32485,2021-07-20,66.69
1,RR1666031,21,32485,2021-07-21,66.69
2,RR1666032,21,32485,2021-07-22,66.69
3,RR1666033,21,32485,2021-07-23,66.69
4,RR1666034,21,32485,2021-07-24,66.69


In [17]:
train_data = train_data.sort_values(by=['week_num'])

In [18]:
train_data['week_num'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53])

In [19]:
train_data[train_data['week_num'].isin([51,53])].sales.mean()

0.24557248818772717

In [20]:
test_data['geoCluster'].value_counts()

2043    12950
2022    12558
2049    12516
2735    11984
1935    11648
        ...  
2953       14
607        14
3095       14
149        14
287        14
Name: geoCluster, Length: 515, dtype: int64

In [21]:
train_data = train_data.sort_values(by=['geoCluster', 'SKU','date'])

In [22]:
test_data = test_data.merge(sku_meta, on='SKU', how='inner')
test_data = test_data.merge(geo_params, on=['geoCluster'], how='inner')

In [23]:
test_data.shape

(1666028, 25)

In [24]:
test_data.head()

Unnamed: 0,ID,geoCluster,SKU,date,price,productCategoryId,productCategory_caption_UKR,productCategory_caption_RU,productCategory_caption_ENG,productTypeId,...,lagerUnitTypeId,lagerUnitType_caption,trademark,countryOfOrigin,countryOfOrigin_caption,commodity_group,commodity_group_caption_UKR,commodity_group_caption_RU,commodity_group_caption_ENG,cityId
0,RR1666030,21,32485,2021-07-20,66.69,5381.0,Банан,Банан,Banana,4752.0,...,1,г,,,,5551018,Фрукти тропічні,Фрукты тропические,Tropical fruits,1
1,RR1666031,21,32485,2021-07-21,66.69,5381.0,Банан,Банан,Banana,4752.0,...,1,г,,,,5551018,Фрукти тропічні,Фрукты тропические,Tropical fruits,1
2,RR1666032,21,32485,2021-07-22,66.69,5381.0,Банан,Банан,Banana,4752.0,...,1,г,,,,5551018,Фрукти тропічні,Фрукты тропические,Tropical fruits,1
3,RR1666033,21,32485,2021-07-23,66.69,5381.0,Банан,Банан,Banana,4752.0,...,1,г,,,,5551018,Фрукти тропічні,Фрукты тропические,Tropical fruits,1
4,RR1666034,21,32485,2021-07-24,66.69,5381.0,Банан,Банан,Banana,4752.0,...,1,г,,,,5551018,Фрукти тропічні,Фрукты тропические,Tropical fruits,1


In [25]:
test_cols, train_cols = test_data.columns, train_data.columns
cols_to_use = set(train_cols).intersection(test_cols)

In [26]:
test_data = test_data[cols_to_use]

In [27]:
test_data.head()

Unnamed: 0,SKU,geoCluster,ID,productCategory_caption_RU,lagerUnitTypeId,commodity_group,price,lagerUnitQuantity,lagerUnitType_caption,date,cityId
0,32485,21,RR1666030,Банан,1,5551018,66.69,1.0,г,2021-07-20,1
1,32485,21,RR1666031,Банан,1,5551018,66.69,1.0,г,2021-07-21,1
2,32485,21,RR1666032,Банан,1,5551018,66.69,1.0,г,2021-07-22,1
3,32485,21,RR1666033,Банан,1,5551018,66.69,1.0,г,2021-07-23,1
4,32485,21,RR1666034,Банан,1,5551018,66.69,1.0,г,2021-07-24,1


In [28]:
full_data = pd.concat([train_data, test_data],axis=0)

In [29]:
full_data.shape

(29622473, 17)

In [30]:
import gc

In [31]:
dates_test = test_data['date'].unique()

In [32]:
full_data[full_data['date'].isin(dates_test)].shape

(1666028, 17)

In [33]:
test_data.shape

(1666028, 11)

In [34]:
full_data.tail()

Unnamed: 0,ID,geoCluster,SKU,date,sales,month,day,weekday,week_num,lagerUnitType_caption,commodity_group,productCategory_caption_RU,cityId,price,lagerUnitQuantity,lagerUnitTypeId,revanue
1666023,RR3305649,3095,642700,2021-07-29,,,,,,кг,5550352,Сыры полутвердые крупнопористые,0,113.59,1.0,2,
1666024,RR3305650,3095,642700,2021-07-30,,,,,,кг,5550352,Сыры полутвердые крупнопористые,0,113.59,1.0,2,
1666025,RR3305651,3095,642700,2021-07-31,,,,,,кг,5550352,Сыры полутвердые крупнопористые,0,113.59,1.0,2,
1666026,RR3305652,3095,642700,2021-08-01,,,,,,кг,5550352,Сыры полутвердые крупнопористые,0,113.59,1.0,2,
1666027,RR3305653,3095,642700,2021-08-02,,,,,,кг,5550352,Сыры полутвердые крупнопористые,0,113.59,1.0,2,


### cleaning memory

In [35]:
full_data = change_type(full_data, 'geoCluster', 'uint16')
full_data = change_type(full_data, 'SKU', 'uint32')
full_data = change_type(full_data, 'commodity_group', 'uint8')
full_data = change_type(full_data, 'cityId', 'uint8')
full_data = change_type(full_data, 'lagerUnitTypeId', 'uint8')

In [36]:
del train_data, sku_meta, geo_params;
gc.collect();

## creation of features

In [37]:
full_data.drop(columns=['lagerUnitType_caption'],inplace=True)

### days on the market 

In [38]:
full_data['date'] = pd.to_datetime(full_data['date'])

In [39]:
unique_SKU_list = full_data.SKU.unique()
for sku in tqdm.tqdm(unique_SKU_list):
    min_date = full_data[full_data['SKU'] == sku]['date'].min()
    #print(min_date)
    full_data.loc[full_data['SKU'] == sku , 'days_on_the_market'] = full_data.loc[full_data['SKU'] == sku,'date'] - min_date
    #print(train_data[train_data['SKU'] == sku]['date'] - min_date)

100%|██████████| 1961/1961 [05:39<00:00,  5.77it/s]


In [40]:
full_data['days_on_the_market']  = full_data['days_on_the_market'].dt.days

In [41]:
full_data['days_on_the_market'].mean()

271.0507787786658

In [42]:
# groups = full_data[['date','geoCluster','SKU']].groupby(['geoCluster','SKU'], as_index=False)

In [43]:
# days_on_market_geo = pd.DataFrame()
# for key in tqdm.tqdm(groups.groups.keys()):
#     group = groups.get_group(key)
#     group['date'] = group['date']-group['date'].min()
#     days_on_market_geo = days_on_market_geo.append(group)

### max sales, min sales, max ravenue, min ravenue

In [44]:
def agg_feature(df, dates_to_exclude, to_groupby, to_agg, func='min'):
    if not isinstance(to_groupby, list):
        to_groupby = [to_groupby]
    result = df[~df['date'].isin(dates_to_exclude)].groupby(to_groupby,as_index=False).agg({to_agg:func})
    result = result.rename(columns={to_agg:to_agg+'_grouped_{}_{}'.format('_'.join(to_groupby),func)})
    df = df.merge(result, on=to_groupby, how='left')
    return df

In [None]:
full_data['date'] = pd.to_datetime(full_data['date'])
full_data['weekday'] = full_data['date'].dt.weekday
full_data['day'] = full_data['date'].dt.day
full_data['year'] = full_data['date'].dt.year
full_data['month'] = full_data['date'].dt.month
max_week = full_data[full_data['year']==2020]['week_num'].max()
full_data['week_num'] = full_data['date'].dt.week
full_data.loc[full_data['year']==2021, 'week_num'] = full_data.loc[full_data['year']==2021, 'week_num'].apply(lambda x: x+max_week)
full_data = change_type(full_data, 'month', 'uint8')
full_data = change_type(full_data, 'day', 'uint8')
full_data = change_type(full_data, 'weekday', 'uint8')
full_data = change_type(full_data, 'week_num', 'uint8')

In [None]:
full_data = agg_feature(full_data, dates_test, ['week_num','geoCluster'], 'sales', 'mean')
full_data = agg_feature(full_data, dates_test, ['week_num','geoCluster'], 'sales', 'sum')

In [None]:
full_data[full_data['date'].isin(dates_test)].shape

In [None]:
full_data = agg_feature(full_data, dates_test, ['week_num','cityId'], 'sales', 'mean')
full_data = agg_feature(full_data, dates_test, ['week_num','commodity_group'], 'sales', 'mean')

In [None]:
full_data = agg_feature(full_data, dates_test, ['week_num','geoCluster'], 'revanue', 'sum')
full_data = agg_feature(full_data, dates_test, ['week_num','geoCluster'], 'revanue', 'mean')
full_data = agg_feature(full_data, dates_test, ['week_num','cityId'], 'revanue', 'mean')
full_data = agg_feature(full_data, dates_test, ['week_num','commodity_group'], 'revanue', 'mean')

In [None]:
full_data = agg_feature(full_data, dates_test, ['week_num','SKU'], 'revanue', 'mean')
full_data = agg_feature(full_data, dates_test, ['week_num','SKU'], 'sales', 'mean')
full_data = agg_feature(full_data, dates_test, ['week_num','SKU'], 'revanue', 'sum')
full_data = agg_feature(full_data, dates_test, ['week_num','SKU'], 'sales', 'sum')

In [None]:
full_data = agg_feature(full_data, dates_test, ['week_num','SKU','commodity_group'], 'revanue', 'mean')
full_data = agg_feature(full_data, dates_test, ['week_num','SKU','commodity_group'], 'sales', 'mean')
full_data = agg_feature(full_data, dates_test, ['week_num','SKU','commodity_group'], 'revanue', 'sum')
full_data = agg_feature(full_data, dates_test, ['week_num','SKU','commodity_group'], 'sales', 'sum')

In [None]:
full_data = agg_feature(full_data, dates_test, ['week_num','SKU','geoCluster'], 'revanue', 'mean')
full_data = agg_feature(full_data, dates_test, ['week_num','SKU','geoCluster'], 'sales', 'mean')
full_data = agg_feature(full_data, dates_test, ['week_num','SKU','geoCluster'], 'revanue', 'sum')
full_data = agg_feature(full_data, dates_test, ['week_num','SKU','geoCluster'], 'sales', 'sum')

In [None]:
full_data.week_num.unique()

In [None]:
full_data[~full_data['date'].isin(dates_test)].isna().sum()

In [None]:
full_data.tail()

In [None]:
full_data.columns

### lag on aggregated features

In [None]:
full_data.info()

In [None]:
for i in full_data.columns:
    if 'float' in str(full_data[i].dtype):
            full_data[i] = full_data[i].astype('float16')

In [None]:
def lag_feature(df, lags, agg_columns, col):
    agg_columns = agg_columns+[col]
    tmp = df[agg_columns]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = agg_columns[:-1]+[col+'_lag_'+str(i)]
        shifted['week_num'] += i
        shifted = shifted.drop_duplicates(agg_columns[:-1])
        df = pd.merge(df, shifted, on=agg_columns[:-1], how='left')
        df[col+'_lag_'+str(i)] = df[col+'_lag_'+str(i)].fillna(0)
    df = df.drop(columns=col)
    return df

In [None]:
full_data = lag_feature(full_data, [2,3], ['week_num','commodity_group'], 'revanue_grouped_week_num_commodity_group_mean')
full_data = lag_feature(full_data, [2,3], ['week_num','cityId'], 'revanue_grouped_week_num_cityId_mean')
full_data = lag_feature(full_data, [2,3], ['week_num','geoCluster'], 'revanue_grouped_week_num_geoCluster_mean')
full_data = lag_feature(full_data, [2,3], ['week_num','geoCluster'], 'revanue_grouped_week_num_geoCluster_sum')

In [None]:
full_data = lag_feature(full_data, [2,3], ['week_num','commodity_group'], 'sales_grouped_week_num_commodity_group_mean')
full_data = lag_feature(full_data, [2,3], ['week_num','cityId'], 'sales_grouped_week_num_cityId_mean')
full_data = lag_feature(full_data, [2,3], ['week_num','geoCluster'], 'sales_grouped_week_num_geoCluster_mean')
full_data = lag_feature(full_data, [2,3], ['week_num','geoCluster'], 'sales_grouped_week_num_geoCluster_sum')

In [None]:
full_data = lag_feature(full_data, [2,3], ['week_num','SKU'], 'sales_grouped_week_num_SKU_mean')
full_data = lag_feature(full_data, [2,3], ['week_num','SKU'], 'sales_grouped_week_num_SKU_sum')

In [None]:
full_data = lag_feature(full_data, [2,3], ['week_num','SKU'], 'revanue_grouped_week_num_SKU_sum')
full_data = lag_feature(full_data, [2,3], ['week_num','SKU'], 'revanue_grouped_week_num_SKU_mean')

In [None]:
full_data = lag_feature(full_data, [2,3], ['week_num','SKU','commodity_group'], 'revanue_grouped_week_num_SKU_commodity_group_sum')
full_data = lag_feature(full_data, [2,3], ['week_num','SKU','commodity_group'], 'revanue_grouped_week_num_SKU_commodity_group_mean')

In [None]:
full_data = lag_feature(full_data, [2,3], ['week_num','SKU','geoCluster'], 'revanue_grouped_week_num_SKU_geoCluster_sum')
full_data = lag_feature(full_data, [2,3], ['week_num','SKU','geoCluster'], 'revanue_grouped_week_num_SKU_geoCluster_mean')

In [None]:
full_data = lag_feature(full_data, [2,3], ['week_num','SKU','commodity_group'], 'sales_grouped_week_num_SKU_commodity_group_sum')


In [None]:
full_data = lag_feature(full_data, [2,3], ['week_num','SKU','commodity_group'], 'sales_grouped_week_num_SKU_commodity_group_mean')

In [None]:
full_data = lag_feature(full_data, [2,3], ['week_num','SKU','geoCluster'], 'sales_grouped_week_num_SKU_geoCluster_sum')
full_data = lag_feature(full_data, [2,3], ['week_num','SKU','geoCluster'], 'sales_grouped_week_num_SKU_geoCluster_mean')

In [None]:
full_data.info()

### tf idf features

In [None]:
sku_meta = pd.read_csv('../sku_final.csv')

In [None]:
sku_meta['productCategory_caption_RU'] = sku_meta['productCategory_caption_RU'].str.lower()

In [None]:
sku_meta['SKU'].max()


In [None]:
full_data['SKU'].max()

In [None]:
sku_meta['SKU'].max()

In [None]:
sku_meta['SKU'].max()

In [None]:
len(set(sku_meta['SKU'].unique()).difference(full_data['SKU'].unique()))

In [None]:
description = np.hstack(sku_meta['productCategory_caption_RU'].apply(lambda x: x.split(' ')))
unique_tokens_desc, frequency_desc = np.unique(description, return_counts=True)

In [None]:
tokens = sorted(list(zip(unique_tokens_desc, frequency_desc)),key=lambda x: x[1])[::-1]

In [None]:
top_tokens = [i[0] for i in tokens[1:21]]

In [None]:
top_tokens

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer()

In [None]:
tfidf_result = vectorizer.fit_transform(sku_meta['productCategory_caption_RU'])

In [None]:
tfidf_result = pd.DataFrame(tfidf_result.todense(), columns=vectorizer.get_feature_names())

In [None]:
meta = pd.concat([sku_meta[['SKU']], tfidf_result[top_tokens]], axis=1)

In [None]:
meta.head()

In [None]:
meta.shape

In [None]:
for i in meta.columns:
    if 'float' in str(meta[i].dtype):
            meta[i] = meta[i].astype('float16')

In [None]:
len(set(full_data['SKU'].unique()).intersection(meta['SKU']))

In [None]:
full_data = full_data.merge(meta, on='SKU', how='left')

In [None]:
full_data.info()

### holidays feture

In [None]:
!pip install holidays

In [None]:
import datetime

In [None]:
import holidays
ukr_holidays = holidays.Ukraine()
ukr_holidays_2020_2021 = {}
for date, name in sorted(holidays.Ukraine(years=2020).items()):
    print(date, name)
    ukr_holidays_2020_2021[date] = name
for date, name in sorted(holidays.Ukraine(years=2021).items()):
    print(date, name)
    ukr_holidays_2020_2021[date] = name

ukr_holidays_2020_2021[datetime.datetime(2020, 11, 23)] = 'Black Friday'
ukr_holidays_2020_2021[datetime.datetime(2020, 11, 24)] = 'Black Friday'
ukr_holidays_2020_2021[datetime.datetime(2020, 11, 25)] = 'Black Friday'
ukr_holidays_2020_2021[datetime.datetime(2020, 11, 26)] = 'Black Friday'
ukr_holidays_2020_2021[datetime.datetime(2020, 11, 27)] = 'Black Friday'
ukr_holidays_2020_2021[datetime.datetime(2020, 11, 28)] = 'Black Friday'
ukr_holidays_2020_2021[datetime.datetime(2020, 11, 29)] = 'Black Friday'
ukr_holidays_2020_2021_df = pd.DataFrame.from_dict(data = ukr_holidays_2020_2021.items())
ukr_holidays_2020_2021_df.columns = ['date','holliday_name']
ukr_holidays_2020_2021_df['date'] = pd.to_datetime(ukr_holidays_2020_2021_df['date'])
ukr_holidays_2020_2021_df['is_holiday'] = 1

In [None]:
ukr_holidays_2020_2021_df.head()

In [None]:
full_data = pd.merge(full_data, ukr_holidays_2020_2021_df[['date','is_holiday']], on = 'date', how='left')
full_data['is_holiday'].fillna(0, inplace = True)

In [None]:
full_data.shape[0]==29622473

In [None]:
full_data = change_type(full_data, 'is_holiday','uint8')

### lockdown feature

In [None]:
lock_down = pd.read_csv('lockdown.csv')

In [None]:
lock_down.head()

In [None]:
lock_down.shape

In [None]:
(lock_down==1).sum()

In [None]:
lock_down['date'] = pd.to_datetime(lock_down['date'])

In [None]:
full_data = full_data.merge(lock_down, on='date', how='left')
full_data['lockdown'].fillna(0, inplace=True)

In [None]:
full_data = change_type(full_data, 'lockdown','uint8')

### Leaving only features of interest

In [None]:
full_data.columns

In [None]:
full_data = full_data.drop(columns=['revanue'])

### train-test-submission split

In [None]:
full_data.isna().sum()

In [None]:
sub_data = full_data[full_data['date'].isin(dates_test)]
train_data = full_data[~full_data['date'].isin(dates_test)]

In [None]:
del full_data;
gc.collect();

In [None]:
import datetime

In [None]:
max_date = train_data['date'].max()
min_date = max_date-datetime.timedelta(days=14)

In [None]:
dates_val = pd.date_range(min_date, max_date)

In [None]:
val_data = train_data[train_data['date'].isin(dates_val)]

In [None]:
val_data.shape

In [None]:
train_data = train_data[~train_data['date'].isin(dates_val)]

# Training 

In [None]:
!pip install lightgbm

In [None]:
import lightgbm as lgb

In [None]:
train_data.columns

In [None]:
train_X = train_data.drop(columns=['sales','ID','date','week_num','productCategory_caption_RU','year'])

In [None]:
dict_mapping = {'собств':'sobstv', 'производства':'proizvodstva',
'сыры':'siri', 'начинкой':'na4inkoi', 'вода':'voda', 'произв':'proizv', 
'собственного':'sobstvennogo', 'йогурты' : 'yogurty',
'полутвердые':'polytverdie', 'украины':'ukraini', 'без':'bez', 'молочные':'molochnie',
'сдоба':'sdoba', 'хлеб':'hleb', 'допек':'dopek',
'десерты':'deserti', 'добавками':'dobavkami', 'газированная':'gzirovanaya', 
'добавок':'dobavok', 'импортная':'importnaya'}

In [None]:
train_X = train_X.rename(columns=dict_mapping)

In [None]:
train_y = train_data['sales']

In [None]:
val_X = val_data.drop(columns=['sales','ID','date','week_num','productCategory_caption_RU','year'])

In [None]:
val_X = val_X.rename(columns=dict_mapping)

In [None]:
val_y = val_data['sales']

In [None]:
del sku_meta, test_data;
gc.collect();

In [None]:
# lgb_model = lgb.LGBMRegressor(feature_fraction= 0.75,
#                metric = 'mae',
#                max_depth = 16, 
#                min_data_in_leaf = 2**7, 
#                bagging_fraction = 0.75, 
#                learning_rate = 0.03, 
#                objective = 'mae', 
#                bagging_seed = 2**7, 
#                num_leaves = 200,
#                bagging_freq =1,
#                verbose = 1,
#             random_state=5,
#                              n_estimators=100)
# lgb_model.fit(train_X,train_y,eval_metric="mae", 
#     eval_set=[(val_X, val_y)], 
#     verbose=True, 
#     early_stopping_rounds = 10)

In [None]:
# from lightgbm import plot_importance


In [None]:
# plot_importance(lgb_model,ax=plt.subplots(1,1,figsize=(15,12))[1])

In [None]:
del lgb_model;
gc.collect();

In [None]:
full_data_X = pd.concat([train_X, val_X],axis=0)

In [None]:
full_data_X.shape

In [None]:
del train_X, val_X;
gc.collect();

In [None]:
full_data_y = pd.concat([train_y,val_y],axis=0)

In [None]:
del train_y, val_y;
gc.collect();

### final train

In [None]:
# gc.collect();

In [None]:
lgb_model = lgb.LGBMRegressor(feature_fraction= 0.75,
               metric = 'mae',
               max_depth = 16, 
               min_data_in_leaf = 2**7, 
               bagging_fraction = 0.75, 
               learning_rate = 0.03, 
               objective = 'mae', 
               bagging_seed = 2**7, 
               num_leaves = 200,
               bagging_freq =1,
               verbose = 1,
            random_state=5,
                             n_estimators=100)
lgb_model.fit(full_data_X,full_data_y,eval_metric="mae", 
    eval_set=[(full_data_X, full_data_y)], 
    verbose=True, 
    early_stopping_rounds = 10)

### prediction for submission

In [None]:
sub_X = sub_data.drop(columns=['sales','ID','date','week_num','productCategory_caption_RU','year'])

In [None]:
sub_X = sub_X.rename(columns=dict_mapping)

In [None]:
prediction = lgb_model.predict(sub_X)

In [None]:
np.mean(prediction)

In [None]:
len(prediction)

In [None]:
sub_data['sales'] = prediction

In [None]:
sub_data.loc[sub_data['sales']<0,'sales']= 0

In [None]:
sub_data['sales'].mean()

In [None]:
sub_data = sub_data[['ID','sales']]

In [None]:
sub_data.to_csv('submission_6.csv',index=False)