# Short Notebook

In [308]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook

import datetime as dt
sns.set_style("whitegrid")
sns.set_palette("colorblind")

# from matplotlib import rc
# rc('font',**{'family':'sans-serif'})
# rc('axes', **{'titlesize': '8', 'labelsize': '6'})
# rc('legend', **{'fontsize': '6'})
# rc('figure', **{'dpi' : 200})

In [309]:
# Reading datasets
beer_train = pd.read_csv('../beer_train.csv', parse_dates=['Date'])
beer_test = pd.read_csv('../beer_test.csv', parse_dates=['Date'])
sku_features = pd.read_csv('../sku_features.csv')
id_map = pd.read_csv('../id_store_sku.csv')


In [310]:
beer_train['Promo_help'] = (beer_train['isPromo']>0)*1
beer_test['Promo_help'] = (beer_test['isPromo']>0)*1

In [311]:
# This is a core part of the transition from a time-series prediction to a regression problem.
# The main idea is to featurize time in a way that it will keep the important information about the date.

def generate_features(df, sku_features, id_map):
        
    # Add metadata
    df = pd.merge(df, id_map, how='left', on='ts_id')
    df = pd.merge(df, sku_features, how='left', on='SKU')

    # Time features
    df['day_of_month'] = df['Date'].dt.day
    df['day_of_week'] = df['Date'].dt.weekday
    df['month'] = df['Date'].dt.month
    df['year'] = df['Date'].dt.year
    df['week'] = df['Date'].dt.week
    
    # Enlarge promo features
    # Since we know that promo is important
    
    df['ts_promo'] = df['ts_id'].astype(str) + df['isPromo'].astype(str)
    df['store_promo'] = df['Store'].astype(str) + df['isPromo'].astype(str)
    df['segment_promo'] = df['Segment'].astype(str) + df['isPromo'].astype(str)
    df['brand_promo'] = df['Brand'].astype(str) + df['isPromo'].astype(str)
    df['sku_promo'] = df['SKU'].astype(str) + df['isPromo'].astype(str)

    
    return df

In [312]:
beer_train = generate_features(beer_train, sku_features, id_map)
beer_test = generate_features(beer_test, sku_features, id_map)

# Days since Promo

In [313]:
master_df = pd.DataFrame()
j = 0
for i in beer_train['ts_id'].unique():
    j = j+1
    df = beer_train[beer_train['ts_id']==i]
    df['days_since_promo'] = df['Date'] - df['Date'].where(df['Promo_help']==1).ffill()
    df['days_since_promo'] = df['days_since_promo'].fillna(df['days_since_promo'].mean())
    master_df =  pd.concat([master_df, df['days_since_promo']])
    if j%200==0:
        print(j)
beer_train = pd.concat([beer_train, master_df], axis=1)
beer_train['Days_since_promo'] = beer_train.iloc[:,-1].dt.components.days.astype(int)
beer_train = beer_train.drop(columns=beer_train.iloc[0:0,-2].name)
beer_train.head()

200
400
600
800
1000
1200
1400
1600
1800


Unnamed: 0,Date,ts_id,isPromo,Sales,Promo_help,Store,SKU,Segment,Pack,Product,...,day_of_week,month,year,week,ts_promo,store_promo,segment_promo,brand_promo,sku_promo,Days_since_promo
0,2015-06-15,0,0.0,0.1,0,3301,"_Baltika №0 Can 0,45",Premium,Can,Non-alco,...,0,6,2015,25,0.0,33010.0,Premium0.0,Baltika №30.0,"_Baltika №0 Can 0,450.0",62
1,2015-06-15,1,0.0,0.35,0,3301,"_Baltika №3 Can 0,45",MainStream,Can,Lager,...,0,6,2015,25,10.0,33010.0,MainStream0.0,Baltika №30.0,"_Baltika №3 Can 0,450.0",13
2,2015-06-15,2,1.0,0.6,1,3301,"_Baltika №7 Can 0,45",Premium,Can,Lager,...,0,6,2015,25,21.0,33011.0,Premium1.0,Baltika №71.0,"_Baltika №7 Can 0,451.0",0
3,2015-06-15,3,1.0,0.35,1,3301,"_Baltika №7 Bottle 0,47",Premium,Bottle,Lager,...,0,6,2015,25,31.0,33011.0,Premium1.0,Baltika №71.0,"_Baltika №7 Bottle 0,471.0",0
4,2015-06-15,4,0.0,0.15,0,3301,"_Baltika №9 Can 0,45",MainStream,Can,Lager,...,0,6,2015,25,40.0,33010.0,MainStream0.0,Baltika №90.0,"_Baltika №9 Can 0,450.0",58


In [314]:
master_df = pd.DataFrame()
j = 0
for i in beer_test['ts_id'].unique():
    j = j+1
    df = beer_test[beer_test['ts_id']==i]
    df['days_since_promo'] = df['Date'] - df['Date'].where(df['Promo_help']==1).ffill()
    df['days_since_promo'] = df['days_since_promo'].fillna(df['days_since_promo'].mean())
    master_df =  pd.concat([master_df, df['days_since_promo']])
    if j%200==0:
        print(j)
beer_test = pd.concat([beer_test, master_df], axis=1)
beer_test.iloc[:,-1] = beer_test.iloc[:,-1].fillna(beer_test.iloc[:,-1].mean())

beer_test['Days_since_promo'] = beer_test.iloc[:,-1].dt.components.days.astype(int)
beer_test = beer_test.drop(columns=beer_test.iloc[0:0,-2].name)
beer_test.head()

200
400
600
800
1000
1200
1400
1600
1800


Unnamed: 0,id,Date,ts_id,isPromo,Promo_help,Store,SKU,Segment,Pack,Product,...,day_of_week,month,year,week,ts_promo,store_promo,segment_promo,brand_promo,sku_promo,Days_since_promo
0,0,2017-06-15,0,0.0,0,3301,"_Baltika №0 Can 0,45",Premium,Can,Non-alco,...,3,6,2017,24,0.0,33010.0,Premium0.0,Baltika №30.0,"_Baltika №0 Can 0,450.0",16
1,1,2017-06-15,1,1.0,1,3301,"_Baltika №3 Can 0,45",MainStream,Can,Lager,...,3,6,2017,24,11.0,33011.0,MainStream1.0,Baltika №31.0,"_Baltika №3 Can 0,451.0",0
2,2,2017-06-15,2,1.0,1,3301,"_Baltika №7 Can 0,45",Premium,Can,Lager,...,3,6,2017,24,21.0,33011.0,Premium1.0,Baltika №71.0,"_Baltika №7 Can 0,451.0",0
3,3,2017-06-15,3,1.0,1,3301,"_Baltika №7 Bottle 0,47",Premium,Bottle,Lager,...,3,6,2017,24,31.0,33011.0,Premium1.0,Baltika №71.0,"_Baltika №7 Bottle 0,471.0",0
4,4,2017-06-15,4,0.0,0,3301,"_Baltika №9 Can 0,45",MainStream,Can,Lager,...,3,6,2017,24,40.0,33010.0,MainStream0.0,Baltika №90.0,"_Baltika №9 Can 0,450.0",44


# Promo in Store on other Beer

In [315]:
master_df = pd.DataFrame()
j = 0
for i in beer_train['Store'].unique():
    j = j+1
    df = beer_train[beer_train['Store']==i]
    for i in df['Date'].unique():
        if 1 in df.Promo_help:
            df['Promo_on_other_beer'] = 1 - df['Promo_help'] 
        else:
             df['Promo_on_other_beer'] = 0 
    if j%30==0:
        print(j)
    master_df =  pd.concat([master_df, df['Promo_on_other_beer']])
beer_train = pd.concat([beer_train, master_df], axis=1)
beer_train['Promo_on_other_beer'] = beer_train.iloc[:,-1]
beer_train = beer_train.drop(columns=beer_train.iloc[0:0,-2].name)
beer_train.head()

30
60
90
120
150
180
210
240


Unnamed: 0,Date,ts_id,isPromo,Sales,Promo_help,Store,SKU,Segment,Pack,Product,...,month,year,week,ts_promo,store_promo,segment_promo,brand_promo,sku_promo,Days_since_promo,Promo_on_other_beer
0,2015-06-15,0,0.0,0.1,0,3301,"_Baltika №0 Can 0,45",Premium,Can,Non-alco,...,6,2015,25,0.0,33010.0,Premium0.0,Baltika №30.0,"_Baltika №0 Can 0,450.0",62,1.0
1,2015-06-15,1,0.0,0.35,0,3301,"_Baltika №3 Can 0,45",MainStream,Can,Lager,...,6,2015,25,10.0,33010.0,MainStream0.0,Baltika №30.0,"_Baltika №3 Can 0,450.0",13,1.0
2,2015-06-15,2,1.0,0.6,1,3301,"_Baltika №7 Can 0,45",Premium,Can,Lager,...,6,2015,25,21.0,33011.0,Premium1.0,Baltika №71.0,"_Baltika №7 Can 0,451.0",0,0.0
3,2015-06-15,3,1.0,0.35,1,3301,"_Baltika №7 Bottle 0,47",Premium,Bottle,Lager,...,6,2015,25,31.0,33011.0,Premium1.0,Baltika №71.0,"_Baltika №7 Bottle 0,471.0",0,0.0
4,2015-06-15,4,0.0,0.15,0,3301,"_Baltika №9 Can 0,45",MainStream,Can,Lager,...,6,2015,25,40.0,33010.0,MainStream0.0,Baltika №90.0,"_Baltika №9 Can 0,450.0",58,1.0


In [316]:
master_df = pd.DataFrame()
j = 0
for i in beer_test['Store'].unique():
    j = j+1
    df = beer_test[beer_test['Store']==i]
    df['Promo_on_other_beer'] = np.NaN
    for i in df['Date'].unique():
        if 1 in df.Promo_help:
            df['Promo_on_other_beer'] = 1 - df['Promo_help'] 
        else:
             df['Promo_on_other_beer'] = 0 
    if j%30==0:
        print(j)
    master_df =  pd.concat([master_df, df['Promo_on_other_beer']])
beer_test = pd.concat([beer_test, master_df], axis=1)
beer_test['Promo_on_other_beer'] = beer_test.iloc[:,-1]
beer_test = beer_test.drop(columns=beer_test.iloc[0:0,-2].name)
beer_test.head()

30
60
90
120
150
180
210
240


Unnamed: 0,id,Date,ts_id,isPromo,Promo_help,Store,SKU,Segment,Pack,Product,...,month,year,week,ts_promo,store_promo,segment_promo,brand_promo,sku_promo,Days_since_promo,Promo_on_other_beer
0,0,2017-06-15,0,0.0,0,3301,"_Baltika №0 Can 0,45",Premium,Can,Non-alco,...,6,2017,24,0.0,33010.0,Premium0.0,Baltika №30.0,"_Baltika №0 Can 0,450.0",16,1.0
1,1,2017-06-15,1,1.0,1,3301,"_Baltika №3 Can 0,45",MainStream,Can,Lager,...,6,2017,24,11.0,33011.0,MainStream1.0,Baltika №31.0,"_Baltika №3 Can 0,451.0",0,0.0
2,2,2017-06-15,2,1.0,1,3301,"_Baltika №7 Can 0,45",Premium,Can,Lager,...,6,2017,24,21.0,33011.0,Premium1.0,Baltika №71.0,"_Baltika №7 Can 0,451.0",0,0.0
3,3,2017-06-15,3,1.0,1,3301,"_Baltika №7 Bottle 0,47",Premium,Bottle,Lager,...,6,2017,24,31.0,33011.0,Premium1.0,Baltika №71.0,"_Baltika №7 Bottle 0,471.0",0,0.0
4,4,2017-06-15,4,0.0,0,3301,"_Baltika №9 Can 0,45",MainStream,Can,Lager,...,6,2017,24,40.0,33010.0,MainStream0.0,Baltika №90.0,"_Baltika №9 Can 0,450.0",44,1.0


# Promo on same beer in other store

In [317]:
master_df = pd.DataFrame()
j = 0
for i in beer_train['SKU'].unique():
    j = j+1
    df = beer_train[beer_train['SKU']==i]
    df['promo_in_other_store'] = 0

    for k in df['Date'].unique():
        df.loc[df['Date'] == k,'promo_in_other_store'] = df.loc[df['Date']==k,'Promo_help'].sum()

    master_df =  pd.concat([master_df, df['promo_in_other_store']])

    print(j)
beer_train = pd.concat([beer_train, master_df], axis=1)
beer_train['promo_in_other_store'] = beer_train.iloc[:,-1]
beer_train = beer_train.drop(columns=beer_train.iloc[0:0,-2].name)
beer_train.head()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18


Unnamed: 0,Date,ts_id,isPromo,Sales,Promo_help,Store,SKU,Segment,Pack,Product,...,year,week,ts_promo,store_promo,segment_promo,brand_promo,sku_promo,Days_since_promo,Promo_on_other_beer,promo_in_other_store
0,2015-06-15,0,0.0,0.1,0,3301,"_Baltika №0 Can 0,45",Premium,Can,Non-alco,...,2015,25,0.0,33010.0,Premium0.0,Baltika №30.0,"_Baltika №0 Can 0,450.0",62,1.0,0.0
1,2015-06-15,1,0.0,0.35,0,3301,"_Baltika №3 Can 0,45",MainStream,Can,Lager,...,2015,25,10.0,33010.0,MainStream0.0,Baltika №30.0,"_Baltika №3 Can 0,450.0",13,1.0,0.0
2,2015-06-15,2,1.0,0.6,1,3301,"_Baltika №7 Can 0,45",Premium,Can,Lager,...,2015,25,21.0,33011.0,Premium1.0,Baltika №71.0,"_Baltika №7 Can 0,451.0",0,0.0,178.0
3,2015-06-15,3,1.0,0.35,1,3301,"_Baltika №7 Bottle 0,47",Premium,Bottle,Lager,...,2015,25,31.0,33011.0,Premium1.0,Baltika №71.0,"_Baltika №7 Bottle 0,471.0",0,0.0,137.0
4,2015-06-15,4,0.0,0.15,0,3301,"_Baltika №9 Can 0,45",MainStream,Can,Lager,...,2015,25,40.0,33010.0,MainStream0.0,Baltika №90.0,"_Baltika №9 Can 0,450.0",58,1.0,0.0


In [318]:
master_df = pd.DataFrame()
j = 0
for i in beer_test['SKU'].unique():
    j = j+1
    df = beer_test[beer_test['SKU']==i]
    df['promo_in_other_store'] = 0
    for k in df['Date'].unique():
        df.loc[df['Date'] == k,'promo_in_other_store'] = df.loc[df['Date']==k,'Promo_help'].sum()

    master_df =  pd.concat([master_df, df['promo_in_other_store']])

    print(j)
beer_test = pd.concat([beer_test, master_df], axis=1)
beer_test['promo_in_other_store'] = beer_test.iloc[:,-1]
beer_test = beer_test.drop(columns=beer_test.iloc[0:0,-2].name)
beer_test.head()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18


Unnamed: 0,id,Date,ts_id,isPromo,Promo_help,Store,SKU,Segment,Pack,Product,...,year,week,ts_promo,store_promo,segment_promo,brand_promo,sku_promo,Days_since_promo,Promo_on_other_beer,promo_in_other_store
0,0,2017-06-15,0,0.0,0,3301,"_Baltika №0 Can 0,45",Premium,Can,Non-alco,...,2017,24,0.0,33010.0,Premium0.0,Baltika №30.0,"_Baltika №0 Can 0,450.0",16,1.0,0.0
1,1,2017-06-15,1,1.0,1,3301,"_Baltika №3 Can 0,45",MainStream,Can,Lager,...,2017,24,11.0,33011.0,MainStream1.0,Baltika №31.0,"_Baltika №3 Can 0,451.0",0,0.0,162.0
2,2,2017-06-15,2,1.0,1,3301,"_Baltika №7 Can 0,45",Premium,Can,Lager,...,2017,24,21.0,33011.0,Premium1.0,Baltika №71.0,"_Baltika №7 Can 0,451.0",0,0.0,248.0
3,3,2017-06-15,3,1.0,1,3301,"_Baltika №7 Bottle 0,47",Premium,Bottle,Lager,...,2017,24,31.0,33011.0,Premium1.0,Baltika №71.0,"_Baltika №7 Bottle 0,471.0",0,0.0,179.0
4,4,2017-06-15,4,0.0,0,3301,"_Baltika №9 Can 0,45",MainStream,Can,Lager,...,2017,24,40.0,33010.0,MainStream0.0,Baltika №90.0,"_Baltika №9 Can 0,450.0",44,1.0,0.0


# Promo on beer in other stores

In [319]:
j = 0

beer_train['total_promo'] = 0

for k in beer_train['Date'].unique():
    beer_train.loc[beer_train['Date'] == k,'total_promo'] = beer_train.loc[beer_train['Date']==k,'Promo_help'].sum()
    j= j+1
    if j%100==0:
        print(j)

print(j)

beer_train.head()

100
200
300
400
500
600
700
731


Unnamed: 0,Date,ts_id,isPromo,Sales,Promo_help,Store,SKU,Segment,Pack,Product,...,week,ts_promo,store_promo,segment_promo,brand_promo,sku_promo,Days_since_promo,Promo_on_other_beer,promo_in_other_store,total_promo
0,2015-06-15,0,0.0,0.1,0,3301,"_Baltika №0 Can 0,45",Premium,Can,Non-alco,...,25,0.0,33010.0,Premium0.0,Baltika №30.0,"_Baltika №0 Can 0,450.0",62,1.0,0.0,506
1,2015-06-15,1,0.0,0.35,0,3301,"_Baltika №3 Can 0,45",MainStream,Can,Lager,...,25,10.0,33010.0,MainStream0.0,Baltika №30.0,"_Baltika №3 Can 0,450.0",13,1.0,0.0,506
2,2015-06-15,2,1.0,0.6,1,3301,"_Baltika №7 Can 0,45",Premium,Can,Lager,...,25,21.0,33011.0,Premium1.0,Baltika №71.0,"_Baltika №7 Can 0,451.0",0,0.0,178.0,506
3,2015-06-15,3,1.0,0.35,1,3301,"_Baltika №7 Bottle 0,47",Premium,Bottle,Lager,...,25,31.0,33011.0,Premium1.0,Baltika №71.0,"_Baltika №7 Bottle 0,471.0",0,0.0,137.0,506
4,2015-06-15,4,0.0,0.15,0,3301,"_Baltika №9 Can 0,45",MainStream,Can,Lager,...,25,40.0,33010.0,MainStream0.0,Baltika №90.0,"_Baltika №9 Can 0,450.0",58,1.0,0.0,506


In [320]:
j = 0

beer_test['total_promo'] = 0

for k in beer_test['Date'].unique():
    beer_test.loc[beer_test['Date'] == k,'total_promo'] = beer_test.loc[beer_test['Date']==k,'Promo_help'].sum()
    j= j+1
    if j%100==0:
        print(j)

print(j)

beer_test.head()

100
102


Unnamed: 0,id,Date,ts_id,isPromo,Promo_help,Store,SKU,Segment,Pack,Product,...,week,ts_promo,store_promo,segment_promo,brand_promo,sku_promo,Days_since_promo,Promo_on_other_beer,promo_in_other_store,total_promo
0,0,2017-06-15,0,0.0,0,3301,"_Baltika №0 Can 0,45",Premium,Can,Non-alco,...,24,0.0,33010.0,Premium0.0,Baltika №30.0,"_Baltika №0 Can 0,450.0",16,1.0,0.0,650
1,1,2017-06-15,1,1.0,1,3301,"_Baltika №3 Can 0,45",MainStream,Can,Lager,...,24,11.0,33011.0,MainStream1.0,Baltika №31.0,"_Baltika №3 Can 0,451.0",0,0.0,162.0,650
2,2,2017-06-15,2,1.0,1,3301,"_Baltika №7 Can 0,45",Premium,Can,Lager,...,24,21.0,33011.0,Premium1.0,Baltika №71.0,"_Baltika №7 Can 0,451.0",0,0.0,248.0,650
3,3,2017-06-15,3,1.0,1,3301,"_Baltika №7 Bottle 0,47",Premium,Bottle,Lager,...,24,31.0,33011.0,Premium1.0,Baltika №71.0,"_Baltika №7 Bottle 0,471.0",0,0.0,179.0,650
4,4,2017-06-15,4,0.0,0,3301,"_Baltika №9 Can 0,45",MainStream,Can,Lager,...,24,40.0,33010.0,MainStream0.0,Baltika №90.0,"_Baltika №9 Can 0,450.0",44,1.0,0.0,650


# Holidays

In [321]:
holidays = pd.read_csv('holidays2.csv')

In [322]:
holidays.head()

Unnamed: 0,holidays
0,2015-01-01
1,2015-01-02
2,2015-01-03
3,2015-01-04
4,2015-01-05


In [323]:
beer_train['is_holiday'] = (beer_train['Date'].isin(np.array(holidays['holidays'])))*1
beer_test['is_holiday'] = (beer_test['Date'].isin(np.array(holidays['holidays'])))*1

In [324]:
beer_test['is_holiday'] = 0

beer_train['days_from_holiday'] = beer_train['Date'] - beer_train['Date'].where(beer_train['is_holiday']==1).ffill()
beer_test['days_from_holiday'] = beer_test['Date'] - np.datetime64('2017-06-12')

beer_train['days_from_holiday'] = beer_train['days_from_holiday'].fillna(beer_train['days_from_holiday'].mean())
beer_train['days_from_holiday'] = beer_train.iloc[:,-1].dt.components.days.astype(int)

beer_test['days_from_holiday'] = beer_test['days_from_holiday'].fillna(beer_test['days_from_holiday'].mean())
beer_test['days_from_holiday'] = beer_test.iloc[:,-1].dt.components.days.astype(int)

beer_train['days_to_holiday'] = beer_train['Date'].where(beer_train['is_holiday']==1).bfill() - beer_train['Date']
beer_test['days_to_holiday'] = np.datetime64('2017-11-02') - beer_test['Date']

beer_train['days_to_holiday'] = beer_train['days_to_holiday'].fillna(beer_train['days_to_holiday'].mean())
beer_train['days_to_holiday'] = beer_train.iloc[:,-1].dt.components.days.astype(int)

beer_test['days_to_holiday'] = beer_test['days_to_holiday'].fillna(beer_test['days_to_holiday'].mean())
beer_test['days_to_holiday'] = beer_test.iloc[:,-1].dt.components.days.astype(int)

In [325]:
org_data_train = beer_train.copy()
org_data_test = beer_test.copy()

# LightGBM

In [326]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook
sns.set_style("whitegrid")
sns.set_palette("colorblind")
import lightgbm as lgb

# from matplotlib import rc
# rc('font',**{'family':'sans-serif'})
# rc('axes', **{'titlesize': '8', 'labelsize': '6'})
# rc('legend', **{'fontsize': '6'})
# rc('figure', **{'dpi' : 200})
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

In [327]:
beer_train = org_data_train
beer_test = org_data_test

In [328]:
beer_train['id'] = beer_train.index+1

beer_train_promo = beer_train[beer_train['isPromo']>0]
beer_test_promo = beer_test[beer_test['isPromo']>0]

beer_train = beer_train[beer_train['isPromo']==0]
beer_test = beer_test[beer_test['isPromo']==0]

id_tag2_promo = beer_train_promo['id']
id_tag2 = beer_train['id']

In [329]:
arr = ['Store','Segment', 'SKU', 'Pack', 'Product',
       'Brand', 'day_of_week', 'month','week'] #ts_id
drop = ['ts_promo', 'store_promo', 'segment_promo', 'brand_promo','is_holiday','Pack',
       'sku_promo','day_of_month','ts_id','Promo_help','isPromo','Promo_on_other_beer']

for c in arr:
    beer_train[c] = beer_train[c].astype('category')
    beer_test[c] = beer_test[c].astype('category')
    
beer_train = beer_train.drop(columns=drop)
beer_test = beer_test.drop(columns=drop)

In [330]:
arr = ['Store', 'SKU','day_of_week', 'month','ts_promo',
       'week', 'store_promo', 'brand_promo','sku_promo'] #ts_id
drop = ['day_of_month','Segment','Pack','Brand','segment_promo','Product','ts_id','Pack','is_holiday',
       'Promo_on_other_beer','Promo_help','Days_since_promo'] 
#'ts_id','Pack','is_holiday','Promo_on_other_beer','Promo_help','Days_since_promo' sold_last_week, week,ts_promo <-- with theese was previus best  
for c in arr:
    beer_train_promo[c] = beer_train_promo[c].astype('category')
    beer_test_promo[c] = beer_test_promo[c].astype('category')

beer_train_promo = beer_train_promo.drop(columns=drop)
beer_test_promo = beer_test_promo.drop(columns=drop)

In [331]:
train_y_promo = beer_train_promo['Sales']
train_y = beer_train['Sales']
del(beer_train_promo['Sales'], beer_train_promo['Date'],beer_train_promo['id'])
del(beer_train['Sales'], beer_train['Date'],beer_train['id'])
dates_test=beer_test[['Date','id']]
dates_test_promo = beer_test_promo[['Date','id']]
del(beer_test['Date'])
del(beer_test_promo['Date'])

In [332]:
id_tag = beer_test['id']
id_tag_promo = beer_test_promo['id']
del(beer_test['id'])
del(beer_test_promo['id'])

In [333]:
params= {'n_estimators': 5000, 'learning_rate': 0.01, 'num_leaves': 1980, 'max_depth': 10, 'min_child_samples': 1800, 'max_bin': 200, 'reg_alpha': 95, 'reg_lambda': 5, 
         'min_split_gain': 4.908999802253177, 'subsample': 0.7000000000000001, 'subsample_freq': 1, 'colsample_bytree': 0.95}
master_df = pd.DataFrame(columns=['id', 'Sales'])
model = lgb.LGBMRegressor(**params,random_state=42)
model.fit(beer_train,train_y)
prediction = model.predict(beer_test)
dataset = pd.DataFrame({'id': id_tag, 'Sales': prediction}, columns=['id', 'Sales'])
master_df =  pd.concat([master_df, dataset], ignore_index=True)


In [334]:
params = {'n_estimators': 850, 'learning_rate': 0.008, 'num_leaves': 560, 'max_depth': 9, 'min_child_samples': 200, 'max_bin': 285, 'reg_alpha': 30, 
          'reg_lambda': 70, 'min_split_gain': 2.7956030162894496, 'subsample': 0.45, 'subsample_freq': 1, 'colsample_bytree': 0.45}
model_promo = lgb.LGBMRegressor(**params,random_state=42)
model_promo.fit(beer_train_promo,train_y_promo)

prediction = model_promo.predict(beer_test_promo)
dataset = pd.DataFrame({'id': id_tag_promo, 'Sales': prediction}, columns=['id', 'Sales'])
master_df =  pd.concat([master_df, dataset], ignore_index=True)


In [335]:
dates_id = pd.concat([dates_test,dates_test_promo])

dates_id.shape

master_df = pd.merge(master_df,dates_id,on='id')

master_df.shape

master_df['Date'] = master_df['Date'].astype(str)

master_df['Date'].dtype

d1 = np.datetime64('2017-06-17')
arr = [d1.astype(str)]
for i in range(1,16):
    day = d1+np.timedelta64(i,'D')
    arr.append(day.astype(str))

master_df.loc[master_df['Date'].isin(arr),'Sales'] = master_df.loc[master_df['Date'].isin(arr),'Sales']*1.23

master_df = master_df.drop(columns=['Date'])

predictions_lgbm = master_df

# CatBoost

In [340]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook
sns.set_style("whitegrid")
sns.set_palette("colorblind")

# from matplotlib import rc
# rc('font',**{'family':'sans-serif'})
# rc('axes', **{'titlesize': '8', 'labelsize': '6'})
# rc('legend', **{'fontsize': '6'})
# rc('figure', **{'dpi' : 200})
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_log_error, make_scorer


from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
import optuna

from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.metrics import mean_squared_error

In [344]:
beer_train = org_data_train
beer_test = org_data_test

In [345]:
beer_train.loc[beer_train['Sales']<0,'Sales'] = 0

In [346]:
beer_train_promo = beer_train[beer_train['isPromo']>0]
beer_test_promo = beer_test[beer_test['isPromo']>0]

beer_train = beer_train[beer_train['isPromo']==0]
beer_test = beer_test[beer_test['isPromo']==0]



arr = ['Store','Segment', 'SKU', 'Pack', 'Product',
       'Brand', 'day_of_week', 'month','week']
drop = ['ts_promo', 'store_promo', 'segment_promo', 'brand_promo','total_promo','days_from_holiday',
       'sku_promo','day_of_month','ts_id', 'ts_promo','isPromo','Promo_help','Promo_on_other_beer','days_to_holiday']
for c in arr:
    beer_train[c] = beer_train[c].astype('category')
    beer_test[c] = beer_test[c].astype('category')
    
beer_train = beer_train.drop(columns=drop)
beer_test = beer_test.drop(columns=drop)

arr = [ 'Store', 'SKU','day_of_week', 'month',
       'week', 'store_promo', 'brand_promo','sku_promo']
drop = ['day_of_month','Segment','Pack','Brand','segment_promo','Product','ts_id', 'ts_promo','Promo_help','days_to_holiday',
        'Days_since_promo','Promo_on_other_beer','total_promo','days_from_holiday']
for c in arr:
    beer_train_promo[c] = beer_train_promo[c].astype('category')
    beer_test_promo[c] = beer_test_promo[c].astype('category')

beer_train_promo = beer_train_promo.drop(columns=drop)
beer_test_promo = beer_test_promo.drop(columns=drop)

In [347]:
train_y_promo = beer_train_promo['Sales']
train_y = beer_train['Sales']
del(beer_train_promo['Sales'], beer_train_promo['Date'])
del(beer_train['Sales'], beer_train['Date'])
dates_test=beer_test[['Date','id']]
dates_test_promo = beer_test_promo[['Date','id']]
del(beer_test['Date'])
del(beer_test_promo['Date'])
del(beer_train_promo['id'])
del(beer_train['id'])

In [348]:
id_tag = beer_test['id']
id_tag_promo = beer_test_promo['id']

del(beer_test['id'])
del(beer_test_promo['id'])


In [349]:
print(beer_train.columns)

Index(['Store', 'SKU', 'Segment', 'Pack', 'Product', 'Brand', 'Volume',
       'day_of_week', 'month', 'year', 'week', 'Days_since_promo',
       'promo_in_other_store', 'is_holiday'],
      dtype='object')


In [350]:
master_df= pd.DataFrame(columns=['id', 'Sales'])


model = CatBoostRegressor(learning_rate=0.001,n_estimators=10000,random_strength=10,random_state=42,
                          one_hot_max_size=2000,depth=10,l2_leaf_reg=0.5,cat_features=[0,1,2,3,4,5,7,8,10],silent=True)
model.fit(beer_train,train_y)


<catboost.core.CatBoostRegressor at 0x1dc9a656af0>

In [351]:
prediction = model.predict(beer_test)
dataset = pd.DataFrame({'id': id_tag, 'Sales': prediction}, columns=['id', 'Sales'])
master_df =  pd.concat([master_df, dataset], ignore_index=True)

In [352]:
print(beer_train_promo.columns)

Index(['isPromo', 'Store', 'SKU', 'Volume', 'day_of_week', 'month', 'year',
       'week', 'store_promo', 'brand_promo', 'sku_promo',
       'promo_in_other_store', 'is_holiday'],
      dtype='object')


In [353]:
model1 = CatBoostRegressor(learning_rate=0.001,n_estimators=10000,random_strength=10,random_state=42,
                          one_hot_max_size=2000,depth=10,l2_leaf_reg=0.5,cat_features=[1,2,4,5,7,8,9,10],silent=True)
model1.fit(beer_train_promo,train_y_promo)

<catboost.core.CatBoostRegressor at 0x1dc9abb8490>

In [354]:
prediction = model1.predict(beer_test_promo)



dataset = pd.DataFrame({'id': id_tag_promo, 'Sales': prediction}, columns=['id', 'Sales'])

master_df =  pd.concat([master_df, dataset], ignore_index=True)

In [355]:

master_df.shape

dates_id = pd.concat([dates_test,dates_test_promo])

dates_id.shape

master_df = pd.merge(master_df,dates_id,on='id')

master_df.shape

master_df['Date'] = master_df['Date'].astype(str)

master_df['Date'].dtype

d1 = np.datetime64('2017-06-17')
arr = [d1.astype(str)]
for i in range(1,16):
    day = d1+np.timedelta64(i,'D')
    arr.append(day.astype(str))

master_df.loc[master_df['Date'].isin(arr),'Sales'] = master_df.loc[master_df['Date'].isin(arr),'Sales']*1.23


master_df.head()


master_df = master_df.drop(columns=['Date'])
predictions_catboost = master_df

# Stacking

In [356]:
predictions_lgbm['id'] = predictions_lgbm['id'].astype(int)
predictions_catboost['id'] = predictions_catboost['id'].astype(int)

In [357]:
predictions_lgbm = predictions_lgbm.sort_values('id')
predictions_catboost = predictions_catboost.sort_values('id')

In [358]:
stack = pd.DataFrame(columns=['id','Sales'])

In [359]:
stack['Sales'] = 0.4*predictions_catboost['Sales'] + 0.6*predictions_lgbm['Sales'] 
stack['id'] = predictions_lgbm['id']

In [360]:
stack.loc[stack['Sales']<0,'Sales']=0

In [361]:
stack.to_csv('submission_final.csv',index=False)