# Library

In [None]:
import warnings
warnings.filterwarnings("ignore")


import numpy as np
import pandas as pd
import datetime
from catboost import CatBoostClassifier
import lightgbm as lgb
from time import time
from tqdm import tqdm
from collections import Counter
from scipy import stats
import gc, pickle
import ast

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, log_loss
from sklearn.linear_model import Ridge,Lasso, BayesianRidge
from sklearn.svm import LinearSVR
from sklearn.preprocessing import minmax_scale

In [None]:
# IPython magic: selects matplotlib's inline backend for this notebook.
%matplotlib inline

# Preprocessing

In [None]:
def create_is_sell_data(sell_prices_df, calendar_df, train_df):
    """Build per-series daily price and availability tables.

    Side effects: adds an ``id`` column to ``sell_prices_df`` and replaces
    the index of ``train_df`` with its ``id`` column.

    Args:
        sell_prices_df: frame with ``item_id``, ``store_id``, ``wm_yr_wk``
            and ``sell_price`` columns.
        calendar_df: frame with ``d`` and ``wm_yr_wk`` columns.
        train_df: frame with ``id``, ``item_id``, ``dept_id``, ``cat_id``,
            ``store_id`` and ``state_id`` columns.

    Returns:
        Tuple ``(price_data, is_sell)``. Both hold the six id columns of
        ``train_df`` followed by one column per calendar day ``d``.
        ``price_data`` contains the weekly sell price mapped onto each day
        (0 where no price exists); ``is_sell`` contains 1.0 where a price
        exists for that day and 0.0 otherwise.
    """
    # Series id in the same "<item>_<store>_validation" form used by train_df.id.
    sell_prices_df['id'] = sell_prices_df['item_id'].astype('str')+'_'+sell_prices_df['store_id']+'_validation'
    # Keep only price rows whose week appears in the calendar.
    sell_prices_data = sell_prices_df[sell_prices_df.wm_yr_wk.isin(calendar_df.wm_yr_wk.unique())]
    sell_prices_data.reset_index(drop=True, inplace=True)
    # Nested lookup: id -> {wm_yr_wk: sell_price}.
    tmp = sell_prices_data.groupby(['id'])[['wm_yr_wk', 'sell_price']].apply(
        lambda x: x.set_index('wm_yr_wk')['sell_price'].to_dict()
    ).to_dict()
    d = calendar_df.d
    wm_yr_wk = calendar_df.wm_yr_wk
    price_data = {}
    # One column per series: the calendar's week code mapped to that series' price.
    # Raises KeyError if a train id has no entry in the price table.
    for col in tqdm(train_df.id.unique()):
        price_data[col] = wm_yr_wk.map(tmp[col])
    price_data = pd.DataFrame(price_data)
    price_data.index = d
    # Days with a known price count as "on sale"; transpose to series x day.
    is_sell = price_data.notnull().astype(float).T
    price_data = price_data.fillna(0)
    
    # NOTE(review): assigning train_df.id as the index assumes the ids are unique
    # and in the same order as train_df.id.unique() above - confirm.
    is_sell.index=train_df.id
    train_df.index=train_df.id
    is_sell = pd.concat([
        train_df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']], is_sell
    ], axis=1)
    price_data = pd.concat([
        train_df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']], price_data.T  
    ], axis=1)
    
    return price_data, is_sell

def set_index(df, name):
    """Build a column-rename mapping for a stacked/reset frame.

    Columns whose label is not a ``str`` are mapped to ``name``. String
    labelled columns are mapped to ``'id'`` when the first value of the
    column is a ``str`` and to ``'d'`` otherwise.

    Args:
        df: non-empty frame to inspect (only its first row is examined).
        name: new label for the non-string-labelled column(s).

    Returns:
        dict mapping every existing column label to its new label.
    """
    first_row = df.iloc[0, :]
    mapping = {}
    for label in first_row.index:
        if type(label) is not str:
            mapping[label] = name
        elif type(df[label].values[0]) is str:
            mapping[label] = 'id'
        else:
            mapping[label] = 'd'
    return mapping

def dcol2int(col):
    """Convert a ``'d_<n>'`` column label to the integer ``n``.

    Labels that do not start with ``'d_'`` are returned unchanged.
    """
    if col[:2] != 'd_':
        return col
    return int(col.replace('d_', ''))
    
def create_event_data(train_df, calendar_df):
    """Build per-day event ratio features for every department and category.

    For each of ``event_name_1`` and ``event_name_2`` the function:
      1. computes the mean of the day columns per ``dept_id`` and per
         ``cat_id`` (rows become days),
      2. within each event value, replaces every day by the mean of the
         earlier days carrying the same event value,
      3. divides that by its own 56-row rolling mean shifted by one row,
      4. sets the ratio to 1 on rows whose event value is missing.

    Args:
        train_df: wide frame whose non-string column labels are the day
            columns, plus ``dept_id`` and ``cat_id``.
        calendar_df: calendar frame addressable with ``.loc`` by those day
            labels and holding ``event_name_1`` / ``event_name_2``.

    Returns:
        DataFrame indexed by the day labels with columns named
        ``f'{event_name}_{col}'``.
    """
    new_df = pd.DataFrame()
    # Day columns are the ones whose label is not a string.
    D_COLS = [d for d in train_df.columns if type(d)!=str]
    for event_name in ['event_name_1', 'event_name_2']:
        # Rows = days; columns = mean per dept, mean per cat, and the event label
        # (missing events are encoded as the string 'NAN').
        tmp_df = pd.concat([
            train_df.groupby(['dept_id'])[D_COLS].mean().T.astype(float),
            train_df.groupby(['cat_id'])[D_COLS].mean().T.astype(float),
            calendar_df.loc[D_COLS,event_name].replace(np.nan, 'NAN')
        ],axis=1)

        dept_id_cols = train_df.dept_id.unique().tolist()
        cat_id_cols = train_df.cat_id.unique().tolist()

        # Within each event value: mean over all *previous* rows with that value
        # (shift(1) excludes the current row; the window spans the whole group).
        tmp_df = pd.concat([
            tmp_df[[event_name]],
            tmp_df.groupby([event_name])[dept_id_cols].transform(
            lambda x: x.shift(1).rolling(len(x), min_periods=1).mean()
            ),
            tmp_df.groupby([event_name])[cat_id_cols].transform(
            lambda x: x.shift(1).rolling(len(x), min_periods=1).mean()
            )
        ], axis=1)

        # Normalise by the 56-row rolling mean of the same column, shifted by one row.
        tmp_df[dept_id_cols] = tmp_df[dept_id_cols]/tmp_df[dept_id_cols].rolling(56, min_periods=1).mean().shift(1)
        tmp_df[cat_id_cols] = tmp_df[cat_id_cols]/tmp_df[cat_id_cols].rolling(56, min_periods=1).mean().shift(1)
        # Rows without an event get the neutral ratio 1.
        tmp_df.loc[tmp_df[event_name]=='NAN', dept_id_cols+cat_id_cols]=1
        
        # Prefix every column (including the event label column itself).
        tmp_df.columns=[f'{event_name}_{col}' for col in tmp_df.columns]
        
        new_df = pd.concat([
            new_df, tmp_df
        ] ,axis=1)
    new_df.index=D_COLS
    return new_df

In [None]:
def create_metadata(path, d_cols, submmit=True):
    """Load the raw CSV files and derive the tables used for feature building.

    Args:
        path: directory prefix (string-concatenated with the file names).
        d_cols: list of integer day indices to keep as day columns.
        submmit: unused in this function.

    Returns:
        Tuple ``(train_df, snap_data, dept_id_price, cat_id_price,
        price_data, event_df, calendar_dict, df)`` where
          * ``train_df`` - id columns + day columns, indexed by ``id``, with
            NaN on days where the series has no sell price,
          * ``snap_data`` - ``snap_CA``/``snap_WI``/``snap_TX`` indexed by ``d``,
          * ``dept_id_price`` / ``cat_id_price`` - price divided by the mean
            price of the (dept, store) / (cat, store) group, day x series,
            zeros replaced by NaN,
          * ``price_data`` - price, day x series, zeros replaced by NaN,
          * ``event_df`` - output of ``create_event_data``,
          * ``calendar_dict`` - ``{'wday': {...}, 'month': {...}}`` keyed by
            the calendar index,
          * ``df`` - series x day matrix, additionally masked (see below).
    """
    train_df = pd.read_csv(path+'sales_train_validation.csv')
    calendar_df = pd.read_csv(path+'calendar.csv')
    sell_prices_df = pd.read_csv(path+'sell_prices.csv')
    # Read but not used anywhere below.
    sample_submission_df = pd.read_csv(path+'sample_submission.csv')

    # Day labels become integers both in the calendar and in the train columns.
    calendar_df['d'] = calendar_df.d.str.replace('d_', '').astype(int)
    cols = train_df.columns
    cols = [dcol2int(col) for col in cols]
    train_df.columns=cols
    calendar_df['date']=pd.to_datetime(calendar_df.date)
    calendar_df.index = calendar_df.d
    price_data, is_sell = create_is_sell_data(sell_prices_df, calendar_df, train_df)
    
    # reindex keeps the id columns and the requested days; days missing from the
    # CSV become all-NaN columns.
    str_cols = [ col for col in train_df.columns if 'id' in str(col)]
    new_columns = str_cols+d_cols
    train_df = train_df.reindex(columns=new_columns)
    
    
    # is_sell: 0 -> NaN, 1 -> 0, so the addition turns days without a price into NaN
    # and leaves the other days unchanged.
    train_df = pd.concat([
        train_df[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']],
        train_df.loc[:,d_cols]+is_sell[d_cols].replace(0, np.nan).replace(1, 0)
    ], axis=1)
    train_df.index = train_df.id
    del is_sell;gc.collect()
    
    # df is day x series here. `a` is NaN wherever the trailing 28-row sum or the
    # leading 28-row sum is 0/NaN and 0 elsewhere, so `df += a` blanks those days
    # (only for d_cols[28:-56]) and keeps the remaining values.
    df = train_df.loc[:,d_cols].T.astype(float)
    a = df.loc[d_cols[28:-56]].rolling(28, min_periods=1).sum().replace(0,np.nan)+df.loc[d_cols[28:-56]][::-1].rolling(28, min_periods=1).sum()[::-1].replace(0,np.nan)
    a[a.notnull()]=0
    df.loc[d_cols[28:-56]] += a
    # Back to series x day.
    df = df.loc[d_cols,:].T.astype(float)
    del a;gc.collect()
    
    #snap_data
    snap_data = calendar_df[['snap_CA', 'snap_WI', 'snap_TX', 'd']]
    snap_data.set_index('d', inplace=True)
    
    #dept_id_price
    # Price relative to the mean price of the same department within the same store.
    dept_id_price = price_data[d_cols]/price_data.groupby(['dept_id', 'store_id'])[d_cols].transform('mean')
    dept_id_price = dept_id_price.T.astype(float)
    #dept_id_price['d'] = dept_id_price.index
    dept_id_price = dept_id_price.replace(0,np.nan)
    
    #cat_id_price
    # Price relative to the mean price of the same category within the same store.
    cat_id_price = price_data[d_cols]/price_data.groupby(['cat_id', 'store_id'])[d_cols].transform('mean')
    cat_id_price = cat_id_price.T.astype(float)
    #cat_id_price['d'] = cat_id_price.index
    cat_id_price = cat_id_price.replace(0,np.nan)
    
    #price_data
    price_data = price_data[d_cols].T
    price_data.replace(0,np.nan, inplace=True)
    #price_data['d']=price_data.index
    
    #event_df
    event_df = create_event_data(train_df, calendar_df)
    #event_df.reset_index(inplace=True)
    
    #calendar_dict
    calendar_dict = calendar_df[['wday', 'month']].to_dict()
    
    return train_df, snap_data, dept_id_price, cat_id_price, price_data, event_df, calendar_dict, df

# feature engineering

In [None]:
def make_roll_data(data, win, agg={'mean', 'std'}):
    """Append lagged rolling statistics of ``TARGET`` computed per ``id``.

    For every series the target is shifted by one row and aggregated over a
    rolling window of ``win`` rows (``min_periods=1``).

    Args:
        data: long-format frame with ``id`` and ``TARGET`` columns.
        win: rolling window length in rows.
        agg: iterable of aggregation names understood by pandas rolling
            ``agg`` (for example ``'mean'``, ``'std'``, ``'skew'``).

    Returns:
        A new frame: ``data`` plus one ``roll_{win}_{name}`` column per
        aggregation, ordered alphabetically by aggregation name.
    """
    # A set has no stable iteration order for strings, so sort the names to
    # make the order of the generated columns reproducible between runs.
    agg_names = sorted(agg)
    # group_keys=False keeps the original row index on the result; without it
    # pandas >= 2.0 prepends the group key and the concat below cannot align.
    data_2 = data.groupby(['id'], group_keys=False)['TARGET'].apply(
        lambda x:
        x.shift(1).rolling(win, min_periods=1).agg(agg_names)
    )
    data_2.columns = [f'roll_{win}_{col}' for col in data_2.columns]
    return pd.concat([data, data_2], axis=1)

def make_diff_data(data, win):
    """Append rolling mean/std of absolute lagged differences of ``TARGET``.

    For every ``id`` the target is shifted by one row, differenced with a
    lag of 1 and of 7 rows, made absolute, and aggregated over a rolling
    window of ``win`` rows (``min_periods=1``).

    Args:
        data: long-format frame with ``id`` and ``TARGET`` columns.
        win: rolling window length in rows.

    Returns:
        A new frame: ``data`` plus ``diff_mean_{win}_{lag}`` and
        ``diff_std_{win}_{lag}`` for ``lag`` in ``(1, 7)``.
    """
    # A list (not a set) keeps the generated column order deterministic.
    stat_names = ['mean', 'std']
    pieces = [data]
    for lag in (1, 7):
        # group_keys=False keeps the original row index so that the concat
        # below aligns row by row on every pandas version.
        diff_data = data.groupby(['id'], group_keys=False)['TARGET'].apply(
            lambda x, lag=lag:
            x.shift(1).diff(lag).abs().rolling(win, min_periods=1).agg(stat_names)
        )
        diff_data.columns = [f'diff_{col}_{win}_{lag}' for col in diff_data.columns]
        pieces.append(diff_data)
    return pd.concat(pieces, axis=1)

def make_shift_data(data):
    """Add 7- and 14-row lags of ``TARGET`` per ``id`` and their row mean.

    ``shift_1`` is the target 7 rows earlier, ``shift_2`` 14 rows earlier and
    ``shift_3`` the mean of the two. ``data`` is modified in place and
    returned.
    """
    base_lag = 7
    for number, extra in enumerate((0, 7), start=1):
        lagged = data.groupby(['id'])['TARGET'].shift(base_lag + extra)
        data[f'shift_{number}'] = lagged
    data['shift_3'] = data[['shift_1', 'shift_2']].mean(axis=1)
    return data

def preprocessing(path,d_cols,test):
    """Build the long-format feature table used by all models.

    Steps, in order:
      1. load the derived tables with ``create_metadata(path, d_cols)``;
      2. rank the series by ``mean sales / number of NaN days`` and pick the
         top 500 per department, top 500 per store and top 2000 overall
         (``ids``); every other series goes to ``ids_2``;
      3. stack the last 200 day columns of ``ids`` and the last 100 day
         columns of ``ids_2`` into rows of ``id``, ``d``, ``TARGET``;
      4. attach id attributes, SNAP flags (with +-1..3 row shifts), relative
         and absolute prices, ``wday``/``month``, and ``event_name_1`` ratios
         (with +-1..3 row shifts);
      5. add rolling, difference and lag features of ``TARGET``;
      6. integer-encode every object column except ``id``.

    Args:
        path: directory prefix of the input CSV files.
        d_cols: list of integer day indices to use.
        test: when truthy, restrict ``train_df`` to its first 2000 ids.

    Returns:
        The long-format feature DataFrame.
    """
    train_df, snap_data, dept_id_price, cat_id_price, price_data, event_df, calendar_dict, df = create_metadata(path, d_cols)
    if test:
        # NOTE(review): only train_df is reduced here; `df` still holds every
        # series when it is concatenated below - confirm this is intended.
        train_df = train_df[train_df.id.isin(train_df.id.unique()[:2000])]
    # Per-series summary over all but the last 28 days: NaN count and mean sales.
    data = pd.concat([
        train_df[['id', 'dept_id', 'store_id']],
        df[d_cols[:-28]].isnull().sum(axis=1),
        df[d_cols[:-28]].mean(1)
    ],axis=1)
    data.columns=['id', 'dept_id', 'store_id', 'null_num_600', 'sell_mean']
    # Higher ratio = more sales and fewer NaN days (zero NaN days gives inf).
    data['sell_mean_null_600'] = data['sell_mean']/data['null_num_600']
    data = data.sort_values('sell_mean_null_600', ascending=False)#.index.tolist()
    
    # Collect index labels of the best-ranked series per department, per store
    # and overall (the index is the series id, as set by create_metadata).
    ids = []
    for dept in ['HOBBIES_1', 'HOBBIES_2', 'HOUSEHOLD_1', 'HOUSEHOLD_2', 'FOODS_1', 'FOODS_2', 'FOODS_3']:
        ids += data[data.dept_id==dept][:500].index.tolist()
    for store in ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']:
        ids += data[data.store_id==store][:500].index.tolist()
    ids += data.index.tolist()[:2000]
    ids = np.unique(ids).tolist()
    ids_2 = data[~data.id.isin(ids)].id.unique().tolist()
    gc.collect()
    
    print('len  ids  ', len(ids))
    print('len  ids_2  ', len(ids_2))
    
    # Wide -> long: 200 most recent days for the selected series, 100 for the rest.
    data = pd.concat([
        train_df[train_df.id.isin(ids)][d_cols[-200:]].stack(dropna=False).reset_index(),
        train_df[train_df.id.isin(ids_2)][d_cols[-100:]].stack(dropna=False).reset_index()
        ], axis=0)
    # Rename the reset_index columns via the set_index() helper
    # ('id' / 'd' / 'TARGET' for a string key, a numeric key and the value column).
    data = data.rename(columns=set_index(data, 'TARGET'))
    data.sort_values('d', inplace=True)
    data.reset_index(drop=True, inplace=True)
    data = reduce_mem_usage(data)
    gc.collect()


    # Attach the categorical attributes of each series through id -> value dicts.
    for key, value in train_df[['dept_id', 'cat_id', 'state_id', 'store_id']].to_dict().items():
        data[key] = data.id.map(value)
    
    
    #snap_data = pd.read_csv(path+'snap_data.csv')
    #snap_data.index=snap_data.d
    #snap_data.drop('d',axis=1, inplace=True)
    
    # SNAP flag of the row's own state on the row's day, 0 when unknown.
    data[f'snap']=0
    for key, value in snap_data.to_dict().items():
        k = key.replace('snap_', '')
        data.loc[data.state_id==k,'snap'] = data.loc[data.state_id==k, 'd'].map(value).fillna(0)
    # Negative shifts pull in later rows of the same series, positive ones earlier rows.
    for shift in [-3,-2,-1,1,2,3]:
        data[f'snap_{shift}'] = data.groupby(['id'])['snap'].shift(shift).fillna(0)


    #dept_id_price = pd.read_csv(path+'dept_id_price.csv')
    #cat_id_price = pd.read_csv(path+'cat_id_price.csv')
    #dept_id_price.index=dept_id_price.d
    #cat_id_price.index=cat_id_price.d
    #dept_id_price = dept_id_price[dept_id_price.d.isin(data.d.unique())].drop('d', axis=1)
    #cat_id_price = cat_id_price[cat_id_price.d.isin(data.d.unique())].drop('d', axis=1)

    # Relative prices: day x series -> long format, then left-merge on (d, id).
    dept_id_price = dept_id_price.stack(dropna=False).reset_index()
    cat_id_price = cat_id_price.stack(dropna=False).reset_index()

    dept_id_price.rename(columns=set_index(dept_id_price, 'dept_id_price'), inplace=True)
    cat_id_price.rename(columns=set_index(cat_id_price, 'cat_id_price'), inplace=True)

    data = pd.merge(
        data, dept_id_price, on=['d', 'id'], how='left'
    )
    data = pd.merge(
        data, cat_id_price, on=['d', 'id'], how='left'
    )


    del dept_id_price,cat_id_price;gc.collect()

    #price_df = pd.read_csv(path+'price_data.csv')
    #price_df.index=price_df.d
    #price_df = price_df[price_df.d.isin(data.d.unique())].drop('d', axis=1)
    # Absolute price, handled the same way.
    price_data = price_data.stack(dropna=False).reset_index()
    price_data.rename(columns=set_index(price_data, 'price'), inplace=True)
    data = pd.merge(
        data, price_data, on=['d', 'id'], how='left'
    )
    del price_data;gc.collect()

    #with open(path+'calendar_dict.pkl', 'rb') as f:
     #   calendar_dict = pickle.load(f)
    # Adds the 'wday' and 'month' columns by mapping the day index.
    for key, value in calendar_dict.items():
        data[key] = data.d.map(value)
    del calendar_dict;gc.collect()

    #event_df = pd.read_csv(path+'event_df.csv')
    #event_df.index=event_df['index']
    #event_df.drop('index', axis=1, inplace=True)

    # Only the event_name_1 ratios are used; key[:12] is 'event_name_1' and
    # key[13:] is the department or category the column belongs to.
    tmp_dic = event_df.to_dict()
    data[f'dept_id_event_name_1']=1
    data[f'cat_id_event_name_1']=1
    for key, value in tmp_dic.items():
        if 'event_name_1' in key:
            if key[13:] in train_df.dept_id.unique().tolist():
                data.loc[data.dept_id==key[13:], f'dept_id_{key[:12]}']=data.loc[data.dept_id==key[13:], 'd'].map(value).fillna(1)
            if key[13:] in train_df.cat_id.unique().tolist():
                data.loc[data.cat_id==key[13:], f'cat_id_{key[:12]}']=data.loc[data.cat_id==key[13:], 'd'].map(value).fillna(1)
    for shift in [-3,-2,-1,1,2,3]:
        for event_name in ['dept_id_event_name_1', 'cat_id_event_name_1']:
            data[f'{event_name}_shift{shift}'] = data.groupby(['id'])[event_name].shift(shift).fillna(1)

    # Remember the columns present so far so the new feature names can be printed.
    cols = data.columns.tolist()
    print(cols)

    data = make_roll_data(data=data,win=28,agg={'mean', 'std', 'skew'})
    data = make_roll_data(data=data,win=7,agg={'mean', 'min', 'max'})
    data = make_roll_data(data,win=56,agg={'std', 'skew'})
    data = make_diff_data(data=data, win=28)
    data = make_diff_data(data=data, win=7)
    data = make_shift_data(data=data)
    gc.collect()

    print([col for col in data.columns if not col in cols])
    
    # Integer-encode every object column except the series id.
    categories = [c for c in data.columns if data[c].dtype==object]
    print(categories)
    for c in categories:
        if c=='id':
            pass
        else:
            data[c] = pd.factorize(data[c])[0]
    
    return data

### reduce_mem_usage

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to smaller dtypes.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    range strictly contains the column's min and max. Other numeric columns
    are cast to float16 or float32 under the same rule, otherwise float64.

    Args:
        df: frame to shrink (modified in place).
        verbose: print the resulting memory usage when truthy.

    Returns:
        The same ``df`` object.
    """
    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    mem_before = df.memory_usage().sum() / 1024**2
    for column in df.columns:
        dtype = df[column].dtypes
        if dtype not in numeric_dtypes:
            continue
        low = df[column].min()
        high = df[column].max()
        if str(dtype)[:3] == 'int':
            # No cast at all when even int64 does not strictly contain the range.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if low > limits.min and high < limits.max:
                    df[column] = df[column].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if low > limits.min and high < limits.max:
                    df[column] = df[column].astype(candidate)
                    break
            else:
                df[column] = df[column].astype(np.float64)
    mem_after = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(mem_after, 100 * (mem_before - mem_after) / mem_before))
    return df

### DF shift 

In [None]:
def shift_seven(data, cols):
    """Shift ``cols`` down by 7 rows within each ``id`` (in place) and return ``data``."""
    #['shift_1', 'shift_2', 'shift_3', 'shift_4']
    lagged = data.groupby(['id'])[cols].shift(7)
    data[cols] = lagged
    return data
    
def shift_one(data,cols):
    """Shift ``cols`` down by 1 row within each ``id`` (in place) and return ``data``."""
    #['roll_28_std', 'roll_28_mean', 'diff_std_1', 'diff_mean_1', 'diff_std_7', 'diff_mean_7']
    lagged = data.groupby(['id'])[cols].shift(1)
    data[cols] = lagged
    return data

# lgb model utils

In [None]:
# Default LightGBM parameter set; used as the default `params` of run_nest_cv.
PARAMS = {
    # NOTE(review): run_nest_cv also passes num_boost_round=3000 to lgb.train;
    # which of the two round counts takes effect should be confirmed.
    'n_estimators':2000,
    'boosting_type': 'gbdt',
    
    # NOTE(review): the training code feeds log1p-transformed targets into this
    # objective and inverts predictions with e**pred - 1 - confirm the pairing.
    'objective': 'poisson',
    'metric': 'rmse',
    
    'subsample': 0.75,
    'subsample_freq': 1,
    'learning_rate': 0.07,
    'feature_fraction': 0.85,
    'max_depth': 15,
    'lambda_l1': 1,  
    'lambda_l2': 1,
    'verbose': 100,
    'random_state':123
}


def plot_importance(models, col, name):
    """Sum gain importances over ``models``, min-max scale them and save a CSV.

    Args:
        models: fitted boosters exposing ``feature_importance``.
        col: feature names, one per importance value.
        name: suffix of the output file ``importance_{name}.csv``.

    Returns:
        DataFrame with columns ``importance`` (scaled) and ``col``.
    """
    gain_total = np.zeros(len(col))
    for booster in models:
        gain_total += booster.feature_importance(importance_type='gain')

    importance = pd.DataFrame()
    importance['importance'] = gain_total
    scaled = minmax_scale(importance.importance)
    importance['importance'] = scaled
    importance['col'] = col

    out_file = f'importance_{name}.csv'
    importance.to_csv(out_file, index=False)
    return importance
    
def run_nest_cv(x_train, y_train, trn_df, params=PARAMS):
    """Train one LightGBM model per fold of a 5-fold split grouped by ``dept_id``.

    Args:
        x_train: feature frame with a default integer index and the columns
            ``cat_id``, ``dept_id``, ``store_id``, ``wday`` and ``month``.
        y_train: target series aligned with ``x_train``.
        trn_df: frame aligned with ``x_train``; receives the out-of-fold
            predictions in ``y_pred`` (modified in place).
        params: LightGBM parameter dict.

    Returns:
        Tuple ``(models, trn_df)``.
    """
    cat_features = ['cat_id', 'dept_id', 'store_id'] + ['wday', 'month']
    splitter = GroupKFold(n_splits=5)
    models = []
    trn_df['y_pred'] = 0

    fold_iter = splitter.split(x_train[['dept_id']], groups=x_train['dept_id'])
    for fit_idx, oof_idx in fold_iter:
        fit_set = lgb.Dataset(x_train.loc[fit_idx, :], y_train.loc[fit_idx])
        oof_set = lgb.Dataset(x_train.loc[oof_idx, :], y_train.loc[oof_idx])

        booster = lgb.train(
            params=params,
            train_set=fit_set,
            valid_sets=[fit_set, oof_set],
            num_boost_round=3000,
            early_stopping_rounds=100,
            verbose_eval=500,
            categorical_feature=cat_features,
        )
        models.append(booster)

        # Predictions are mapped back with e**pred - 1 before being stored.
        oof_raw = booster.predict(x_train.loc[oof_idx, :])
        trn_df.loc[oof_idx, 'y_pred'] = np.e**(oof_raw)-1
        gc.collect()

    return models, trn_df

def predict_cv(x_val, models):
    """Average the back-transformed (``e**pred - 1``) predictions of ``models``.

    Returns a float array of length ``len(x_val)`` (all zeros when ``models``
    is empty).
    """
    n_models = len(models)
    averaged = np.zeros(len(x_val))
    for booster in models:
        raw = booster.predict(x_val)
        averaged += (np.e**raw - 1)/n_models
    return averaged

def show_eval_score(preds, val_df):
    """Print the RMSE of ``preds`` against ``val_df['TARGET']``.

    ``preds`` is also stored in ``val_df['y_pred']`` (in place); ``val_df``
    is returned.
    """
    rmse = np.sqrt(mean_squared_error(val_df['TARGET'], preds))
    val_df['y_pred'] = preds
    print("EVALUATION SCORE : ", rmse)
    return val_df

def split_data(data, trn_day, val_day):
    """Split the feature table into train/validation parts by day.

    Rows with a missing ``shift_2`` are dropped first.

    Args:
        data: long-format feature frame with ``id``, ``d``, ``TARGET``,
            ``state_id`` and ``shift_2`` columns; all other columns must be
            castable to float.
        trn_day: iterable of training day values.
        val_day: iterable of validation day values.

    Returns:
        ``(x_train, x_val, y_train, trn_df, val_df)`` where the ``x_*``
        frames hold float features without ``d``, ``y_train`` is
        ``log1p(TARGET)`` and ``trn_df``/``val_df`` hold ``id``, ``d`` and
        the untransformed ``TARGET``.
    """
    usable = data[data.shift_2.notnull()]

    features = usable.drop(columns=['id',  'TARGET','state_id']).astype(float)
    labels = usable[['d', 'id', 'TARGET']]

    x_train = features[features.d.isin(trn_day)].reset_index(drop=True)
    x_val = features[features.d.isin(val_day)].reset_index(drop=True)
    y_train = labels[labels.d.isin(trn_day)].reset_index(drop=True)
    y_val = labels[labels.d.isin(val_day)].reset_index(drop=True)

    # Bookkeeping frames keep the raw target; only y_train is log-transformed.
    trn_df = y_train[['id', 'd', 'TARGET']]
    val_df = y_val[['id', 'd', 'TARGET']]

    x_train = x_train.drop('d', axis=1)
    x_val = x_val.drop('d', axis=1)
    y_train = np.log1p(y_train['TARGET']).astype(float)
    return x_train, x_val, y_train, trn_df, val_df

def train(data):
    """Train the LightGBM CV models and evaluate them on the last 28 days.

    Rows without a target are dropped, the most recent 28 distinct days form
    the validation set and all earlier days the training set. Feature
    importances are written to ``importance_cv.csv``.

    Args:
        data: long-format feature frame as produced by ``preprocessing``.

    Returns:
        ``(models, val_df, trn_df)``: the fold models, the validation frame
        with averaged predictions in ``y_pred`` and the training frame with
        out-of-fold predictions in ``y_pred``.
    """
    split = 28
    data = data[data.TARGET.notnull()]
    d_cols = sorted(data.d.unique())
    trn_day = d_cols[:-split]
    val_day = d_cols[-split:]

    x_train, x_val, y_train, trn_df, val_df = split_data(data, trn_day, val_day)
    print(x_train.shape, x_val.shape)
    models, trn_df = run_nest_cv(x_train, y_train, trn_df)
    preds = predict_cv(x_val, models)
    val_df = show_eval_score(preds, val_df)
    # plot_importance requires a file-name suffix; the call previously omitted
    # it and raised TypeError.
    plot_importance(models, x_train.columns, 'cv')
    return models, val_df, trn_df

def split_data_for_sub(data):
    """Prepare the full training matrices for a submission model.

    Keeps the rows where ``TARGET``, ``shift_2`` and ``diff_std_7_1`` are all
    present.

    Returns:
        ``(X, y, trn_df)``: float features (without ``id``, ``d``,
        ``TARGET``, ``state_id``), ``log1p(TARGET)`` and a frame with ``id``,
        ``d``, raw ``TARGET`` - all re-indexed from 0.
    """
    complete = (
        data.TARGET.notnull()
        & data.shift_2.notnull()
        & data.diff_std_7_1.notnull()
    )
    rows = data[complete]

    trn_df = rows[['id', 'd', 'TARGET']].reset_index(drop=True)
    y = np.log1p(rows['TARGET']).astype(float).reset_index(drop=True)
    X = rows.drop(columns=['id','d', 'TARGET','state_id']).astype(float)
    X = X.reset_index(drop=True)
    return X, y, trn_df

def train_sub_predict(data, for_predict):
    """Train on all labelled rows and predict one of the last 28 days.

    Args:
        data: long-format feature frame.
        for_predict: 1-based position of the day to predict among the last
            28 distinct values of ``data.d`` (in order of appearance).

    Returns:
        ``(trn_df, sub_df)``: training rows with out-of-fold ``y_pred`` and
        the rows of the predicted day with ``y_pred``.
    """
    seen_days = data.d.unique().tolist()
    predict_day = seen_days[-28:][for_predict-1]
    day_rows = data[data.d==predict_day]

    X, y, trn_df = split_data_for_sub(data)
    print(X.shape)
    models, trn_df = run_nest_cv(X, y, trn_df)
    plot_importance(models, X.columns, str(for_predict))
    day_preds = predict_cv(day_rows[X.columns], models)

    sub_df = day_rows[['id', 'd', 'TARGET']]
    sub_df[f'y_pred'] = day_preds
    return trn_df, sub_df

# linear model utils

In [None]:
def to_onehot_data(data):
    """One-hot encode the categorical columns for the linear models.

    ``state_id`` is dropped; ``cat_id``, ``dept_id``, ``store_id``, ``month``
    and ``wday`` are each replaced by dummy columns named
    ``f'{column}_{int(value)}'`` appended at the end of the frame.

    Returns:
        ``(frame, one_hot_cols)``: the encoded frame and the list of dummy
        column names in creation order.
    """
    frame = data.drop(columns=['state_id'])
    one_hot_cols = []
    for cat in ('cat_id', 'dept_id', 'store_id', 'month', 'wday'):
        new_names = {level: f'{cat}_{int(level)}' for level in frame[cat].unique()}
        dummies = pd.get_dummies(frame[cat]).rename(columns=new_names)
        one_hot_cols.extend(dummies.columns.tolist())
        frame = pd.concat([frame.drop(cat, axis=1), dummies], axis=1)
    return frame, one_hot_cols

def data_split_lin(data, trn_days, val_days):
    """Drop incomplete rows and split the frame into train/validation by day.

    Rows containing any NaN are removed from ``data`` in place.

    Args:
        data: frame with ``id``, ``d`` and ``TARGET`` columns.
        trn_days: iterable of training day values.
        val_days: iterable of validation day values.

    Returns:
        ``(train, val, trn_df, val_df)``: the two splits re-indexed from 0
        and their ``id``/``d``/``TARGET`` sub-frames.
    """
    # `axis` must be passed by keyword: positional arguments to dropna were
    # removed in pandas 2.0.
    data.dropna(axis=0, inplace=True)
    train = data[data.d.isin(trn_days)].reset_index(drop=True)
    val = data[data.d.isin(val_days)].reset_index(drop=True)
    trn_df = train[['id', 'd', 'TARGET']]
    val_df = val[['id', 'd', 'TARGET']]
    return train, val, trn_df, val_df

def linear_cv(data, trn_df):
    """Fit Ridge and Lasso models with 5-fold stratified CV.

    ``data`` is re-indexed from 0 in place and gains zero-filled
    ``ridge_preds``/``lasso_preds`` columns. The out-of-fold predictions are
    written into ``trn_df``.

    Args:
        data: frame with ``id``, ``d``, ``TARGET`` and feature columns.
        trn_df: frame aligned with ``data`` that receives the predictions.

    Returns:
        ``(models, trn_df)`` where ``models`` maps ``'ridge'`` and
        ``'lasso'`` to the list of fitted fold models.
    """
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)
    models = {'ridge': [], 'lasso': []}

    data.reset_index(drop=True, inplace=True)
    X = data.drop(columns=['id', 'd', 'TARGET'])
    y = data['TARGET']
    data['ridge_preds'] = 0
    data['lasso_preds'] = 0

    for fit_idx, oof_idx in splitter.split(data, y=y):
        X_fit, y_fit = X.loc[fit_idx, :], y.loc[fit_idx]

        ridge = Ridge()
        lasso = Lasso()
        ridge.fit(X_fit, y_fit)
        lasso.fit(X_fit, y_fit)
        models['ridge'].append(ridge)
        models['lasso'].append(lasso)

        trn_df.loc[oof_idx, 'ridge_preds'] = ridge.predict(X.loc[oof_idx, :])
        trn_df.loc[oof_idx, 'lasso_preds'] = lasso.predict(X.loc[oof_idx, :])

    return models, trn_df

def cv_predict_lin(data, models):
    """Average the predictions of ``models`` on ``data``.

    The ``id``, ``d`` and ``TARGET`` columns are removed before predicting.
    Returns a float array of length ``len(data)``.
    """
    n_models = len(models)
    averaged = np.zeros(len(data))
    for fitted in models:
        features = data.drop(columns=['id', 'd', 'TARGET'])
        averaged += fitted.predict(features) /n_models
    return averaged
    
def linear_predict(models, X, val_df):
    """Store the averaged prediction of every model family in ``val_df``.

    For each key ``name`` of ``models`` a column ``f'{name}_preds'`` is
    written into ``val_df`` (in place); ``val_df`` is returned.
    """
    for family in models:
        val_df[f'{family}_preds'] = cv_predict_lin(X, models[family])
    return val_df

def train_lin(data, trn_days, val_days):
    """Train the linear CV models and predict the validation days.

    Args:
        data: long-format feature frame (before one-hot encoding).
        trn_days: iterable of training day values.
        val_days: iterable of validation day values.

    Returns:
        ``(val_df, trn_df)``: validation rows with averaged ``ridge_preds`` /
        ``lasso_preds`` and training rows with out-of-fold predictions.
    """
    # to_onehot_data returns (frame, one_hot_cols); only the frame is needed
    # here. Passing the whole tuple on made data_split_lin fail.
    X, _ = to_onehot_data(data)
    train, val, trn_df, val_df = data_split_lin(X, trn_days, val_days)
    print(train.shape, val.shape)
    models, trn_df = linear_cv(train, trn_df)
    val_df = linear_predict(models, val, val_df)
    return val_df, trn_df

def train_lin_sub(data, for_predict):
    """Train the linear models on all complete rows and predict one day.

    Args:
        data: long-format feature frame (before one-hot encoding).
        for_predict: 1-based position of the day to predict among the last
            28 distinct values of ``data.d`` (in order of appearance).

    Returns:
        ``(val_df, trn_df, one_hot_cols)``: rows of the predicted day with
        ``ridge_preds``/``lasso_preds``, training rows with out-of-fold
        predictions, and the names of the one-hot columns.
    """
    predict_day = data.d.unique()[-28:][for_predict-1]
    X, one_hot_cols = to_onehot_data(data)
    # Take the rows to predict before dropping rows that contain NaN.
    predict_sub_df = X[X.d==predict_day]

    # `axis` must be passed by keyword: positional arguments to dropna were
    # removed in pandas 2.0.
    X.dropna(axis=0, inplace=True)
    X.reset_index(drop=True, inplace=True)
    predict_sub_df.reset_index(drop=True, inplace=True)

    trn_df = X[['id', 'd', 'TARGET']+one_hot_cols]
    val_df = predict_sub_df[['id', 'd', 'TARGET']+one_hot_cols]

    models, trn_df = linear_cv(X, trn_df)
    val_df = linear_predict(models, predict_sub_df, val_df)
    return val_df, trn_df, one_hot_cols

# BayesianRidge stacking (no SVM model is defined in this section)

In [None]:
def cv_bayesianRidge(trn_df ,val_df, one_hot_cols):
    """Stack the base predictions with a 5-fold BayesianRidge.

    The stacker is trained on ``y_pred``, ``ridge_preds``, ``lasso_preds``
    and the one-hot columns of the rows whose ``ridge_preds`` is present.

    Args:
        trn_df: training rows with ``id``, ``d``, ``TARGET`` and the inputs.
        val_df: rows to predict; gains ``br_pred``/``br_std`` in place
            (average over the five fold models).
        one_hot_cols: names of the one-hot feature columns.

    Returns:
        ``(trn_df, val_df)`` where ``trn_df`` is outer-merged with the
        out-of-fold ``br_pred``/``br_std`` on ``id`` and ``d``.
    """
    n_folds = 5
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2020)
    stack_inputs = ['y_pred','ridge_preds', 'lasso_preds']+one_hot_cols

    has_linear = trn_df.ridge_preds.notnull()
    X = trn_df[has_linear][['id', 'd']+stack_inputs]
    y = trn_df[has_linear]['TARGET']
    for out_col in ('br_pred', 'br_std'):
        X[out_col] = 0
        val_df[out_col] = 0
    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)

    for fit_idx, oof_idx in splitter.split(X, y=y):
        br = BayesianRidge()
        br.fit(X.loc[fit_idx, stack_inputs], y.loc[fit_idx])

        oof_mean, oof_std = br.predict(X.loc[oof_idx, stack_inputs], return_std=True)
        X.loc[oof_idx, 'br_pred'] = oof_mean
        X.loc[oof_idx, 'br_std'] = oof_std

        val_mean, val_std = br.predict(val_df[stack_inputs], return_std=True)
        val_df['br_pred'] += val_mean/n_folds
        val_df['br_std'] += val_std/n_folds

    trn_df = pd.merge(trn_df, X[['id', 'd','br_pred', 'br_std']], how='outer', on=['id', 'd'])
    return trn_df, val_df

# Pipeline

In [None]:
def train_cv_pipeline(data):
    """Run the LightGBM and linear CV trainings and join their predictions.

    The linear models are trained without the ``roll_28_skew`` and
    ``roll_56_skew`` columns.

    Returns:
        ``(val_df, trn_df)``: validation and training frames outer-merged on
        ``id``, ``TARGET`` and ``d``.
    """
    join_keys = ['id', 'TARGET', 'd']
    models, val_df, trn_df = train(data)

    trn_days = trn_df.d.unique().tolist()
    val_days = val_df.d.unique().tolist()
    lin_data = data.drop(columns=['roll_28_skew', 'roll_56_skew'])
    val_df_lin, trn_df_lin = train_lin(lin_data, trn_days, val_days)

    val_df = pd.merge(val_df, val_df_lin, how='outer', on=join_keys)
    trn_df = pd.merge(trn_df, trn_df_lin, how='outer', on=join_keys)
    return val_df, trn_df

def predict_sub_pipeline(data, for_predict):
    """Train the LightGBM and linear models and predict one submission day.

    The linear models are trained without the ``roll_28_skew`` and
    ``roll_56_skew`` columns.

    Returns:
        ``(sub_df, trn_df, one_hot_cols)``: prediction-day and training
        frames outer-merged on ``id``, ``TARGET`` and ``d``, plus the names
        of the one-hot columns.
    """
    join_keys = ['id', 'TARGET', 'd']
    trn_df, sub_df = train_sub_predict(data, for_predict)

    lin_data = data.drop(columns=['roll_28_skew', 'roll_56_skew'])
    sub_df_lin, trn_df_lin, one_hot_cols = train_lin_sub(lin_data, for_predict)

    sub_df = pd.merge(sub_df, sub_df_lin, how='outer', on=join_keys)
    trn_df = pd.merge(trn_df, trn_df_lin, how='outer', on=join_keys)
    return sub_df,trn_df,one_hot_cols


def all_predict_run(data, for_predict):
    """Run the full model stack for one forecast day and save the result.

    Writes ``sub_{for_predict}.csv`` with the columns ``id``, ``d``,
    ``TARGET``, ``y_pred``, ``br_pred`` and ``br_std`` and prints the
    elapsed time in minutes.
    """
    print(F"""
    ###################################
               TRAIN {for_predict}
    ###################################
    """)

    started = time()
    sub_df,trn_df,one_hot_cols = predict_sub_pipeline(data, for_predict)
    trn_df, sub_df = cv_bayesianRidge(trn_df ,sub_df, one_hot_cols)
    #trn_df.to_csv(f'trn_{for_predict}.csv', index=False)
    out_cols = ['id', 'd', 'TARGET', 'y_pred','br_pred', 'br_std']
    sub_df[out_cols].to_csv(f'sub_{for_predict}.csv', index=False)
    s = (time()-started)/60
    print(f'TIME :  {s :.4}  \n')
    
def sub_cycle(path, d_cols, stop=None, test=False):
    """Build the features once and forecast the days one after another.

    Day 1 is predicted from the features as built. Before each later day the
    rolling/difference features are shifted by one more row per ``id``, and
    the 7-day lag features by seven more rows whenever
    ``(for_predict-1) % 7 == 0``.

    Args:
        path: directory prefix of the input CSV files.
        d_cols: list of integer day indices to use.
        stop: exclusive upper bound of the forecast-day loop; ``None`` runs
            days 2..28.
        test: forwarded to ``preprocessing``.
    """
    data = preprocessing(path,d_cols,test=test)

    shift_seven_cols = ['shift_1', 'shift_2', 'shift_3']
    shift_one_cols = ['roll_28_skew', 'roll_28_mean', 'roll_28_std', 'roll_7_max',
                      'roll_7_mean', 'roll_7_min', 'roll_56_skew', 'roll_56_std', 
                      'diff_mean_28_1', 'diff_std_28_1', 'diff_mean_28_7', 'diff_std_28_7',
                      'diff_mean_7_1', 'diff_std_7_1', 'diff_mean_7_7', 'diff_std_7_7']

    all_predict_run(data=data, for_predict=1)
    print('\n')

    last_exclusive = 29 if stop is None else stop
    for for_predict in range(2, last_exclusive):
        if (for_predict-1)%7==0:
            data = shift_seven(data, shift_seven_cols)
        data = shift_one(data, shift_one_cols)
        all_predict_run(data=data, for_predict=for_predict)

In [None]:
%%time
# Entry cell: runs the whole forecasting cycle on the competition input files.
#path = '/Users/kanoumotoharu/Downloads/m5-forecasting-accuracy/'
#path = '/Users/abcdm/Downloads/m5-forecasting-accuracy/'
path = '../input/m5-forecasting-accuracy/'

# Day indices 1..1941; only the last 1000 of them are passed to the pipeline.
d_cols=[i+1 for i in range(1941)]
#sub_cycle(path=path, d_cols=d_cols[-1000:],stop=2, test=False)
sub_cycle(path=path, d_cols=d_cols[-1000:], test=False)