<a href="https://colab.research.google.com/github/GavinLi2/Gavin.github.io/blob/master/Main_Training_Codes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# %load ML_diplay.py
"""
机器学习建模
"""
import pandas as pd
import numpy as np
import sys
import os
import pickle
import lightgbm as lgb
from  datetime import datetime, timedelta
import random
import gc
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Mount Google Drive into the Colab VM; the data paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Make the project folder the working directory so that the relative CSV/pickle
# paths used by later cells resolve against it.
path = "/content/drive/My Drive/1 - Walmart M5 Accuracy" 
os.chdir(path)

In [0]:
from typing import Union
from tqdm.auto import tqdm as tqdm

class WRMSSEEvaluator(object):
    '''
    Weighted RMSSE scorer over hierarchical aggregations of bottom-level sales.

    The bottom-level rows are summed along every entry of ``group_ids``. Each
    aggregated series gets a weight (``get_weight_df``) and a scale
    (``get_scale``); ``score`` combines the per-series RMSSE with the weights.
    '''
    
    # Aggregation levels: a string groups by one column, a list by several columns.
    group_ids = ( 'all_id', 'state_id', 'store_id', 'cat_id', 'dept_id', 'item_id',
        ['state_id', 'cat_id'],  ['state_id', 'dept_id'], ['store_id', 'cat_id'],
        ['store_id', 'dept_id'], ['item_id', 'state_id'], ['item_id', 'store_id'])

    def __init__(self, 
                 train_df: pd.DataFrame, 
                 valid_df: pd.DataFrame, 
                 calendar: pd.DataFrame, 
                 prices: pd.DataFrame):
        '''
        Aggregate the train/validation frames and precompute weights and scales.

        Args:
            train_df: wide frame with id columns and ``d_*`` day columns.
                NOTE: an ``all_id`` column is added to this frame in place.
            valid_df: wide frame with the ``d_*`` columns to score against; the
                id columns are copied over from ``train_df`` when missing.
            calendar: frame providing the ``d`` -> ``wm_yr_wk`` mapping.
            prices: frame with ``item_id``, ``store_id``, ``wm_yr_wk`` and
                ``sell_price``.
        '''
        self.calendar = calendar
        self.prices = prices
        self.train_df = train_df
        self.valid_df = valid_df
        self.train_target_columns = [i for i in self.train_df.columns if i.startswith('d_')]
        # Must be read before 'all_id' is appended: the weights use the last 28 columns.
        self.weight_columns = self.train_df.iloc[:, -28:].columns.tolist()

        self.train_df['all_id'] = "all"

        self.id_columns = [i for i in self.train_df.columns if not i.startswith('d_')]
        self.valid_target_columns = [i for i in self.valid_df.columns if i.startswith('d_')]

        if not all([c in self.valid_df.columns for c in self.id_columns]):
            self.valid_df = pd.concat([self.train_df[self.id_columns], self.valid_df],
                                      axis=1, 
                                      sort=False)
        self.train_series = self.trans_30490_to_42840(self.train_df, 
                                                      self.train_target_columns, 
                                                      self.group_ids)
        self.valid_series = self.trans_30490_to_42840(self.valid_df, 
                                                      self.valid_target_columns, 
                                                      self.group_ids)
        self.weights = self.get_weight_df()
        self.scale = self.get_scale()
        # Only the weights, scales and validation series are needed for scoring;
        # drop the references to the large inputs.
        self.train_series = None
        self.train_df = None
        self.prices = None
        self.calendar = None

    def get_scale(self):
        '''
        scaling factor for each series ignoring starting zeros

        Returns the mean squared one-step difference of every row of
        ``self.train_series`` as a 1-D array, in row order.
        '''
        scales = []
        for i in tqdm(range(len(self.train_series))):
            series = self.train_series.iloc[i].values
            # argmax of the boolean mask is the position of the first non-zero value
            series = series[np.argmax(series!=0):]
            scale = ((series[1:] - series[:-1]) ** 2).mean()
            scales.append(scale)
        return np.array(scales)
    
    def get_name(self, i):
        '''
        convert a str or list of strings to unique string 
        used for naming each of 42840 series

        Scalars (str, int and NumPy integers) are returned as ``str(i)``; any
        other iterable is joined with ``--`` after converting each part to str,
        so integer-coded group keys work as well.
        '''
        if isinstance(i, (str, int, np.integer)):
            return str(i)
        return "--".join(str(part) for part in i)
    
    def get_weight_df(self) -> pd.DataFrame:
        """
        returns weights for each of 42840 series in a dataFrame

        The weight of a series is its share of sales * sell_price over
        ``self.weight_columns`` within its aggregation level, divided by the
        number of levels.
        """
        day_to_week = self.calendar.set_index("d")["wm_yr_wk"].to_dict()
        weight_df = self.train_df[["item_id", "store_id"] + self.weight_columns].set_index(
            ["item_id", "store_id"]
        )
        # Long format: one row per (item, store, day)
        weight_df = (
            weight_df.stack().reset_index().rename(columns={"level_2": "d", 0: "value"})
        )
        weight_df["wm_yr_wk"] = weight_df["d"].map(day_to_week)
        weight_df = weight_df.merge(
            self.prices, how="left", on=["item_id", "store_id", "wm_yr_wk"]
        )
        # units sold -> revenue
        weight_df["value"] = weight_df["value"] * weight_df["sell_price"]
        weight_df = weight_df.set_index(["item_id", "store_id", "d"]).unstack(level=2)[
            "value"
        ]
        # Restore the row order of train_df so the id columns line up in the concat below
        weight_df = weight_df.loc[
            zip(self.train_df.item_id, self.train_df.store_id), :
        ].reset_index(drop=True)
        weight_df = pd.concat(
            [self.train_df[self.id_columns], weight_df], axis=1, sort=False
        )
        weights_map = {}
        for group_id in tqdm(self.group_ids, leave=False):
            lv_weight = weight_df.groupby(group_id)[self.weight_columns].sum().sum(axis=1)
            lv_weight = lv_weight / lv_weight.sum()
            for key, value in lv_weight.items():
                weights_map[self.get_name(key)] = np.array([value])
        weights = pd.DataFrame(weights_map).T / len(self.group_ids)

        return weights

    def trans_30490_to_42840(self, df, cols, group_ids, dis=False):
        '''
        transform 30490 sries to all 42840 series

        Sums ``cols`` of ``df`` within every grouping in ``group_ids`` and stacks
        the aggregated rows into one frame indexed by ``get_name`` of the group
        key. ``dis`` disables the progress bar.
        '''
        series_map = {}
        for group_id in tqdm(group_ids, leave=False, disable=dis):
            tr = df.groupby(group_id)[cols].sum()
            for key, row in zip(tr.index, tr.values):
                series_map[self.get_name(key)] = row
        return pd.DataFrame(series_map).T
    
    def get_rmsse(self, valid_preds) -> pd.Series:
        '''
        returns rmsse scores for all 42840 series

        ``valid_preds`` must be aggregated like ``self.valid_series``.
        '''
        score = ((self.valid_series - valid_preds) ** 2).mean(axis=1)
        rmsse = (score / self.scale).map(np.sqrt)
        return rmsse

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        '''
        Return the WRMSSE of bottom-level predictions.

        ``valid_preds`` must have the same shape as the validation target
        columns, with rows in the same order as the frames given to ``__init__``.
        The per-series values are kept in ``self.rmsse`` and ``self.contributors``.
        '''
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds],
                                axis=1, 
                                sort=False)
        valid_preds = self.trans_30490_to_42840(valid_preds, 
                                                self.valid_target_columns, 
                                                self.group_ids, 
                                                True)
        self.rmsse = self.get_rmsse(valid_preds)
        # weight * rmsse per series
        self.contributors = pd.concat([self.weights, self.rmsse], 
                                      axis=1, 
                                      sort=False).prod(axis=1)
        return np.sum(self.contributors)

In [0]:
def create_train_data(train_start=750,test_start=1800,is_train=True):
    """Load the three M5 CSV files and build one long-format sales table.

    Reads 'sell_prices.csv', 'calendar.csv' and 'sales_train_validation.csv'
    from the current working directory, integer-encodes the categorical
    columns, melts the daily ``d_*`` columns into rows and merges the calendar
    and price information (default pandas merges).

    Args:
        train_start: first day number loaded when ``is_train`` is True.
        test_start: first day number loaded when ``is_train`` is False.
        is_train: when False, the day columns d_1914..d_1969 are appended with
            NaN sales so they can be filled in later by a caller.

    Returns:
        A DataFrame with one row per (id, day): the id columns, ``d``,
        ``sales``, the calendar and price columns (minus ``wm_yr_wk`` and
        ``weekday``) and any date feature from ``date_features`` that was not
        already present.
    """
    # Basic parameters: explicit dtypes for the CSV loads
    PRICE_DTYPES = {"store_id": "category", "item_id": "category", "wm_yr_wk": "int16","sell_price":"float32" }
    CAL_DTYPES={"event_name_1": "category", "event_name_2": "category", "event_type_1": "category", 
            "event_type_2": "category", "weekday": "category", 'wm_yr_wk': 'int16', "wday": "int16",
            "month": "int8", "year": "int16", "snap_CA": "int8", 'snap_TX': 'int8', 'snap_WI': 'int8' }

    start_day = train_start if is_train else test_start
    # Day columns d_{start_day} .. d_1913
    numcols = [f"d_{day}" for day in range(start_day,1914)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    SALE_DTYPES = {numcol: "float32" for numcol in numcols} 
    SALE_DTYPES.update({col: "category" for col in catcols if col != "id"})

    # Load the price data
    price_data = pd.read_csv('sell_prices.csv',dtype=PRICE_DTYPES)
    # Load the calendar data
    cal_data = pd.read_csv('calendar.csv',dtype=CAL_DTYPES)
    # Load the sales data
    sale_data = pd.read_csv('sales_train_validation.csv',dtype=SALE_DTYPES,usecols=catcols+numcols)

    # Convert category labels to integer codes
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            price_data[col] = price_data[col].cat.codes.astype("int16") 
            # ^ .cat exposes the categorical accessor; .codes returns the integer code of each value as a Series, turning the labels into numbers
            price_data[col] -= price_data[col].min() # shift so the labels start at 0

    cal_data["date"] = pd.to_datetime(cal_data["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            cal_data[col] = cal_data[col].cat.codes.astype("int16")
            cal_data[col] -= cal_data[col].min()

    for col in catcols:
        if col != "id":
            sale_data[col] = sale_data[col].cat.codes.astype("int16")
            sale_data[col] -= sale_data[col].min()

    if not is_train:
        # Placeholder columns for the 2*28 days after d_1913
        for day in range(1913+1, 1913+ 2*28 +1):
            sale_data[f"d_{day}"] = np.nan
    # melt splits the data into three parts: (1) the id-like columns; (2) a variable column holding the names of the melted columns; (3) a value column holding the matching values
    sale_data = pd.melt(sale_data,
            id_vars = catcols,
            value_vars = [col for col in sale_data.columns if col.startswith("d_")],
            var_name = "d",
            value_name = "sales")
    sale_data = sale_data.merge(cal_data, on= "d", copy = False)
    sale_data = sale_data.merge(price_data, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

    # Date features
    # Features that are not already columns are derived from the datetime accessor
    date_features = {
            "wday": "weekday",
            "week": "weekofyear",
            "month": "month",
            "quarter": "quarter",
            "year": "year",
            "mday": "day",}

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in sale_data.columns:
            pass
        else:
            sale_data[date_feat_name] = getattr(sale_data["date"].dt, date_feat_func).astype("int16")

    sale_data.drop(["wm_yr_wk", "weekday"],axis=1,inplace=True)

    return sale_data


def create_feature(sale_data, is_train=True, day=None):
    """Add lag and rolling-mean sales features to ``sale_data`` in place.

    Columns ``lag_7``/``lag_28`` hold the sales 7/28 days earlier and
    ``rmean_{lag}_{win}`` the mean of the lagged sales over ``win`` days.

    Args:
        sale_data: long-format frame with ``id``, ``date`` and ``sales``.
        is_train: True computes the features for every row with per-id
            shift/rolling; False only fills the rows whose date equals ``day``.
        day: the date to fill when ``is_train`` is False.

    Returns:
        The same ``sale_data`` object with the feature columns added.
    """
    lags = [7, 28]
    wins = [7, 28]

    if is_train:
        # Lagged sales per series
        for lag in lags:
            per_id_sales = sale_data[["id", "sales"]].groupby("id")["sales"]
            sale_data[f"lag_{lag}"] = per_id_sales.shift(lag)

        # Rolling means of the lag columns (window-major column order)
        for win in wins:
            def window_mean(values, size=win):
                return values.rolling(size).mean()

            for lag in lags:
                source = f"lag_{lag}"
                rolled = sale_data[["id", source]].groupby("id")[source].transform(window_mean)
                sale_data[f"rmean_{lag}_{win}"] = rolled.astype('float32')
        return sale_data

    # Inference: only the rows of `day` are filled, read straight from the
    # earlier dates instead of shifting the whole frame.
    on_day = sale_data.date == day
    for lag in lags:
        source_rows = sale_data.date == day - timedelta(days=lag)
        sale_data.loc[on_day, f"lag_{lag}"] = sale_data.loc[source_rows, 'sales'].values

    for win in wins:
        for lag in lags:
            newest = day - timedelta(days=lag)
            oldest_exclusive = day - timedelta(days=lag + win)
            in_window = (sale_data.date <= newest) & (sale_data.date > oldest_exclusive)
            window_means = sale_data[in_window].groupby("id")["sales"].mean()
            day_ids = sale_data.loc[sale_data.date == day, 'id']
            aligned = window_means.reindex(day_ids)
            sale_data.loc[sale_data.date == day, f"rmean_{lag}_{win}"] = aligned.astype('float32').values
    return sale_data


def train_model(train_data,valid_data,params_override=None):
    """Train a LightGBM model with the notebook's default hyper-parameters.

    Args:
        train_data: ``lgb.Dataset`` used for fitting.
        valid_data: ``lgb.Dataset`` evaluated alongside the training set.
        params_override: optional dict merged over the defaults below, so an
            experiment can change a few values without redefining the function.

    Returns:
        Whatever ``lgb.train`` returns (the trained booster).
    """
    params = {"objective" : "tweedie",
              "metric" :"rmse",
              "seed" : 666,
              "force_row_wise" : True,
              "learning_rate" : 0.075,
              "sub_feature" : 0.8,
              "sub_row" : 0.75,
              "bagging_freq" : 1,
              "lambda_l2" : 0.1,
              "nthread": 8,
              "tweedie_variance_power":1.2,
              'verbosity': 1,
              'num_iterations' : 1500,
              'num_leaves': 128,
              "min_data_in_leaf": 104,
              'early_stopping_rounds' : 100
              }
    if params_override:
        params.update(params_override)

    # The metric is logged every 50 rounds for both the training and the validation set.
    m_lgb = lgb.train(params, train_data, valid_sets = [train_data, valid_data], verbose_eval=50)

    return m_lgb


def predict_ensemble(train_cols,m_lgb):
    """Forecast 28 days recursively and write the submission file.

    Starting at 2016-04-25, each day is predicted from features built on the
    previous days (including earlier predictions), once per multiplier in
    ``alphas``; the runs are averaged with equal weights. The result is written
    to 'submission_v13.csv' with the validation rows followed by a copy whose
    ids end in 'evaluation'.

    Args:
        train_cols: feature columns, in the order the model was trained on.
        m_lgb: trained model exposing ``predict``.
    """
    date = datetime(2016,4,25) 
    # Multipliers applied to the raw predictions
    alphas = [1.035, 1.03, 1.025]
    weights = [1/len(alphas)]*len(alphas)
    sub = 0.

    test_data = create_train_data(is_train=False)

    for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

        test_data_c = test_data.copy()
        cols = [f"F{i}" for i in range(1,29)]


        for i in range(0, 28):
            day = date + timedelta(days=i)
            print(i, day)
            # Only the trailing 57 days are passed to the feature builder.
            # NOTE(review): this must cover the largest lag + window used by
            # create_feature; confirm whenever the feature definition changes.
            tst = test_data_c[(test_data_c.date >= day - timedelta(days=57)) & (test_data_c.date <= day)].copy()
            tst = create_feature(tst,is_train=False, day=day)
            tst = tst.loc[tst.date == day , train_cols]
            # Write the prediction back so later days can use it as a lag value
            test_data_c.loc[test_data_c.date == day, "sales"] = alpha*m_lgb.predict(tst)

        # Reshape to the submission layout: one row per id, columns F1..F28
        test_sub = test_data_c.loc[test_data_c.date >= date, ["id", "sales"]].copy()
        test_sub["F"] = [f"F{rank}" for rank in test_sub.groupby("id")["id"].cumcount()+1]
        test_sub = test_sub.set_index(["id", "F"]).unstack()["sales"][cols].reset_index()
        test_sub.fillna(0., inplace = True)
        test_sub.sort_values("id", inplace = True)
        test_sub.reset_index(drop=True, inplace = True)
        # test_sub.to_csv(f"submission_{icount}.csv",index=False)
        if icount == 0 :
            sub = test_sub
            sub[cols] *= weight
        else:
            sub[cols] += test_sub[cols]*weight
        print(icount, alpha, weight)
    
    sub2 = sub.copy()
    # Duplicate the validation rows as evaluation rows. "validation$" is a regular
    # expression, so regex=True is passed explicitly: newer pandas treats the
    # pattern as a literal by default and would leave the ids unchanged.
    sub2["id"] = sub2["id"].str.replace("validation$", "evaluation", regex=True)
    sub = pd.concat([sub, sub2], axis=0, sort=False)
    sub.to_csv("submission_v13.csv",index=False)

In [0]:
# 辅助函数
# 1.内存优化
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Every column except 'sales' whose dtype is listed in ``numerics`` is cast
    to the first candidate dtype whose range contains the column's min and max.
    Integers try uint8, uint16, int32, int64; floats try float16, float32 and
    otherwise become float64.

    Args:
        df: frame to shrink; it is modified and also returned.
        verbose: print the resulting size and the percentage saved.

    Returns:
        The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    megabyte = 1024**2
    start_mem = df.memory_usage().sum() / megabyte

    for col in df.columns:
        if col == 'sales':
            continue
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min, c_max = df[col].min(), df[col].max()
        if str(col_type)[:3] == 'int':
            # Unsigned candidates use inclusive bounds, signed ones strict bounds.
            target = next(
                (t for t in (np.uint8, np.uint16)
                 if c_min >= np.iinfo(t).min and c_max <= np.iinfo(t).max),
                None)
            if target is None:
                target = next(
                    (t for t in (np.int32, np.int64)
                     if c_min > np.iinfo(t).min and c_max < np.iinfo(t).max),
                    None)
        else:
            target = next(
                (t for t in (np.float16, np.float32)
                 if c_min > np.finfo(t).min and c_max < np.finfo(t).max),
                np.float64)

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / megabyte
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

# 2. RMSSE evaluation
# Load the cached evaluation objects. They become the default arguments of
# `evaluate` below, so they are bound once, when that function is defined.
# NOTE(review): pickle.load executes code embedded in the file; only load
# pickles that were produced by this project.
with open('/content/drive/My Drive/1 - Walmart M5 Accuracy/Evaluation/submission_index.pkl','rb') as f:
    submission_index = pickle.load(f)
with open('/content/drive/My Drive/1 - Walmart M5 Accuracy/Evaluation/valid_id.pkl','rb') as f:
    valid_pred = pickle.load(f)
with open('/content/drive/My Drive/1 - Walmart M5 Accuracy/Evaluation/evaluate.pkl','rb') as f:
    e = pickle.load(f)
def evaluate(model,valid_sets,valid_pred=valid_pred,e=e,submission_index=submission_index,is_early_stopping=False):
    """Score ``model`` on ``valid_sets`` with the evaluator ``e``.

    Args:
        model: trained model exposing ``predict`` (and ``best_iteration`` when
            ``is_early_stopping`` is True).
        valid_sets: feature matrix; its rows must line up with ``valid_pred``.
        valid_pred: long frame with ``id`` and ``d`` columns that receives the
            predictions. The frame passed in is not modified.
        e: evaluator exposing ``score``. The default is bound when this
            function is defined, so rebinding the global ``e`` later has no
            effect on calls that rely on the default.
        submission_index: id order expected by ``e.score``.
        is_early_stopping: predict with ``model.best_iteration``.

    Returns:
        The value returned by ``e.score``.
    """
    if is_early_stopping:
        predictions = model.predict(valid_sets,num_iteration=model.best_iteration)
    else:
        predictions = model.predict(valid_sets)
    # assign() returns a new frame, so the shared default `valid_pred` object is
    # not modified between calls.
    scored = valid_pred.assign(sales=predictions)
    # Wide layout (one row per id, one column per day), rows in submission order
    wide = scored.set_index(['id','d']).unstack()['sales']
    wide = wide.reindex(submission_index).values
    score = e.score(wide)
    return score

# 3.Seed
def seed_all(seed=666):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# 4.FI Test
def fi_test(model,valid_sets,base,test_cols=None,is_early_stopping=False):
    """Print permutation feature importances measured with ``evaluate``.

    For each tested column a copy of ``valid_sets`` is made, that column is
    shuffled, and ``base`` minus the score of the shuffled copy is printed.

    Args:
        model: trained model forwarded to ``evaluate``.
        valid_sets: validation feature frame; it is not modified.
        base: score of the unshuffled data.
        test_cols: columns to test; every column of ``valid_sets`` when falsy.
        is_early_stopping: forwarded to ``evaluate``.
    """
    print('Base WRMSSE:',base)
    print('-'*5,'Permutation Features Importance','-'*5)
    columns_to_test = test_cols if test_cols else valid_sets.columns
    for col in columns_to_test:
        shuffled = valid_sets.copy()
        shuffled[col] = np.random.permutation(shuffled[col].values)
        permutated = evaluate(model,shuffled,is_early_stopping=is_early_stopping)
        print(col,np.round(base - permutated, 5))

In [0]:
seed_all(666)

# Build the long-format table from day 1800 on, add the lag/rolling features, downcast.
sale_data = create_train_data(train_start=1800,is_train=True)
sale_data = create_feature(sale_data)
sale_data = reduce_mem_usage(sale_data)

# NOTE(review): 'before_events'/'after_events' (and "Weight" below) are not created by the
# create_train_data/create_feature definitions in this notebook — confirm where they come from.
cat_feats = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id'] + ["event_name_1", "event_type_1", "event_name_2", 'event_type_2'] + ['before_events','after_events']
useless_cols = ["id", "date", "sales", "d", "Weight"]
train_cols = sale_data.columns[~sale_data.columns.isin(useless_cols)]

# The validation set contains 8 new products with NaN values. To keep the later WRMSSE
# computation simple, extract the validation set first and drop NaNs afterwards (LightGBM can handle NaN).
X_valid = sale_data.loc[(sale_data.date <= '2016-04-24') & (sale_data.date > '2016-03-27'),train_cols]
y_valid = sale_data.loc[(sale_data.date <= '2016-04-24') & (sale_data.date > '2016-03-27'),'sales']

# Clean the data and select the rows used for training
sale_data.dropna(inplace=True)

X_train = sale_data.loc[sale_data.date <= '2016-03-27',train_cols]
y_train = sale_data.loc[sale_data.date <= '2016-03-27',"sales"]
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_feats, free_raw_data=True)

valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_feats, reference=train_data, free_raw_data=True) 

del sale_data
gc.collect()

m_lgb = train_model(train_data,valid_data)
# NOTE(review): `joblib` is not imported anywhere in the visible notebook; this line
# raises NameError on a fresh kernel unless it is imported elsewhere.
joblib.dump(m_lgb,'lgb_v13.pkl')

# Instantiate the evaluation object: the last 28 day columns are held out as the validation fold
train_df = pd.read_csv('sales_train_validation.csv')
cal_data = pd.read_csv('calendar.csv')
price_data = pd.read_csv('sell_prices.csv')
train_fold_df = train_df.iloc[:,:-28]
valid_fold_df = train_df.iloc[:,-28:].copy()
# NOTE(review): `evaluate` captured its default `e` when it was defined, so rebinding `e`
# here does not change what `evaluate(...)` uses unless `e=` is passed explicitly.
e = WRMSSEEvaluator(train_fold_df, valid_fold_df, cal_data, price_data)
del train_fold_df, train_df, cal_data, price_data

# NOTE(review): `evaluate_wrmsse` is not defined in the visible notebook — confirm.
m_lgb.eval_valid(evaluate_wrmsse)

predict_ensemble(train_cols,m_lgb)

In [0]:
# Score the trained model on the validation features, then print the permutation
# importance of every feature relative to that score.
base = evaluate(m_lgb,X_valid)
fi_test(m_lgb,X_valid,base)

Base WRMSSE: 0.4291943513152775
----- Permutation Features Importance -----
item_id -0.71765
dept_id -0.0816
store_id -0.1077
cat_id -0.02091
state_id -0.01364
wday -0.4683
month -0.00107
year 0.0
event_name_1 0.0
event_type_1 0.0
event_name_2 0.0
event_type_2 0.0
snap_CA -0.00278
snap_TX -0.00581
snap_WI -0.03335
before_events 0.0
after_events -0.00841
sell_price -0.0445
week -0.00399
quarter -0.00028
mday -0.09072
lag_7 -0.25349
lag_28 -0.00464
rmean_7_7 -0.24318
rmean_28_7 -0.55458
rmean_7_28 -0.41447
rmean_28_28 -0.06325


In [0]:
'''
本单元格及之后的单元格均为特征工程和效果测试，以create_train_data生成的数据为base
'''
# Reset the seeds and rebuild the base long-format table used by the experiments below.
seed_all()
sale_data = create_train_data(1800)
# sale_data = create_train_data(550) # lag56±28 removes more samples, so load some extra history to compensate
# Columns listed in USELESS_COLS are excluded from the feature list TRAIN_COLS.
USELESS_COLS = ["id", "date", "sales", "d"]
TARGET = 'sales'
CAT_COLS = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id','event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
TRAIN_COLS = sale_data.columns[~sale_data.columns.isin(USELESS_COLS)].tolist()

In [0]:
# 训练集
# Training set: every row up to and including 2016-03-27
X_train = sale_data.loc[sale_data.date <= '2016-03-27',TRAIN_COLS]
y_train = sale_data.loc[sale_data.date <= '2016-03-27',TARGET]
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=CAT_COLS, free_raw_data=True)

# Validation set: 2016-03-28 .. 2016-04-24
X_valid = sale_data.loc[(sale_data.date <= '2016-04-24') & (sale_data.date > '2016-03-27'),TRAIN_COLS]
y_valid = sale_data.loc[(sale_data.date <= '2016-04-24') & (sale_data.date > '2016-03-27'),TARGET]
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data, categorical_feature=CAT_COLS, free_raw_data=True)

# Train the baseline model
lgb_base = train_model(train_data,valid_data)

In [0]:
'''
1. Target Encoding （CV正则化）

结果：效果不佳，CV的target encoding有点过拟合，且用encoded features替换原features后，WRMSSE崩到了0.9+
'''
from sklearn import base
from sklearn.model_selection import KFold
class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):
    """Out-of-fold target encoder for the training frame.

    ``transform`` adds the column ``<colnames>_Kfold_Target_Enc`` to the frame:
    each row receives the mean target of its category computed on the other
    folds; categories missing from those folds get the global target mean.
    """

    def __init__(self,colnames,targetName,n_fold=5,transDtypes=True,verbosity=True):
        """
        Args:
            colnames: name of the single column to encode.
            targetName: name of the target column.
            n_fold: number of consecutive (unshuffled) folds.
            transDtypes: cast the encoded column to float16.
            verbosity: print the correlation between the encoding and the target.
        """

        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.transDtypes = transDtypes

    def fit(self, X, y=None):
        """Nothing is learned ahead of time; returns ``self``."""
        return self


    def transform(self,X):
        """Add the out-of-fold encoding to ``X`` in place and return ``X``."""

        assert(type(self.targetName) == str)
        assert(type(self.colnames) == str)
        assert(self.colnames in X.columns)
        assert(self.targetName in X.columns)

        mean_of_target = X[self.targetName].mean()
        # random_state is left out: it has no effect without shuffling, and recent
        # scikit-learn releases reject the combination with a ValueError.
        kf = KFold(n_splits = self.n_fold, shuffle = False)

        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'
        X[col_mean_name] = np.nan

        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            # Category means come from the other folds only
            X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(X_tr.groupby(self.colnames)[self.targetName].mean())

        # Assign the result back instead of calling fillna(inplace=True) on the
        # selected column, which does not reliably update the frame.
        X[col_mean_name] = X[col_mean_name].fillna(mean_of_target)
        
        if self.transDtypes:
            X[col_mean_name] = X[col_mean_name].astype('float16')

        if self.verbosity:

            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,
                                                    self.targetName,
                                                    np.corrcoef(X[self.targetName].values,
                                                    encoded_feature)[0][1]))
        return X

class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    """Apply a target encoding learned on the training frame to another frame.

    Each category of ``colNames`` is mapped to the mean of ``encodedName`` over
    the training rows of that category.
    """
    
    def __init__(self,train,colNames,encodedName,transDtypes=True):
        """
        Args:
            train: training frame that already contains ``encodedName``.
            colNames: name of the categorical column to encode.
            encodedName: name of the encoded column (read from ``train``,
                written to the transformed frame).
            transDtypes: cast the encoded column to float16.
        """
        
        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName
        self.transDtypes = transDtypes
        
        
    def fit(self, X, y=None):
        """Nothing is learned ahead of time; returns ``self``."""
        return self

    def transform(self,X):
        """Add ``encodedName`` to ``X`` in place and return ``X``."""

        category_means = self.train.groupby(self.colNames)[self.encodedName].mean()

        # Series.map is a direct lookup. Unlike the earlier copy-then-replace
        # approach, a category unseen in training does not keep its raw code as
        # the "encoding": it becomes NaN here and is filled with the overall
        # mean below.
        encoded = X[self.colNames].map(category_means)
        if encoded.isna().any():
            # float64 keeps the mean of a float16 column from overflowing
            overall_mean = self.train[self.encodedName].astype('float64').mean()
            encoded = encoded.astype('float64').fillna(overall_mean)
        X[self.encodedName] = encoded

        if self.transDtypes:
            X[self.encodedName] = X[self.encodedName].astype('float16')
        
        return X


In [0]:
# Split by date as before, keeping the target because the encoders need it.
train_data = sale_data.loc[sale_data.date <= '2016-03-27',TRAIN_COLS+[TARGET]]
valid_data = sale_data.loc[(sale_data.date <= '2016-04-24') & (sale_data.date > '2016-03-27'),TRAIN_COLS+[TARGET]]

enc_cols = ['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id','event_name_1', 'event_type_1']

# For each column: out-of-fold encoding on the training rows, then the per-category
# mean of that encoding is applied to the validation rows.
# NOTE(review): TRAIN_COLS is extended in place, so re-running this cell (or any later
# cell that selects TRAIN_COLS from a fresh sale_data) sees names that may not exist.
for col in enc_cols:
    enc = KFoldTargetEncoderTrain(colnames=col, targetName=TARGET,n_fold=3)
    train_data = enc.fit_transform(train_data)
    enc_test = KFoldTargetEncoderTest(train_data,col,col+'_Kfold_Target_Enc')
    valid_data = enc_test.fit_transform(valid_data)
    TRAIN_COLS.append(col+'_Kfold_Target_Enc')

Correlation between the new feature, item_id_Kfold_Target_Enc and, sales is 0.5942211894991276.
Correlation between the new feature, dept_id_Kfold_Target_Enc and, sales is 0.18570158365370654.
Correlation between the new feature, store_id_Kfold_Target_Enc and, sales is 0.0832077315897998.
Correlation between the new feature, cat_id_Kfold_Target_Enc and, sales is 0.13470398722252855.
Correlation between the new feature, state_id_Kfold_Target_Enc and, sales is 0.01659626421855363.
Correlation between the new feature, event_name_1_Kfold_Target_Enc and, sales is 0.008493968512612545.
Correlation between the new feature, event_type_1_Kfold_Target_Enc and, sales is -0.009527891981037954.


In [0]:
# Replace the original categorical columns with their encoded versions.
train_data.drop(columns=enc_cols,axis=1,inplace=True)
valid_data.drop(columns=enc_cols,axis=1,inplace=True)

X_train = train_data[train_data.columns[train_data.columns != TARGET]]
y_train = train_data[TARGET]

# The validation features use the training column list so both matrices share the same order.
X_valid = valid_data[train_data.columns[train_data.columns != TARGET]]
y_valid = valid_data[TARGET]

# NOTE(review): sale_data is deleted here, but a later cell reads it again; the
# setup cell that rebuilds it has to be re-run first.
del train_data, valid_data, sale_data
gc.collect()

# No categorical_feature is passed here.
train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=True)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data, free_raw_data=True)

lgb_encoded = train_model(train_data,valid_data)

[50]	training's rmse: 2.89761	valid_1's rmse: 2.68914
[100]	training's rmse: 2.8161	valid_1's rmse: 2.65543
[150]	training's rmse: 2.77011	valid_1's rmse: 2.65396
[200]	training's rmse: 2.74229	valid_1's rmse: 2.64667
[250]	training's rmse: 2.71372	valid_1's rmse: 2.63903
[300]	training's rmse: 2.69055	valid_1's rmse: 2.64016
[350]	training's rmse: 2.67194	valid_1's rmse: 2.63621
[400]	training's rmse: 2.65854	valid_1's rmse: 2.63302
[450]	training's rmse: 2.64876	valid_1's rmse: 2.63094
[500]	training's rmse: 2.6369	valid_1's rmse: 2.63126
[550]	training's rmse: 2.62619	valid_1's rmse: 2.62899
[600]	training's rmse: 2.61475	valid_1's rmse: 2.62764
[650]	training's rmse: 2.60616	valid_1's rmse: 2.62795
[700]	training's rmse: 2.59513	valid_1's rmse: 2.62918
[750]	training's rmse: 2.58641	valid_1's rmse: 2.62877
[800]	training's rmse: 2.57878	valid_1's rmse: 2.62965
[850]	training's rmse: 2.56957	valid_1's rmse: 2.62845
[900]	training's rmse: 2.56347	valid_1's rmse: 2.62789
[950]	trainin

In [0]:
# WRMSSE of the model trained on the K-fold target-encoded features.
score_encoded = evaluate(lgb_encoded,X_valid)
print('WRMSSE OF ENCODED MODEL:',np.round(score_encoded,5))

WRMSSE OF ENCODED MODEL: 0.9311


In [0]:
# ===============================================
# 尝试用smoothing来正则化
def smoothing_target_encoder(df, df_test, column, target, weight=100):
    """Add a smoothed target encoding of ``column`` to both frames in place.

    The encoding of a category is
    ``(count * category_mean + weight * global_mean) / (count + weight)``,
    computed on ``df`` only, and is written to the new column
    ``<column>_smooth_encoded`` of ``df`` and ``df_test``. Categories of
    ``df_test`` that do not occur in ``df`` become NaN.

    Args:
        df: training frame containing ``column`` and ``target``.
        df_test: frame containing ``column``.
        column: name of the categorical column.
        target: name of the target column.
        weight: strength of the pull towards the global mean.

    Returns:
        None.
    """
    global_mean = df[target].mean()

    per_category = df.groupby(column)[target]
    n_rows = per_category.count()
    category_mean = per_category.mean()

    smoothed = (n_rows * category_mean + weight * global_mean) / (n_rows + weight)

    encoded_name = column+'_smooth_encoded'
    for frame in (df, df_test):
        frame[encoded_name] = frame[column].map(smoothed)

In [0]:
# NOTE(review): this cell needs a fresh sale_data and a TRAIN_COLS without the
# *_Kfold_Target_Enc names — presumably the setup cell was re-run first; confirm.
train_data = sale_data.loc[sale_data.date <= '2016-03-27',TRAIN_COLS+[TARGET]]
valid_data = sale_data.loc[(sale_data.date <= '2016-04-24') & (sale_data.date > '2016-03-27'),TRAIN_COLS+[TARGET]]

enc_cols = ['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'event_name_1', 'event_type_1']

# Adds <col>_smooth_encoded to both frames in place (smoothing weight 1000).
for col in enc_cols:
    smoothing_target_encoder(train_data,valid_data,col,TARGET,1000)

# The original categorical columns are kept alongside the encoded ones.
X_train = train_data[train_data.columns[train_data.columns != TARGET]]
y_train = train_data[TARGET]

X_valid = valid_data[train_data.columns[train_data.columns != TARGET]]
y_valid = valid_data[TARGET]

del train_data, valid_data, sale_data
gc.collect()

train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=CAT_COLS, free_raw_data=True)
valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=CAT_COLS, reference=train_data, free_raw_data=True)

lgb_encoded_v3 = train_model(train_data,valid_data)

Training until validation scores don't improve for 100 rounds.
[50]	training's rmse: 2.66005	valid_1's rmse: 2.39729
[100]	training's rmse: 2.55608	valid_1's rmse: 2.2543
[150]	training's rmse: 2.51022	valid_1's rmse: 2.21304
[200]	training's rmse: 2.47493	valid_1's rmse: 2.20018
[250]	training's rmse: 2.46121	valid_1's rmse: 2.1924
[300]	training's rmse: 2.44979	valid_1's rmse: 2.19067
[350]	training's rmse: 2.43498	valid_1's rmse: 2.1844
[400]	training's rmse: 2.41919	valid_1's rmse: 2.17851
[450]	training's rmse: 2.41233	valid_1's rmse: 2.17645
[500]	training's rmse: 2.40607	valid_1's rmse: 2.17532
[550]	training's rmse: 2.39767	valid_1's rmse: 2.17165
[600]	training's rmse: 2.38951	valid_1's rmse: 2.1695
[650]	training's rmse: 2.38149	valid_1's rmse: 2.17117
[700]	training's rmse: 2.37294	valid_1's rmse: 2.16998
[750]	training's rmse: 2.36039	valid_1's rmse: 2.16571
[800]	training's rmse: 2.3542	valid_1's rmse: 2.16136
[850]	training's rmse: 2.34693	valid_1's rmse: 2.15835
[900]	tr

In [0]:
# Score with the best early-stopping iteration, then test the permutation importance
# of the original categorical columns and their smoothed encodings.
score_encoded = evaluate(lgb_encoded_v3,X_valid,is_early_stopping=True)
fi_test(lgb_encoded_v3, X_valid,score_encoded, enc_cols + [i+'_smooth_encoded' for i in enc_cols],True)

Base WRMSSE: 0.545424321763537
----- Permutation Features Importance -----
item_id -0.91142
dept_id -0.09455
store_id -0.257
cat_id -0.03645
state_id -0.06378
event_name_1 0.0
event_type_1 0.0
item_id_smooth_encoded -1.37805
dept_id_smooth_encoded -0.20743
store_id_smooth_encoded -0.21184
cat_id_smooth_encoded -0.00703
state_id_smooth_encoded -0.00861
event_name_1_smooth_encoded 0.0
event_type_1_smooth_encoded 0.0


In [0]:
'''
2. lag14和lag28*2；lag14_7，lag14_28，lag_56_7*2，lag56_28*2
'''
def create_feature(sale_data, is_train=True, day=None):
    """Add lag-14/lag-56 features and their rolling means to ``sale_data`` in place.

    Columns added:
        lag_14, lag_56: sales 14 / 56 days earlier.
        rmean_14_{win}: mean of the ``win`` sales values ending 14 days earlier.
        rmean_56_{win}_2: mean of the ``2*win + 1`` sales values centred on the
            day 56 days earlier.

    Args:
        sale_data: long-format frame with ``id``, ``date`` and ``sales``.
        is_train: True computes the features for every row with per-id
            shift/rolling; False only fills the rows whose date equals ``day``.
        day: the date to fill when ``is_train`` is False.

    Returns:
        The same ``sale_data`` object with the feature columns added.
    """
    lags = [14, 56]
    lag_cols = [f"lag_{lag}" for lag in lags ]

    # LAG
    if is_train:
        for lag, lag_col in zip(lags, lag_cols):
            sale_data[lag_col] = sale_data[["id","sales"]].groupby("id")["sales"].shift(lag)
    else:
        for lag, lag_col in zip(lags, lag_cols):
            sale_data.loc[sale_data.date == day, lag_col] = sale_data.loc[sale_data.date == day-timedelta(days=lag), 'sales'].values

    # ROLLING
    wins = [7,28]
    for win in wins:
        if is_train:
            # Trailing window over the 14-day lag
            sale_data[f"rmean_14_{win}"] = sale_data[["id", 'lag_14']].groupby("id")['lag_14'].transform(lambda x : x.rolling(win).mean()).astype('float32')
            # Centred window (win days either side) over the 56-day lag
            sale_data[f'rmean_56_{win}_2'] = sale_data[['id','lag_56']].groupby('id')['lag_56'].transform(lambda x : x.rolling(1+win*2,center=True).mean()).astype('float32')
        else:
            df_window = sale_data[(sale_data.date <= day-timedelta(days=14)) & (sale_data.date > day-timedelta(days=14+win))]
            df_window_grouped = df_window.groupby("id").agg({'sales':'mean'}).reindex(sale_data.loc[sale_data.date==day,'id'])['sales']
            sale_data.loc[sale_data.date == day, f"rmean_14_{win}"] = df_window_grouped.astype('float32').values
            
            # Both ends are inclusive so the window holds 2*win + 1 days, matching
            # the centred rolling window of the training branch.
            df_window = sale_data[(sale_data.date <= day-timedelta(days=56-win)) & (sale_data.date >= day-timedelta(days=56+win))]
            df_window_grouped = df_window.groupby("id").agg({'sales':'mean'}).reindex(sale_data.loc[sale_data.date==day,'id'])['sales']
            sale_data.loc[sale_data.date == day, f'rmean_56_{win}_2'] = df_window_grouped.astype('float32').values

    return sale_data

In [0]:
# Add the lag-14/lag-56 features to the base table.
sale_data = create_feature(sale_data)

TRAIN_COLS = sale_data.columns[~sale_data.columns.isin(USELESS_COLS)]

# The validation rows are extracted before dropna so that rows with NaN features are kept.
X_valid = sale_data.loc[(sale_data.date <= '2016-04-24') & (sale_data.date > '2016-03-27'),TRAIN_COLS]
y_valid = sale_data.loc[(sale_data.date <= '2016-04-24') & (sale_data.date > '2016-03-27'),TARGET]

# NOTE(review): this drops rows from sale_data in place; any later cell that reuses
# sale_data in the same kernel sees the reduced table.
sale_data.dropna(inplace=True)

X_train = sale_data.loc[sale_data.date <= '2016-03-27',TRAIN_COLS]
y_train = sale_data.loc[sale_data.date <= '2016-03-27',TARGET]
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=CAT_COLS, free_raw_data=True)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data, categorical_feature=CAT_COLS, free_raw_data=True)

# Train the model
lgb_timeFeats = train_model(train_data,valid_data)

[50]	training's rmse: 2.47906	valid_1's rmse: 2.23541
[100]	training's rmse: 2.39683	valid_1's rmse: 2.19286
[150]	training's rmse: 2.36184	valid_1's rmse: 2.21447
[200]	training's rmse: 2.33042	valid_1's rmse: 2.22937
[250]	training's rmse: 2.30972	valid_1's rmse: 2.24418
[300]	training's rmse: 2.28953	valid_1's rmse: 2.26875
[350]	training's rmse: 2.27165	valid_1's rmse: 2.27747
[400]	training's rmse: 2.25811	valid_1's rmse: 2.29594
[450]	training's rmse: 2.24431	valid_1's rmse: 2.33312
[500]	training's rmse: 2.23359	valid_1's rmse: 2.40463
[550]	training's rmse: 2.2228	valid_1's rmse: 2.40661
[600]	training's rmse: 2.21464	valid_1's rmse: 2.41639
[650]	training's rmse: 2.20589	valid_1's rmse: 2.42957
[700]	training's rmse: 2.19847	valid_1's rmse: 2.42907
[750]	training's rmse: 2.19424	valid_1's rmse: 2.42919
[800]	training's rmse: 2.18835	valid_1's rmse: 2.42479
[850]	training's rmse: 2.18213	valid_1's rmse: 2.42193
[900]	training's rmse: 2.17726	valid_1's rmse: 2.41866
[950]	traini

In [0]:
# Score the time-features model and test the permutation importance of the new features only.
score_base = evaluate(lgb_timeFeats,X_valid)
fi_test(lgb_timeFeats,X_valid,score_base,test_cols=['lag_14','lag_56','rmean_14_7','rmean_56_7_2','rmean_14_28','rmean_56_28_2'])

Base WRMSSE: 1.3808371485701458
----- Permutation Features Importance -----
lag_14 0.04056
lag_56 -0.00242
rmean_14_7 0.3695
rmean_56_7_2 0.40054
rmean_14_28 0.45123
rmean_56_28_2 0.0


In [0]:
def train_model(train_data,valid_data):
    """Train a LightGBM booster with this notebook's fixed hyper-parameters.

    Args:
        train_data: training set handed to ``lgb.train`` (the calling cells
            build it with ``lgb.Dataset``).
        valid_data: validation set; passed in ``valid_sets`` after the
            training set so both are evaluated during training.

    Returns:
        Whatever ``lgb.train`` returns for the fitted model.
    """
    lgb_params = dict(
        # Objective and evaluation metric.
        objective="tweedie",
        metric="rmse",
        seed=666,
        force_row_wise=True,
        learning_rate=0.075,
        # Feature / row sub-sampling, applied at every iteration (bagging_freq=1).
        sub_feature=0.7,
        sub_row=0.75,
        bagging_freq=1,
        lambda_l2=0.3,
        nthread=8,
        tweedie_variance_power=1.2,
        verbosity=1,
        # Upper bound on boosting rounds; early stopping may end training sooner.
        num_iterations=1500,
        num_leaves=128,
        min_data_in_leaf=104,
        early_stopping_rounds=100,
    )

    # verbose_eval=50 -> the metric is logged every 50 iterations.
    return lgb.train(
        lgb_params,
        train_data,
        valid_sets=[train_data, valid_data],
        verbose_eval=50,
    )

In [0]:
# Rebuild the lag / rolling features on the full frame.
sale_data = create_feature(sale_data)

# Model inputs are every column that is not listed in USELESS_COLS.
TRAIN_COLS = sale_data.columns[~sale_data.columns.isin(USELESS_COLS)]

# Validation slice: dates after 2016-03-27 up to and including 2016-04-24.
# It is taken before the dropna() below, so that call does not remove any of its rows.
X_valid = sale_data.loc[
    (sale_data['date'] > '2016-03-27') & (sale_data['date'] <= '2016-04-24'),
    TRAIN_COLS
]
y_valid = sale_data.loc[
    (sale_data['date'] > '2016-03-27') & (sale_data['date'] <= '2016-04-24'),
    TARGET
]

# Drop every row that still has a missing value (mutates sale_data in place).
sale_data.dropna(inplace=True)

# Training slice: everything up to and including 2016-03-27.
X_train = sale_data.loc[sale_data['date'] <= '2016-03-27', TRAIN_COLS]
y_train = sale_data.loc[sale_data['date'] <= '2016-03-27', TARGET]

train_data = lgb.Dataset(
    X_train,
    label=y_train,
    categorical_feature=CAT_COLS,
    free_raw_data=True,
)
# reference=train_data ties the validation Dataset to the training Dataset.
valid_data = lgb.Dataset(
    X_valid,
    label=y_valid,
    reference=train_data,
    categorical_feature=CAT_COLS,
    free_raw_data=True,
)

# The full frame is not used again in this cell; release it before training.
del sale_data
gc.collect()

# Modelling
lgb_timeFeats_v2 = train_model(train_data, valid_data)

Training until validation scores don't improve for 100 rounds.
[50]	training's rmse: 2.47594	valid_1's rmse: 2.28678
[100]	training's rmse: 2.39835	valid_1's rmse: 2.2293
[150]	training's rmse: 2.36507	valid_1's rmse: 2.23695
[200]	training's rmse: 2.33503	valid_1's rmse: 2.2468
Early stopping, best iteration is:
[109]	training's rmse: 2.39141	valid_1's rmse: 2.21778


In [0]:
# Score the v2 time-feature model on the validation slice and print the
# result rounded to 5 decimals (the printed label calls the score WRMSSE).
score_timeFeats_v2 = evaluate(lgb_timeFeats_v2,X_valid)
print('WRMSSE OF ENCODED MODEL:',np.round(score_timeFeats_v2,5))

WRMSSE OF ENCODED MODEL: 0.68308


In [0]:
def create_feature(sale_data, is_train=True, day=None):
    """Add lag and rolling-window sales features to ``sale_data``.

    Features are built per ``id`` from the ``sales`` column:

    * ``lag_7`` / ``lag_28``: sales 7 and 28 days earlier.
    * ``rmean_{lag}_{win}`` for ``win`` in (7, 28): mean of the lagged series
      over a window of ``win`` days.
    * for the 28-day lag only: ``rmedian_``, ``rstd_`` and ``rmax_`` with the
      same ``{lag}_{win}`` suffix.

    Args:
        sale_data: DataFrame with at least ``id``, ``date`` and ``sales``
            columns. It is modified in place.
        is_train: when True, features are computed for every row with
            group-wise ``shift`` / ``rolling``. When False, only the rows
            whose ``date`` equals ``day`` are filled in; all other rows are
            left untouched.
        day: the date to fill when ``is_train`` is False. It must support
            ``day - timedelta(...)`` and comparison with ``sale_data.date``.
            Unused when ``is_train`` is True.

    Returns:
        The same ``sale_data`` object with the feature columns added/updated.

    Notes:
        In the ``is_train=False`` branch the lag values are copied
        positionally, so the rows of ``day`` and of ``day - lag`` must list
        the ids in the same order.
        Rolling features are written as float32 in both branches so that
        prediction-time features have the same dtype as the training ones.
    """
    # More feature extractors can be added here; keep both branches in sync.
    lags = [7, 28]
    wins = [7, 28]
    # Only this lag gets median/std/max in addition to the mean.
    full_stats_lag = 28
    mean_only = ("mean",)
    all_stats = ("mean", "median", "std", "max")

    if is_train:
        for lag in lags:
            sale_data[f"lag_{lag}"] = sale_data[["id", "sales"]].groupby("id")["sales"].shift(lag)

        # Column creation order (win outer, lag inner) is preserved because
        # callers derive the model's feature order from the column order.
        for win in wins:
            for lag in lags:
                lag_col = f"lag_{lag}"
                stats = all_stats if lag == full_stats_lag else mean_only
                # One groupby per lag column, reused for every statistic.
                per_id = sale_data[["id", lag_col]].groupby("id")[lag_col]
                for stat in stats:
                    sale_data[f"r{stat}_{lag}_{win}"] = per_id.transform(
                        lambda x, stat=stat: getattr(x.rolling(win), stat)()
                    ).astype('float32')
        return sale_data

    # Prediction path: only the rows of `day` are computed, which avoids
    # redundantly re-deriving features for every other date.
    day_mask = sale_data.date == day

    for lag in lags:
        source_day = day - timedelta(days=lag)
        sale_data.loc[day_mask, f"lag_{lag}"] = sale_data.loc[sale_data.date == source_day, 'sales'].values

    day_ids = sale_data.loc[day_mask, 'id']
    for win in wins:
        for lag in lags:
            # Rows inside (day - lag - win, day - lag]: the `win` days that
            # end `lag` days before the target day.
            window_end = day - timedelta(days=lag)
            window_start = day - timedelta(days=lag + win)
            df_window = sale_data[(sale_data.date <= window_end) & (sale_data.date > window_start)]

            stats = all_stats if lag == full_stats_lag else mean_only
            # Aggregate per id, then line the result up with the ids of `day`.
            df_window_grouped = df_window.groupby("id")["sales"].agg(list(stats)).reindex(day_ids)

            for stat in stats:
                # Write through .loc with the day mask: assigning whole
                # columns from a frame indexed only by the day's rows would
                # overwrite every other row of those columns with NaN.
                sale_data.loc[day_mask, f"r{stat}_{lag}_{win}"] = (
                    df_window_grouped[stat].to_numpy(dtype='float32')
                )

    return sale_data

In [0]:
# seed_all and create_train_data are defined elsewhere in the notebook.
# NOTE(review): seed_all() presumably fixes the random seeds — confirm.
seed_all()
# NOTE(review): the meaning of the 1800 argument is not visible here — verify
# against create_train_data.
sale_data = create_train_data(1800)
# Add the lag / rolling features (default is_train=True, so all rows are computed).
sale_data = create_feature(sale_data)

In [0]:
# Columns that are never fed to the model, and the label column.
USELESS_COLS = ["id", "date", "sales", "d"]
TARGET = 'sales'

# Columns treated as categorical.
CAT_COLS = [
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
]

# Model inputs: every remaining column, in frame order.
TRAIN_COLS = [col for col in sale_data.columns if col not in USELESS_COLS]

# Validation slice: dates after 2016-03-27 up to and including 2016-04-24.
X_valid = sale_data.loc[
    (sale_data['date'] > '2016-03-27') & (sale_data['date'] <= '2016-04-24'),
    TRAIN_COLS
]

In [23]:
# Display the validation feature columns, in the order they appear in X_valid.
X_valid.columns

Index(['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'wday', 'month',
       'year', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'week', 'quarter',
       'mday', 'lag_7', 'lag_28', 'rmean_7_7', 'rmean_28_7', 'rmedian_28_7',
       'rstd_28_7', 'rmax_28_7', 'rmean_7_28', 'rmean_28_28', 'rmedian_28_28',
       'rstd_28_28', 'rmax_28_28'],
      dtype='object')

In [0]:
# Load the saved v6 model from its text dump and score it on the validation slice.
lgb_v6 = lgb.Booster(model_file='lgb_v6.txt')
score_v6 = evaluate(lgb_v6, X_valid)

# Importance check on the last 12 columns of X_valid
# (the lag_* and r*_ features in the column listing above).
fi_test(
    lgb_v6,
    X_valid,
    score_v6,
    list(X_valid.columns[-12:]),
)