In [26]:
# General imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys, gc, time, warnings, pickle, psutil, random

from math import ceil

from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# NOTE: this silences every Python warning for the whole session; narrow or
# remove the filter when debugging.
warnings.filterwarnings('ignore')

# Project root. Defaults to the original local checkout; set the BDCI2023_DIR
# environment variable to run the notebook from a different location.
DIR = os.environ.get("BDCI2023_DIR") or "C:/Users/Yipeng/Contest/BDCI2023/code/"
if not DIR.endswith(("/", "\\")):
    DIR += "/"  # the sub-paths below are built by plain string concatenation
DIR_DATA_RAW = DIR + "data/"
DIR_DATA_PRE = DIR + "data/preprocessed/"  # data_merged.pkl is read from here
DIR_MODEL = DIR + "model/"
DIR_MODEL_LGB = DIR_MODEL + "lgb/"  # checkpoints and final LightGBM models are written here

In [27]:
########################### Helpers
#################################################################################
## Seeder
def seed_everything(seed=0):
    """Seed the global RNGs of Python's ``random`` module and of NumPy.

    Parameters
    ----------
    seed : int, default 0
        Value handed to both ``random.seed`` and ``np.random.seed``.
    """
    # Same order as before: the standard library first, then NumPy.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

    
## Multiprocess Runs
def df_parallelize_run(func, t_split):
    """Apply ``func`` to every item of ``t_split`` in worker processes and join the results.

    Parameters
    ----------
    func : callable
        Called once per element of ``t_split`` through ``Pool.map``; every
        result is handed to ``pd.concat``.
    t_split : sequence
        Work items. Its length caps the number of worker processes.

    Returns
    -------
    pandas.DataFrame
        The per-item results concatenated side by side (``axis=1``).

    Notes
    -----
    Reads the module-level ``N_CORES`` as the upper bound on worker processes.
    It is not defined in the cells visible here and must be set before calling.
    """
    # Local import: no visible cell imports Pool, so the function used to rely
    # on the name being brought into the namespace somewhere else.
    from multiprocessing import Pool

    num_cores = int(min(N_CORES, len(t_split)))
    # The context manager tears the pool down even when func raises, so a
    # failing task no longer leaves worker processes behind.
    with Pool(num_cores) as pool:
        parts = pool.map(func, t_split)
    return pd.concat(parts, axis=1)

In [28]:
########################### Helper to load data by store ID
#################################################################################
# Read data
def get_data_by_store(store):
    """Assemble the feature frame for a single store.

    Loads the pickles named by the module-level ``BASE``, ``PRICE``,
    ``CALENDAR``, ``MEAN_ENC`` and ``LAGS`` constants, keeps the rows of
    ``store`` whose ``d`` is at least ``START_TRAIN`` and appends the
    mean-encoding and lag columns.

    Returns
    -------
    tuple
        ``(frame, features)``: the frame holds ``id``, ``d``, ``TARGET`` and
        the feature columns with a fresh 0..n-1 index; ``features`` lists every
        column name that is not in ``remove_features``.
    """
    # Read and concatenate the basic features; the first two columns of the
    # price and calendar pickles are skipped.
    grid = pd.concat(
        [
            pd.read_pickle(BASE),
            pd.read_pickle(PRICE).iloc[:, 2:],
            pd.read_pickle(CALENDAR).iloc[:, 2:],
        ],
        axis=1,
    )

    # Keep only rows inside the training window that belong to the store.
    keep_rows = (grid['d'] >= START_TRAIN) & (grid['store_id'] == store)
    grid = grid[keep_rows]

    # Extra feature tables, cut down to the surviving rows (matched on index labels).
    mean_enc = pd.read_pickle(MEAN_ENC)[mean_features]
    mean_enc = mean_enc[mean_enc.index.isin(grid.index)]

    lags = pd.read_pickle(LAGS).iloc[:, 3:]
    lags = lags[lags.index.isin(grid.index)]

    # Attach one table at a time and release it right away.
    grid = pd.concat([grid, mean_enc], axis=1)
    del mean_enc

    grid = pd.concat([grid, lags], axis=1)
    del lags

    features = [name for name in grid.columns if name not in remove_features]
    grid = grid[['id', 'd', TARGET] + features].reset_index(drop=True)

    return grid, features

# Recombine Test set after training
def get_base_test():
    """Rebuild the combined test frame from the per-store pickles.

    For every id in the module-level ``STORES_IDS`` this reads
    ``processed_data_dir + 'test_' + store_id + '.pkl'``, tags the rows with
    their ``store_id`` and stacks all frames with a fresh 0..n-1 index.

    Returns
    -------
    pandas.DataFrame
        All store frames stacked row-wise; an empty frame when ``STORES_IDS``
        is empty.
    """
    # Collect first and concatenate once: calling pd.concat inside the loop
    # re-copies every previously loaded row on each pass.
    store_frames = []
    for store_id in STORES_IDS:
        temp_df = pd.read_pickle(processed_data_dir+'test_'+store_id+'.pkl')
        temp_df['store_id'] = store_id
        store_frames.append(temp_df)

    # pd.concat rejects an empty list, so keep returning an empty frame then.
    if not store_frames:
        return pd.DataFrame()

    return pd.concat(store_frames, ignore_index=True)


########################### Helper to make dynamic rolling lags
#################################################################################
def make_lag(LAG_DAY):
    """Return the target shifted by ``LAG_DAY`` rows within each ``id`` group.

    Reads the module-level ``base_test`` frame and ``TARGET`` column name.

    Parameters
    ----------
    LAG_DAY : int
        Number of rows to shift by.

    Returns
    -------
    pandas.DataFrame
        One ``float16`` column named ``'sales_lag_<LAG_DAY>'`` that shares the
        index of ``base_test``.
    """
    col_name = 'sales_lag_'+str(LAG_DAY)
    # groupby.shift is the vectorised form of transform(lambda x: x.shift(n)).
    # Building the result directly also avoids assigning into a column slice
    # of base_test, which is chained assignment.
    lagged = base_test.groupby(['id'])[TARGET].shift(LAG_DAY).astype(np.float16)
    return lagged.to_frame(col_name)


def make_lag_roll(LAG_DAY):
    """Return a per-``id`` rolling mean of the shifted target.

    Reads the module-level ``base_test`` frame and ``TARGET`` column name.

    Parameters
    ----------
    LAG_DAY : sequence
        ``LAG_DAY[0]`` is the shift in rows, ``LAG_DAY[1]`` the rolling window
        length.

    Returns
    -------
    pandas.DataFrame
        One column named ``'rolling_mean_tmp_<shift>_<window>'`` that shares
        the index of ``base_test``.
    """
    shift_day, roll_wind = LAG_DAY[0], LAG_DAY[1]
    col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)
    # Compute the series straight from base_test instead of assigning into a
    # column slice of it, which is chained assignment.
    rolled = base_test.groupby(['id'])[TARGET].transform(
        lambda x: x.shift(shift_day).rolling(roll_wind).mean()
    )
    return rolled.to_frame(col_name)

In [29]:
import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping

# Load the preprocessed, merged dataset used by the cells below.
merged_pickle_path = DIR_DATA_PRE + "data_merged.pkl"
data_merged = pd.read_pickle(merged_pickle_path)

In [30]:
# Columns removed from data_merged: sales_<day>_mean, sales_<day>_std and
# sales_<day>_mean_change for every <day> listed below.
lst = [
    f"sales_{day}_{stat}"
    for day in [1,3,5,7,14,21,30,60,90]
    for stat in ("mean", "std", "mean_change")
]
# Rebind instead of dropping in place and tolerate already-missing columns,
# so re-running this cell no longer raises KeyError after its first execution.
data_merged = data_merged.drop(columns=lst, errors="ignore")

In [35]:
def save_model_callback(model_prefix, period=100):
    """Build a training callback that periodically writes the model to disk.

    Parameters
    ----------
    model_prefix : str
        Path prefix of the checkpoint files; each one is written to
        ``f"{model_prefix}_iteration_{env.iteration}.txt"``.
    period : int, default 100
        A checkpoint is written whenever ``env.iteration`` is a multiple of
        ``period`` (this includes iteration 0).

    Returns
    -------
    callable
        Function taking the callback environment ``env``; it reads
        ``env.iteration`` and calls ``env.model.save_model``.
    """
    def callback(env):
        if env.iteration % period == 0:
            # Save a snapshot of the model at this iteration
            model_name = f"{model_prefix}_iteration_{env.iteration}.txt"
            env.model.save_model(model_name)

    # LightGBM's built-in callbacks carry `order` / `before_iteration`
    # attributes; set them here so this closure can be sequenced with them.
    # NOTE(review): the training loop appears to sort callbacks by `order` -
    # confirm against the installed LightGBM version. 20 is meant to place the
    # checkpoint between log_evaluation and early_stopping.
    callback.order = 20
    callback.before_iteration = False
    return callback


def train(store_id: int = -1):
    """Train a LightGBM tweedie model on ``data_merged`` and save it to disk.

    Parameters
    ----------
    store_id : int, default -1
        Only rows whose ``store_id`` equals this value are used; ``-1`` keeps
        the rows of all stores.

    Returns
    -------
    lightgbm.Booster
        The trained model, which is also written to
        ``DIR_MODEL_LGB + f'store_{store_id}_final.txt'``.

    Notes
    -----
    Rows dated from 2022-08-31 up to (not including) 2023-08-30 are the
    training set; rows dated 2023-08-30 are the validation set monitored by the
    ``early_stopping`` callback. ``quantity`` is the label and ``date`` is
    dropped from the inputs. Checkpoints are written every 100 iterations
    under the ``partial_365`` prefix.
    """
    if store_id != -1:
        data_store = data_merged[data_merged['store_id'] == store_id]
    else:
        data_store = data_merged

    # Build the date masks from data_store itself. They used to come from
    # data_merged, which only lines up with data_store when no store filter is
    # applied; for a single store the longer mask had to be re-aligned.
    dates = data_store['date']
    in_train = (dates >= pd.to_datetime("2022-08-31")) & (dates < pd.to_datetime("2023-08-30"))
    in_valid = (dates >= pd.to_datetime("2023-08-30")) & (dates < pd.to_datetime("2023-08-31"))
    X_train = data_store[in_train]
    X_test = data_store[in_valid]
    y_train = X_train['quantity']
    y_test = X_test['quantity']
    X_train = X_train.drop(["quantity", "date"], axis=1)
    X_test = X_test.drop(["quantity", "date"], axis=1)
    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test)

    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'tweedie',
        'tweedie_variance_power': 1.1,
        'metric': 'rmse',
        'subsample': 0.5,
        'subsample_freq': 1,
        'learning_rate': 0.015,
        'num_leaves': 2**11-1,
        'min_data_in_leaf': 2**12-1,
        'feature_fraction': 0.5,
        'max_bin': 100,
        'n_estimators': 3000,
        'boost_from_average': False,
        'verbose': -1,
        # 'device': 'gpu'
    }

    # Seeds Python's and NumPy's global RNGs only; lgb_params carries no seed
    # entry, so LightGBM's own seeds stay at their defaults.
    seed_everything(23)

    # NOTE(review): the checkpoint prefix does not include store_id, so runs for
    # different stores write to the same checkpoint files.
    callbacks = [
        log_evaluation(period=1),
        early_stopping(stopping_rounds=30),
        save_model_callback(DIR_MODEL_LGB + 'partial_365', period=100),
    ]

    model = lgb.train(lgb_params, train_data, valid_sets=[train_data, test_data], callbacks=callbacks)

    model.save_model(DIR_MODEL_LGB + f'store_{store_id}_final.txt')

    # Hand the booster back so callers can predict without reloading the file.
    return model


In [36]:
# Fit a single model on the rows of all stores (store_id=-1 skips the store filter).
# Per-store alternative: call train(store_id) for each store_id in range(1, 13).
train(store_id=-1)

[1]	training's rmse: 7.46176	valid_1's rmse: 8.59548
Training until validation scores don't improve for 30 rounds
[2]	training's rmse: 7.44548	valid_1's rmse: 8.57753
[3]	training's rmse: 7.42829	valid_1's rmse: 8.55814
[4]	training's rmse: 7.41034	valid_1's rmse: 8.5369
[5]	training's rmse: 7.39145	valid_1's rmse: 8.51506
[6]	training's rmse: 7.37215	valid_1's rmse: 8.49285
[7]	training's rmse: 7.35188	valid_1's rmse: 8.46841
[8]	training's rmse: 7.33079	valid_1's rmse: 8.44248
[9]	training's rmse: 7.30952	valid_1's rmse: 8.41672
[10]	training's rmse: 7.28733	valid_1's rmse: 8.38826
[11]	training's rmse: 7.26449	valid_1's rmse: 8.35845
[12]	training's rmse: 7.24084	valid_1's rmse: 8.32795
[13]	training's rmse: 7.21685	valid_1's rmse: 8.29691
[14]	training's rmse: 7.19229	valid_1's rmse: 8.26483
[15]	training's rmse: 7.16719	valid_1's rmse: 8.2316
[16]	training's rmse: 7.1419	valid_1's rmse: 8.19768
[17]	training's rmse: 7.11564	valid_1's rmse: 8.1608
[18]	training's rmse: 7.08933	vali

KeyboardInterrupt: 