In [1]:
# General imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys, gc, time, warnings, pickle, psutil, random

from math import ceil

from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment and deprecation warnings.
warnings.filterwarnings('ignore')

# Project root. Defaults to the original author's checkout; set the
# BDCI2023_DIR environment variable to run the notebook on another machine
# without editing this cell.
DIR = os.environ.get("BDCI2023_DIR", "C:/Users/Yipeng/Contest/BDCI2023/code/")
# The derived paths below are built by plain string concatenation, so the root
# must end with a path separator.
if not DIR.endswith(("/", "\\")):
    DIR += "/"
DIR_DATA_RAW = DIR + "data/"                # raw input data
DIR_DATA_PRE = DIR + "data/preprocessed/"   # preprocessed pickles
DIR_MODEL = DIR + "model/"                  # model artefacts
DIR_MODEL_LGB = DIR_MODEL + "lgb/"          # LightGBM model files

In [2]:
########################### Helpers
#################################################################################
## Seeder
def seed_everything(seed=0):
    """Seed Python's ``random`` module and NumPy's global RNG.

    Args:
        seed: Value passed unchanged to ``random.seed`` and ``np.random.seed``.
            Nothing else is seeded by this helper.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    
## Multiprocess Runs
def df_parallelize_run(func, t_split, n_cores=None):
    """Apply ``func`` to every item of ``t_split`` in a process pool and join the results column-wise.

    Args:
        func: Callable that takes one element of ``t_split`` and returns a
            pandas object. It is sent to worker processes, so it must be
            picklable.
        t_split: Sized sequence of work items, one ``func`` call per item.
        n_cores: Optional upper bound on the number of worker processes.
            When omitted, the notebook-level ``N_CORES`` is used if it is
            defined, otherwise ``os.cpu_count()``.

    Returns:
        The per-item results concatenated with ``pd.concat(..., axis=1)``.

    Raises:
        ValueError: If ``t_split`` is empty (a pool needs at least one worker).
    """
    # Local import: the notebook's import cell never brings Pool into scope.
    from multiprocessing import Pool

    if n_cores is None:
        # N_CORES is not defined anywhere in this notebook, so fall back to
        # the machine's CPU count instead of failing with a NameError.
        n_cores = globals().get("N_CORES") or os.cpu_count() or 1

    # Never start more workers than there are work items.
    num_cores = min(n_cores, len(t_split))

    # The context manager tears the pool down even when func raises.
    with Pool(num_cores) as pool:
        results = pool.map(func, t_split)
    return pd.concat(results, axis=1)

In [3]:
########################### Helper to load data by store ID
#################################################################################
# Read data
def get_data_by_store(store):
    
    # Read and contact basic feature
    df = pd.concat([pd.read_pickle(BASE),
                    pd.read_pickle(PRICE).iloc[:,2:],
                    pd.read_pickle(CALENDAR).iloc[:,2:]],
                    axis=1)
    
    df = df[df['d']>=START_TRAIN]
    
    df = df[df['store_id']==store]

    df2 = pd.read_pickle(MEAN_ENC)[mean_features]
    df2 = df2[df2.index.isin(df.index)]
    
    df3 = pd.read_pickle(LAGS).iloc[:,3:]
    df3 = df3[df3.index.isin(df.index)]
    
    df = pd.concat([df, df2], axis=1)
    del df2
    
    df = pd.concat([df, df3], axis=1)
    del df3

    features = [col for col in list(df) if col not in remove_features]
    df = df[['id','d',TARGET]+features]
    
    df = df.reset_index(drop=True)
    
    return df, features

# Recombine Test set after training
def get_base_test():
    """Recombine the per-store test pickles into a single frame.

    For every id in ``STORES_IDS`` this reads
    ``processed_data_dir + 'test_' + store_id + '.pkl'``, tags the rows with a
    ``store_id`` column and stacks the frames in ``STORES_IDS`` order.

    Returns:
        One DataFrame with a fresh 0..n-1 index, or an empty DataFrame when
        ``STORES_IDS`` is empty.
    """
    store_frames = []
    for store_id in STORES_IDS:
        store_df = pd.read_pickle(processed_data_dir+'test_'+store_id+'.pkl')
        store_df['store_id'] = store_id
        store_frames.append(store_df)

    # pd.concat raises on an empty list, so keep the "no stores" result an
    # empty frame.
    if not store_frames:
        return pd.DataFrame()

    # Concatenate once: growing a frame with concat inside the loop re-copies
    # all accumulated rows on every iteration and starts from an empty frame.
    return pd.concat(store_frames, ignore_index=True)


########################### Helper to make dynamic rolling lags
#################################################################################
def make_lag(LAG_DAY):
    """Build a per-``id`` lag of ``TARGET`` from the global ``base_test`` frame.

    Args:
        LAG_DAY: Number of rows to shift ``TARGET`` by within each ``id`` group.

    Returns:
        Single-column DataFrame named ``sales_lag_<LAG_DAY>`` (float16) that
        shares ``base_test``'s index.
    """
    col_name = 'sales_lag_'+str(LAG_DAY)
    # Compute the series straight from base_test instead of assigning a new
    # column onto a column-slice of it (chained assignment, which pandas
    # reports with SettingWithCopyWarning).
    lagged = (
        base_test.groupby(['id'])[TARGET]
        .transform(lambda x: x.shift(LAG_DAY))
        .astype(np.float16)
    )
    return lagged.to_frame(name=col_name)


def make_lag_roll(LAG_DAY):
    """Build a shifted rolling mean of ``TARGET`` per ``id`` from the global ``base_test`` frame.

    Args:
        LAG_DAY: Two-item sequence ``(shift_day, roll_wind)``: the number of
            rows to shift by, then the rolling-window length.

    Returns:
        Single-column DataFrame named
        ``rolling_mean_tmp_<shift_day>_<roll_wind>`` that shares
        ``base_test``'s index.
    """
    shift_day = LAG_DAY[0]
    roll_wind = LAG_DAY[1]
    col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)
    # Compute the series straight from base_test instead of assigning a new
    # column onto a column-slice of it (chained assignment, which pandas
    # reports with SettingWithCopyWarning).
    rolled = base_test.groupby(['id'])[TARGET].transform(
        lambda x: x.shift(shift_day).rolling(roll_wind).mean()
    )
    return rolled.to_frame(name=col_name)

# 1. 加载数据

In [4]:
import lightgbm as lgb

# Load the preprocessed, merged feature table from DIR_DATA_PRE.
# NOTE: unpickling can execute arbitrary code; only load pickle files you
# created yourself or otherwise trust.
data_merged = pd.read_pickle(DIR_DATA_PRE + "data_merged.pkl")
# Show the column index as a quick schema check.
data_merged.columns

Index(['store_id', 'sku_id', 'date', 'original_price', 'price_max',
       'price_min', 'price_mean', 'price_std', 'price_median', 'price_skew',
       ...
       'sales_21_pw_mean_change', 'sales_30_pw_mean', 'sales_30_pw_std',
       'sales_30_pw_mean_change', 'sales_60_pw_mean', 'sales_60_pw_std',
       'sales_60_pw_mean_change', 'sales_90_pw_mean', 'sales_90_pw_std',
       'sales_90_pw_mean_change'],
      dtype='object', length=142)

In [5]:
# Names of the sales_<window>_{mean,std,mean_change} columns to remove, in
# window order (the sales_<window>_pw_* columns are kept).
lst = [
    f"sales_{day}_{stat}"
    for day in [1,3,5,7,14,21,30,60,90]
    for stat in ("mean", "std", "mean_change")
]
# Reassign instead of dropping in place, and tolerate columns that are already
# gone, so re-running this cell is a no-op rather than a KeyError.
# NOTE: errors="ignore" also hides misspelled column names.
data_merged = data_merged.drop(columns=lst, errors="ignore")

In [6]:
# Chronological split: everything before 2023-08-24 is used for training and
# the half-open window [2023-08-24, 2023-08-31) is held out.
valid_start = pd.to_datetime("2023-08-24")
valid_end = pd.to_datetime("2023-08-31")

train_mask = data_merged['date'] < valid_start
valid_mask = (data_merged['date'] >= valid_start) & (data_merged['date'] < valid_end)

# Target is 'quantity'; neither it nor the raw 'date' is kept as a feature.
y_train = data_merged.loc[train_mask, 'quantity']
y_test = data_merged.loc[valid_mask, 'quantity']
X_train = data_merged.loc[train_mask].drop(["quantity", "date"], axis=1)
X_test = data_merged.loc[valid_mask].drop(["quantity", "date"], axis=1)

In [7]:
# Drop the reference to the full merged table; only X_train/X_test and
# y_train/y_test are used from here on.
# NOTE: after this, the feature-drop and split cells cannot be re-run without
# reloading data_merged.
del data_merged

In [9]:
# Wrap both splits as LightGBM datasets; test_data is passed to lgb.train as a
# validation set.
train_data = lgb.Dataset(data=X_train, label=y_train)
test_data = lgb.Dataset(data=X_test, label=y_test)

def save_model_callback(model_prefix, period=100):
    """Build a LightGBM training callback that snapshots the booster every ``period`` rounds.

    Args:
        model_prefix: Path prefix for the snapshots. Files are written as
            ``<model_prefix>_iteration_<n>.txt`` where ``n`` is the number of
            completed boosting rounds.
        period: Positive number of rounds between snapshots.

    Returns:
        A callable that accepts LightGBM's callback environment (it reads
        ``env.iteration`` and ``env.model``).

    Raises:
        ValueError: If ``period`` is smaller than 1.
    """
    if period < 1:
        raise ValueError(f"period must be a positive integer, got {period!r}")

    # Create the target directory up front: the first snapshot is only written
    # after `period` rounds, and a missing directory should not cost that much
    # training time before it is noticed.
    snapshot_dir = os.path.dirname(model_prefix)
    if snapshot_dir:
        os.makedirs(snapshot_dir, exist_ok=True)

    def callback(env):
        # env.iteration is 0-based. Counting completed rounds avoids a
        # one-tree snapshot on the very first round and keeps the file name in
        # step with the "[n]" round numbers in the evaluation log.
        completed_rounds = env.iteration + 1
        if completed_rounds % period == 0:
            # Save the model as it stands after this round.
            model_name = f"{model_prefix}_iteration_{completed_rounds}.txt"
            env.model.save_model(model_name)
    return callback

def create_custom_obj_func(original_price):
    """Create a custom objective with plain squared-error gradients.

    Args:
        original_price: Extra per-row data captured by the closure. The
            objective below does not read it.

    Returns:
        A function ``(preds, train_data) -> (grad, hess)`` where ``grad`` is
        ``preds - labels`` (labels come from ``train_data.get_label()``) and
        ``hess`` is an array of ones with one entry per label.
    """
    def custom_obj_func(preds, train_data):
        targets = train_data.get_label()
        residual = preds - targets
        unit_hessian = np.ones(len(targets))
        return residual, unit_hessian
    return custom_obj_func

# Pass the extra per-row data (original prices) in when creating the custom objective.
custom_obj = create_custom_obj_func(X_train['original_price'].values)


# Training configuration handed to lgb.train below.
# NOTE(review): 'objective' is the custom callable built above (the training
# log reports "Using self-defined objective function"), so
# 'tweedie_variance_power' presumably has no effect here — confirm and remove.
# NOTE(review): 'n_estimators' is presumably treated as the maximum number of
# boosting rounds; early stopping below can end training sooner — confirm.
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': custom_obj,
    'tweedie_variance_power': 1.1,
    'metric': 'rmse',
    'subsample': 0.5,
    'subsample_freq': 1,
    'learning_rate': 0.015,
    'num_leaves': 2**11-1,
    'min_data_in_leaf': 2**12-1,
    'feature_fraction': 0.5,
    'max_bin': 100,
    'n_estimators': 3000,
    'boost_from_average': False,
    'verbose': -1,
    # 'device': 'gpu'
} 

from lightgbm import log_evaluation, early_stopping

# Seeds Python's `random` and NumPy's global RNG only.
# NOTE(review): no seed is set in lgb_params, so this call presumably does not
# influence LightGBM's own row/feature sampling — confirm.
seed_everything(23)

# Log metrics every round, stop after 30 rounds without improvement, and write
# periodic snapshots with the DIR_MODEL_LGB + 'all' prefix.
callbacks = [log_evaluation(period=1), early_stopping(stopping_rounds=30), save_model_callback(DIR_MODEL_LGB + 'all', period=100)]

# train_data is listed in valid_sets as well, so the log shows both the
# training rmse and the held-out (valid_1) rmse.
model = lgb.train(lgb_params, train_data, valid_sets=[train_data, test_data], callbacks=callbacks)

[LightGBM] [Info] Using self-defined objective function
[1]	training's rmse: 8.04867	valid_1's rmse: 9.75617
Training until validation scores don't improve for 30 rounds
[2]	training's rmse: 7.99378	valid_1's rmse: 9.68015
[3]	training's rmse: 7.94008	valid_1's rmse: 9.60631
[4]	training's rmse: 7.88805	valid_1's rmse: 9.53455
[5]	training's rmse: 7.83746	valid_1's rmse: 9.47166
[6]	training's rmse: 7.78673	valid_1's rmse: 9.40339
[7]	training's rmse: 7.73792	valid_1's rmse: 9.33837
[8]	training's rmse: 7.6906	valid_1's rmse: 9.27083
[9]	training's rmse: 7.64351	valid_1's rmse: 9.20249
[10]	training's rmse: 7.59779	valid_1's rmse: 9.13907
[11]	training's rmse: 7.55222	valid_1's rmse: 9.07686
[12]	training's rmse: 7.50775	valid_1's rmse: 9.01816
[13]	training's rmse: 7.46456	valid_1's rmse: 8.95393
[14]	training's rmse: 7.42198	valid_1's rmse: 8.89244
[15]	training's rmse: 7.38161	valid_1's rmse: 8.83834
[16]	training's rmse: 7.34453	valid_1's rmse: 8.79311
[17]	training's rmse: 7.30463

Exception ignored on calling ctypes callback function: <function _log_callback at 0x0000021D9E84AF70>
Traceback (most recent call last):
  File "c:\Users\Yipeng\anaconda3\envs\p8_tf214\lib\site-packages\lightgbm\basic.py", line 203, in _log_callback
    def _log_callback(msg: bytes) -> None:
KeyboardInterrupt: 


No further splits with positive gain, best gain: -inf
[181]	training's rmse: 5.48746	valid_1's rmse: 5.62487
[182]	training's rmse: 5.48569	valid_1's rmse: 5.62125


In [None]:
# Persist the trained booster next to the periodic snapshots.
model.save_model(DIR_MODEL_LGB + 'all.txt')

<lightgbm.basic.Booster at 0x1d7c0ca5280>