In [1]:
%pip install catboost xgboost lightgbm

Looking in indexes: https://mirrors.cernet.edu.cn/pypi/web/simple
Collecting catboost
  Downloading https://mirrors.jlu.edu.cn/pypi/web/packages/e8/37/3afd3c02798734efcd7840bfa872d3efc06f5d5c92f9613fea3ff5b4311f/catboost-1.2.3-cp311-cp311-win_amd64.whl (101.1 MB)
     ---------------------------------------- 0.0/101.1 MB ? eta -:--:--
      -------------------------------------- 1.8/101.1 MB 39.0 MB/s eta 0:00:03
     -- ------------------------------------ 5.3/101.1 MB 56.4 MB/s eta 0:00:02
     --- ----------------------------------- 8.9/101.1 MB 63.1 MB/s eta 0:00:02
     ---- --------------------------------- 11.2/101.1 MB 65.6 MB/s eta 0:00:02
     ----- -------------------------------- 14.5/101.1 MB 65.6 MB/s eta 0:00:02
     ------ ------------------------------- 17.4/101.1 MB 65.2 MB/s eta 0:00:02
     ------- ------------------------------ 20.1/101.1 MB 59.5 MB/s eta 0:00:02
     -------- ----------------------------- 23.5/101.1 MB 65.6 MB/s eta 0:00:02
     --------- --------

In [39]:
from numpy import asarray
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from xgboost import XGBRegressor

# transform a time series dataset into a supervised learning dataset
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning matrix.

    Only the first column of ``data`` is lagged/led; any remaining columns are
    carried through unchanged and placed first in the result.

    Args:
        data: 2-D array-like whose rows are time steps, or a 1-D sequence
            holding a single series (list, ndarray or pandas Series).
        n_in: number of lagged values (t-n_in ... t-1) of the first column.
        n_out: number of forecast values (t ... t+n_out-1) of the first column.
        dropnan: when True, drop every row containing NaN, which includes the
            rows left incomplete by shifting.

    Returns:
        numpy.ndarray whose columns are ordered as
        [data columns 1.., t-n_in, ..., t-1, t, ..., t+n_out-1].
    """
    df = DataFrame(data)
    target = df.iloc[:, 0]
    # input sequence (t-n, ... t-1), oldest lag first
    shifted = [target.shift(i) for i in range(n_in, 0, -1)]
    # forecast sequence (t, t+1, ... t+n)
    shifted += [target.shift(-i) for i in range(n_out)]
    agg = concat(shifted, axis=1)
    # prepend the remaining (non-lagged) columns, if there are any
    if df.shape[1] > 1:
        agg = concat([df.iloc[:, 1:], agg], axis=1)
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg.values


# Load the dataset: row 0 is the header and the first CSV column becomes the index.
series = read_csv('data/Area1_Weather_Load_copy.csv', header=0, index_col=0)
values = series.values
# Transform the time series into a supervised-learning matrix, adding 24 lagged
# values of the first column.
data = series_to_supervised(values, n_in=24)
print(data.shape)
# Print the first sample.
print(data[0])


(26472, 30)
[  20.         13.         16.         59.          0.       3967.259968
 3563.567968 3283.864576 3101.586496 2972.481952 2872.967488 2825.230432
 2721.898528 2674.310752 2940.903712 3182.945824 3328.061152 3179.526496
 2905.11856  2820.243232 2820.690112 2944.5592   3173.022304 3635.882464
 3777.948928 3690.874336 3705.40864  3621.380512 3257.207776 2755.238848]


In [43]:
from math import sqrt
from sklearn.metrics import mean_squared_error

m_all = len(data) # total number of rows
m_val = int(0.2*m_all) # validation rows
m_test = int(0.2*m_all) # test rows
m_train = m_all - m_test - m_val # training rows
# Split contiguous row ranges (train | test | validation) into input and output
# columns; the last column is the target. `data` is the supervised matrix built
# above -- the name `train` is never defined in this notebook.
trainX, trainy = data[:m_train, :-1], data[:m_train, -1]
testX, testy = data[m_train:m_train+m_test, :-1], data[m_train:m_train+m_test, -1]
valX, valy = data[m_train+m_test:, :-1], data[m_train+m_test:, -1]
print(trainX.shape, trainy.shape)
# fit model
model = XGBRegressor(objective='reg:squarederror', n_estimators=1000)
model.fit(trainX, trainy)
# in-sample predictions
yhat = model.predict(trainX)
# report the error on the two held-out ranges
test_rmse = sqrt(mean_squared_error(testy, model.predict(testX)))
val_rmse = sqrt(mean_squared_error(valy, model.predict(valX)))
print('Test RMSE: %.3f' % test_rmse)
print('Validation RMSE: %.3f' % val_rmse)


(15884, 29) (15884,)
Test RMSE: 161.577
Validation RMSE: 360.874


In [None]:
import os
import shutil
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss, mean_squared_log_error
import xgboost as xgb
import lightgbm as lgb
import tqdm, sys, os, gc, argparse, warnings
warnings.filterwarnings('ignore')

In [2]:

import pandas as pd

def normalize_data(df):
    """Standardise ``df`` column-wise: subtract each column's mean, divide by its std."""
    centered = df - df.mean()
    spread = df.std()
    return centered / spread

# Read the dataset and discard its 'Time' column.
df = pd.read_csv('data/Area1_Weather_Load_copy.csv')
df = df.drop(columns='Time')

# Standardise every column, then cut the rows into train / validation / test.
# NOTE(review): the mean/std are computed over all rows, i.e. before the split --
# confirm that sharing statistics with the held-out rows is intended.
df = normalize_data(df)
m_all = len(df)                    # total number of rows
m_val = int(0.2 * m_all)           # validation rows
m_test = int(0.2 * m_all)          # test rows
m_train = m_all - m_test - m_val   # training rows

train_df = df.iloc[:m_train]
val_df = df.iloc[m_train:m_train + m_val]
test_df = df.iloc[m_train + m_val:]
print(train_df.shape, val_df.shape, test_df.shape)
print(train_df.head())

(15928, 6) (5308, 6) (5308, 6)
       Load  Max_Temperature  Min_Temperature  Avg_Temperature  Avg_Humidity  \
0 -1.410233        -1.080341        -1.264894        -1.122562     -0.815403   
1 -1.603932        -1.080341        -1.264894        -1.122562     -0.815403   
2 -1.738138        -1.080341        -1.264894        -1.122562     -0.815403   
3 -1.825598        -1.080341        -1.264894        -1.122562     -0.815403   
4 -1.887545        -1.080341        -1.264894        -1.122562     -0.815403   

   Rainfall  
0 -0.344987  
1 -0.344987  
2 -0.344987  
3 -0.344987  
4 -0.344987  


In [None]:
def cv_model(clf, train_x, train_y, test_x, clf_name, seed = 2024, device = "CPU"):
    """Cross-validate one 3-class boosting model and average its test predictions.

    Args:
        clf: library handle matching ``clf_name`` -- the ``lightgbm`` module for
            "lgb", the ``xgboost`` module for "xgb", or the
            ``CatBoostClassifier`` class for "cat".
        train_x: pandas DataFrame of training features (rows are picked with
            ``.iloc``).
        train_y: class labels aligned row-for-row with ``train_x``; expected to
            be encoded as 0, 1, 2 because they are scored against argmax
            indices.
        test_x: feature table scored by every fold's model.
        clf_name: one of "lgb", "xgb" or "cat".
        seed: kept for call compatibility; unused because the folds are not
            shuffled and each model carries its own fixed seed.
        device: CatBoost ``task_type`` used by the "cat" branch.

    Returns:
        Tuple ``(oof, test_predict)``: out-of-fold class probabilities of shape
        (len(train_x), 3) and fold-averaged test probabilities of shape
        (len(test_x), 3).

    Raises:
        ValueError: if ``clf_name`` is not a supported model name.
    """
    if clf_name not in ("lgb", "xgb", "cat"):
        raise ValueError("clf_name must be 'lgb', 'xgb' or 'cat', got {!r}".format(clf_name))

    folds = 5
    n_classes = 3
    # scikit-learn rejects random_state unless shuffle=True, so it is not passed.
    kf = KFold(n_splits=folds, shuffle=False)
    # KFold yields positional indices; a plain array makes the label lookup
    # positional too, whatever index train_y carries.
    labels = np.asarray(train_y)
    oof = np.zeros([train_x.shape[0], n_classes])
    test_predict = np.zeros([test_x.shape[0], n_classes])
    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, labels)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y = train_x.iloc[train_index], labels[train_index]
        val_x, val_y = train_x.iloc[valid_index], labels[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'multiclass',
                'num_class': n_classes,
                'min_child_weight': 6,
                'num_leaves': 2 ** 6,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2023,
                'nthread' : -1,
                'verbose' : -1,
                'min_gain_to_split': 10,
                'early_stopping_round': 50,
            }
            # NOTE(review): only 10 boosting rounds are requested, so the
            # 50-round early-stopping patience can never trigger -- confirm the
            # intended round count.
            model = clf.train(params, train_matrix, 10, valid_sets=[train_matrix, valid_matrix],
                              categorical_feature=[])
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        elif clf_name == "xgb":
            xgb_params = {
              'booster': 'gbtree',
              'objective': 'multi:softprob',
              'num_class': n_classes,
              'max_depth': 5,
              'lambda': 20,
              'subsample': 0.7,
              'colsample_bytree': 0.7,
              'colsample_bylevel': 0.7,
              'eta': 0.05,
              'seed': 520,
              'nthread': -1,
              # The dict literal used to list 'tree_method' twice; the later
              # 'gpu_hist' entry was the one in effect, so it is the one kept.
              'tree_method': 'gpu_hist',
              }
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)

            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]

            model = clf.train(xgb_params, train_matrix, num_boost_round=2000, evals=watchlist)
            val_pred  = model.predict(valid_matrix)
            test_pred = model.predict(test_matrix)

        else:  # "cat"
            params = {'learning_rate': 0.1, 'depth': 5, 'bootstrap_type':'Bernoulli','random_seed':2023,
                      'od_type': 'Iter', 'od_wait': 100,  'allow_writing_files': False,
                      'loss_function': 'MultiClass', "task_type": device}

            model = clf(iterations=1000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      metric_period=50,
                      use_best_model=True,
                      cat_features=[],
                      verbose=1)

            val_pred  = model.predict_proba(val_x)
            test_pred = model.predict_proba(test_x)

        # Store this fold's validation probabilities and accumulate the test average.
        oof[valid_index] = val_pred
        test_predict += test_pred / kf.n_splits

        F1_score = f1_score(val_y, np.argmax(val_pred, axis=1), average='macro')
        cv_scores.append(F1_score)
        print(cv_scores)

    return oof, test_predict
    
# Replace NaN values in the train/test frames with 0, then do the same for +/-Inf.
train_df = train_df.fillna(0).replace([np.inf, -np.inf], 0)
test_df = test_df.fillna(0).replace([np.inf, -np.inf], 0)

# Model input features: every test column except 'uuid', 'time' and 'file'.
# NOTE(review): the frame printed by the earlier cell shows no label_* columns --
# confirm train_df/test_df are the intended inputs for this loop.
cols = [f for f in test_df.columns if f not in ('uuid', 'time', 'file')]
for label in ['label_5','label_10','label_20','label_40','label_60']:
    print(f'=================== {label} ===================')
    # LightGBM variant (disabled):
    #lgb_oof, lgb_test = cv_model(lgb, train_df[cols], train_df[label], test_df[cols], 'lgb')
    # XGBoost variant (active):
    xgb_oof, xgb_test = cv_model(xgb, train_df[cols], train_df[label], test_df[cols], 'xgb')
    # CatBoost variant (disabled):
    #cat_oof, cat_test = cv_model(CatBoostClassifier, train_df[cols], train_df[label], test_df[cols], 'cat')

    # Blend by simple averaging (disabled while only XGBoost is run):
    #final_test = (lgb_test + xgb_test + cat_test) / 3
    final_test = xgb_test
    # The predicted class is the index of the highest averaged probability.
    test_df[label] = np.argmax(final_test, axis=1)