In [1]:
#import 相关库
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import tqdm
import sys
import os
import gc
import argparse
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 读取数据
train_power_forecast_history = pd.read_csv('./训练集/power_forecast_history.csv')
train_power = pd.read_csv('./训练集/power.csv')
train_stub_info = pd.read_csv('./训练集/stub_info.csv')

test_power_forecast_history = pd.read_csv('./测试集/power_forecast_history.csv')
test_stub_info = pd.read_csv('./测试集/stub_info.csv')

In [52]:
# 聚合数据
train_df = train_power_forecast_history.groupby(['id_encode','ds']).head(1)
del train_df['hour']

test_df = test_power_forecast_history.groupby(['id_encode','ds']).head(1)
del test_df['hour']

tmp_df = train_power.groupby(['id_encode','ds'])['power'].sum()
tmp_df.columns = ['id_encode','ds','power']

# 合并充电量数据
train_df = train_df.merge(tmp_df, on=['id_encode','ds'], how='left')

### 合并数据
train_df = train_df.merge(train_stub_info, on='id_encode', how='left')
test_df = test_df.merge(test_stub_info, on='id_encode', how='left')

In [53]:
train_df['flag'] = train_df['flag'].map({'A':0,'B':1})
test_df['flag'] = test_df['flag'].map({'A':0,'B':1})

In [54]:
train_df.head()

Unnamed: 0,id_encode,ele_price,ser_price,after_ser_price,total_price,f1,f2,f3,ds,power,parking_free,flag,h3,ac_equipment_kw,dc_equipment_kw
0,0,0.64,0.95,0.31,1.59,0.0,0.0,1.0,20220415,2288.224,1.0,0,85309ea7fffffff,0.0,1440.0
1,0,0.64,0.95,0.31,1.59,0.0,0.0,1.0,20220416,2398.573,1.0,0,85309ea7fffffff,0.0,1440.0
2,0,0.64,0.95,0.31,1.59,0.0,0.0,1.0,20220417,2313.033,1.0,0,85309ea7fffffff,0.0,1440.0
3,0,0.64,0.95,0.31,1.59,0.0,0.0,1.0,20220418,2095.3259,1.0,0,85309ea7fffffff,0.0,1440.0
4,0,0.64,0.95,0.31,1.59,0.0,0.0,1.0,20220419,1834.359,1.0,0,85309ea7fffffff,0.0,1440.0


In [21]:
!pip install workalendar





In [55]:
from workalendar.asia import China
cal = China()

# 转换 ds 列为 datetime 类型
train_df['ds'] = pd.to_datetime(train_df['ds'], format='%Y%m%d')
test_df['ds'] = pd.to_datetime(test_df['ds'], format='%Y%m%d')


# 判断日期是否为工作日
train_df['is_working_day'] = train_df['ds'].apply(cal.is_working_day).astype(int)
test_df['is_working_day'] = test_df['ds'].apply(cal.is_working_day).astype(int)
train_df[['ds','is_working_day']].head()

Unnamed: 0,ds,is_working_day
0,2022-04-15,1
1,2022-04-16,0
2,2022-04-17,0
3,2022-04-18,1
4,2022-04-19,1


In [47]:
train_df['ds']

0        2022-04-15
1        2022-04-16
2        2022-04-17
3        2022-04-18
4        2022-04-19
            ...    
149039   2023-04-10
149040   2023-04-11
149041   2023-04-12
149042   2023-04-13
149043   2023-04-14
Name: ds, Length: 149044, dtype: datetime64[ns]

In [6]:
!pip install h3

Collecting h3
  Downloading h3-3.7.6-cp310-cp310-win_amd64.whl (848 kB)
     ---------------------------------------- 0.0/848.6 kB ? eta -:--:--
     ---------------------------------------- 10.2/848.6 kB ? eta -:--:--
     ---------------------------------------- 10.2/848.6 kB ? eta -:--:--
     - ----------------------------------- 30.7/848.6 kB 330.3 kB/s eta 0:00:03
     - ----------------------------------- 30.7/848.6 kB 330.3 kB/s eta 0:00:03
     - ----------------------------------- 30.7/848.6 kB 330.3 kB/s eta 0:00:03
     - ----------------------------------- 41.0/848.6 kB 164.3 kB/s eta 0:00:05
     -- ---------------------------------- 61.4/848.6 kB 233.8 kB/s eta 0:00:04
     ---- -------------------------------- 92.2/848.6 kB 309.1 kB/s eta 0:00:03
     ---- ------------------------------- 112.6/848.6 kB 364.4 kB/s eta 0:00:03
     ---- ------------------------------- 112.6/848.6 kB 364.4 kB/s eta 0:00:03
     ------ ----------------------------- 143.4/848.6 kB 355.0 kB/s



**地理位置**

In [56]:
import h3
train_df["h3"].head()
train_df['latitude'] = train_df['h3'].apply(lambda x: h3.h3_to_geo(x)[0])
train_df['longitude'] = train_df['h3'].apply(lambda x: h3.h3_to_geo(x)[1])
test_df['latitude'] = test_df['h3'].apply(lambda x: h3.h3_to_geo(x)[0])
test_df['longitude'] = test_df['h3'].apply(lambda x: h3.h3_to_geo(x)[1])

In [57]:
def get_time_feature(df, col):
    
    df_copy = df.copy()
    prefix = col + "_"
    # df_copy['new_'+col] = df_copy[col].astype(str)
    # df_copy['new_'+col] = df_copy[col]
    
    # col = 'new_'+col
    # df_copy[col] = pd.to_datetime(df_copy[col], format='%Y%m%d')
    df_copy[prefix + 'year'] = df_copy[col].dt.year
    df_copy[prefix + 'month'] = df_copy[col].dt.month
    df_copy[prefix + 'day'] = df_copy[col].dt.day
    # df_copy[prefix + 'weekofyear'] = df_copy[col].dt.weekofyear
    df_copy[prefix + 'dayofweek'] = df_copy[col].dt.dayofweek
    df_copy[prefix + 'is_wknd'] = df_copy[col].dt.dayofweek // 6
    df_copy[prefix + 'quarter'] = df_copy[col].dt.quarter
    df_copy[prefix + 'is_month_start'] = df_copy[col].dt.is_month_start.astype(int)
    df_copy[prefix + 'is_month_end'] = df_copy[col].dt.is_month_end.astype(int)
    # del df_copy[col]
    
    return df_copy   
    
train_df = get_time_feature(train_df, 'ds')
test_df = get_time_feature(test_df, 'ds')

#cols = [f for f in test_df.columns if f not in ['ds','power','h3']]
cols = [f for f in test_df.columns if f not in ['ds','power','h3']]

In [58]:
train_df.head()

Unnamed: 0,id_encode,ele_price,ser_price,after_ser_price,total_price,f1,f2,f3,ds,power,...,latitude,longitude,ds_year,ds_month,ds_day,ds_dayofweek,ds_is_wknd,ds_quarter,ds_is_month_start,ds_is_month_end
0,0,0.64,0.95,0.31,1.59,0.0,0.0,1.0,2022-04-15,2288.224,...,31.523294,120.096637,2022,4,15,4,0,2,0,0
1,0,0.64,0.95,0.31,1.59,0.0,0.0,1.0,2022-04-16,2398.573,...,31.523294,120.096637,2022,4,16,5,0,2,0,0
2,0,0.64,0.95,0.31,1.59,0.0,0.0,1.0,2022-04-17,2313.033,...,31.523294,120.096637,2022,4,17,6,1,2,0,0
3,0,0.64,0.95,0.31,1.59,0.0,0.0,1.0,2022-04-18,2095.3259,...,31.523294,120.096637,2022,4,18,0,0,2,0,0
4,0,0.64,0.95,0.31,1.59,0.0,0.0,1.0,2022-04-19,1834.359,...,31.523294,120.096637,2022,4,19,1,0,2,0,0


In [62]:
# 使用K折交叉验证训练和验证模型
def cv_model(clf, train_x, train_y, test_x, seed=2023):
    # 定义折数并初始化KFold
    folds = 8
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    
    # 初始化oof预测和测试集预测
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []

    # KFold交叉验证
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
        
        # 转换数据为lightgbm数据格式
        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)

        # 定义lightgbm参数
        params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            'min_child_weight': 5,
            'num_leaves': 2 ** 8,
            'lambda_l2': 10,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 4,
            'learning_rate': 0.01,
            'seed': 2023,
            'nthread' : 16,
            'verbose' : -1,
            'device':'gpu'
        }

        # 训练模型
        model = clf.train(params, train_matrix, 3000, valid_sets=[train_matrix, valid_matrix], categorical_feature=[])
        
        # 获取验证和测试集的预测值
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        
        oof[valid_index] = val_pred
        test_predict += test_pred / kf.n_splits
        
        # 计算并打印当前折的分数
        score = np.sqrt(mean_squared_error(val_pred, val_y))
        cv_scores.append(score)
        print(cv_scores)
        
    return oof, test_predict

# 调用上面的函数进行模型训练和预测
lgb_oof, lgb_test = cv_model(lgb, train_df[cols], train_df['power'], test_df[cols])

************************************ 1 ************************************
[276.56413742463883]
************************************ 2 ************************************
[276.56413742463883, 253.59854625223105]
************************************ 3 ************************************
[276.56413742463883, 253.59854625223105, 275.96803941523433]
************************************ 4 ************************************
[276.56413742463883, 253.59854625223105, 275.96803941523433, 267.89887244245386]
************************************ 5 ************************************
[276.56413742463883, 253.59854625223105, 275.96803941523433, 267.89887244245386, 263.271208690682]
************************************ 6 ************************************
[276.56413742463883, 253.59854625223105, 275.96803941523433, 267.89887244245386, 263.271208690682, 260.53757751004645]
************************************ 7 ************************************
[276.56413742463883, 253.59854625223105, 275.9

In [61]:
#输出赛题提交格式的结果
test_df['power'] = lgb_test
test_df['power'] = test_df['power'].apply(lambda x: 0 if x<0 else x)
test_df['ds'] = test_df['ds'].dt.strftime('%Y%m%d').astype(int)
test_df[['id_encode','ds','power']].to_csv('resultworkday.csv', index=False)