In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import catboost as cb
import time
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, KFold
from matplotlib.pyplot import plot, show, title
import matplotlib.pyplot as plt

import warnings
# Notebook-wide configuration: fix the legacy NumPy global seed, show every
# DataFrame column when printing, and silence all Python warnings.
np.random.seed(1234)
pd.options.display.max_columns = None
warnings.simplefilter('ignore')

# Load the raw competition files (GBK-encoded CSVs): weather + measured power
# for training, weather only for the test period, and per-station metadata.
df_train = pd.read_csv('data/A榜-训练集_海上风电预测_气象变量及实际功率数据.csv', encoding='gbk')
df_test = pd.read_csv('data/A榜-测试集_海上风电预测_气象变量数据.csv', encoding='gbk')

add_df = pd.read_csv('data/A榜-训练集_海上风电预测_基本信息.csv', encoding='gbk')
print(df_test.columns)

# Stack train and test so every feature below is computed identically for both,
# then attach each station's installed capacity.
df = pd.concat([df_train, df_test])
df = df.merge(add_df[['站点编号', '装机容量(MW)']], on='站点编号', how='left')

# Numeric station id: drop the one-character prefix and parse the *whole*
# remainder, so ids with more than one digit (e.g. "f10") stay distinct
# instead of collapsing onto their first digit.
df['站点编号_le'] = df['站点编号'].str[1:].astype('int64')

# Calendar features derived from the timestamp column.
df['time'] = pd.to_datetime(df['时间'])
df['day'] = df['time'].dt.day
df['weekday'] = df['time'].dt.weekday
df['hour'] = df['time'].dt.hour
df['month'] = df['time'].dt.month
df['minute'] = df['time'].dt.minute

# Cyclical encoding of the hour of day. The period is 24 (hours 0..23), so
# 23:00 and 00:00 end up adjacent but distinct; dividing by 23 would map both
# onto exactly the same (sin, cos) point.
hour_angle = df['hour'] / 24 * 2 * np.pi
df['hour_sin'] = np.sin(hour_angle)
df['hour_cos'] = np.cos(hour_angle)

# Index of the 15-minute slot within the day (1 for 00:00, 2 for 00:15, ...),
# computed on whole columns instead of a row-by-row apply.
df['time_96'] = (df['hour'] * 60 + df['minute']) / 15 + 1

# df['num_samples'] = list(range(len(df)))


# for col in ['10米风速（10m/s）','100m风速（100m/s）']:
#     df[col+'_squ'] = df[col] ** 2
#     df[col+'_cub'] = df[col] ** 3
#     df[col+'_mean'] = df[col].mean()
#     df[col+'_max'] = df[col].max()
#     df[col+'_min'] = df[col].min()
#     df[col+'_std'] = df[col].std()
#     df[col+'_25'] = df[col].quantile(0.25)
#     df[col+'_50'] = df[col].median()
#     df[col+'_75'] = df[col].quantile(0.75)


# Per-station daily mean wind speed, broadcast back onto every row of that day.
# The group key is the full calendar date rather than (month, day): the data
# covers more than one year, and keying on month/day alone would average the
# same day of different years together.
calendar_date = df['time'].dt.normalize()
merged_df = df.copy()
for speed_col in ['100m风速（100m/s）', '10米风速（10m/s）']:
    merged_df[speed_col + '_mean'] = (
        merged_df[speed_col]
        .groupby([merged_df['站点编号'], calendar_date])
        .transform('mean')
    )

# Order rows chronologically within each station; the lag features computed
# later rely on this ordering.
merged_df = merged_df.sort_values(by=['站点编号','time'],ascending=[True, True])
df = merged_df.reset_index(drop=True)
print(df[0:5])

# Encode wind direction (degrees) as sine/cosine so that 0° and 360° coincide.
# The trigonometric functions are applied to the whole column at once rather
# than through a per-element Python lambda.
for col in ['10米风向（°)', '100m风向（°)']:
    direction_rad = df[col] / 180 * np.pi
    df[col+"_sin"] = np.sin(direction_rad)
    df[col+"_cos"] = np.cos(direction_rad)
    
print("gap feature")
# Lag distances, in rows (one row per timestamp within a station).
gaps=[1,2,4,7,15,30,50,80]
# Alternative lag set tried previously: [1,2,4,8,16,32,48,96,192,288]
for gap in gaps:
    for col in ['10米风速（10m/s）',
        '100m风速（100m/s）']:
        # Wind speed `gap` rows earlier within the same station, minus the
        # current value. The first `gap` rows of each station have no history
        # and are left as NaN (handled natively by LightGBM) instead of being
        # filled with 0, which would fabricate a gap equal to -current_speed.
        lagged = df[col].groupby(df['站点编号']).shift(gap)
        df[col+f"_gap{gap}"] = lagged - df[col]


# print("one hot encoder")
# for col in ['站点编号']:
#     unique_value=df[col].unique()
#     for value in unique_value:
#         df[col+"_"+str(value)]=(df[col]==value)
#     df.drop([col],axis=1,inplace=True)


LABEL = '出力(MW)'

# Split on time: rows before 2023-05-01 form the training period, the rest is
# the test period.
df_train = df[df['time']< '2023-05-01 0:0:0']
df_test = df[df['time'] >= '2023-05-01 0:0:0']

print(len(df_train))
# Parse the label to numbers. Placeholders such as '<NULL>' and any other
# non-numeric token become NaN and are dropped together with labels that were
# already missing, so no unlabeled row reaches the model. The two printed
# lengths show how many rows were removed.
df_train = df_train.copy()
df_train[LABEL] = pd.to_numeric(df_train[LABEL], errors='coerce').astype('float32')
df_train = df_train.dropna(subset=[LABEL]).reset_index(drop=True)
print(len(df_train))

# Normalize power output by installed capacity.
df_train[LABEL] = df_train[LABEL]/df_train['装机容量(MW)']

# LightGBM settings used for the single model fit in this notebook.
params_lgb = dict(
    learning_rate=0.03,
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    num_leaves=128,
    max_depth=8,
    verbose=1,
    seed=13,
    n_jobs=-1,
    feature_fraction=0.8,
    bagging_fraction=0.9,
    bagging_freq=4,
)

# Candidate values for a grid search. n_estimators, subsample and
# colsample_bytree were considered as well but are currently left out.
hyper_space = dict(
    max_depth=[4, 5, 6, 8, 10, 12],
    num_leaves=[15, 31, 63, 127],
    learning_rate=[0.01, 0.02, 0.03, 0.05],
)


# Time-based hold-out: rows from 2023-02-01 onwards are used for validation,
# everything earlier for fitting.
in_fit_period = df_train['time'] < '2023-02-01 0:0:0'
in_val_period = df_train['time'] >= '2023-02-01 0:0:0'
sub_train_df = df_train[in_fit_period]
sub_val_df = df_train[in_val_period]

# Model inputs: every column except the target, the raw/parsed timestamps,
# the string station id and the minute column.
non_feature_cols = [LABEL, '时间', 'time',  '站点编号', 'minute']
feats = []
for column_name in sub_train_df.columns:
    if column_name not in non_feature_cols:
        feats.append(column_name)

# LightGBM Dataset wrappers for the native lgb.train / lgb.cv API.
train = lgb.Dataset(data=sub_train_df[feats], label=sub_train_df[LABEL])
val = lgb.Dataset(data=sub_val_df[feats], label=sub_val_df[LABEL])
# model = lgb.train(params_lgb, train, valid_sets=[train, val], num_boost_round=5000, callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)])
# cv_result = lgb.cv(params_lgb, train, num_boost_round=1000, nfold=5, shuffle=True)
# Fit on the earlier part of the training period and score on the hold-out.
model = lgb.LGBMRegressor(**params_lgb)
model.fit(sub_train_df[feats], sub_train_df[LABEL])
# gs = GridSearchCV(model, hyper_space, scoring='r2', cv=4, verbose=1)
# gs_result = gs.fit(sub_train_df[feats], sub_train_df[LABEL])
val_pred = model.predict(sub_val_df[feats])

# The model predicts capacity-normalized power; scale both truth and
# prediction back to MW before measuring the error.
val_capacity = sub_val_df['装机容量(MW)']
# RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6, so it is avoided here.
s_mse = np.sqrt(mse(sub_val_df[LABEL]*val_capacity, val_pred*val_capacity))
score = 1/(1+s_mse)
print('score... %.5f'%score, 'rmse...%.5f'%s_mse)
# print('best_paras:', str(gs_result.best_params_))
# print(cv_result)




Index(['站点编号', '时间', '气压(Pa）', '相对湿度（%）', '云量', '10米风速（10m/s）', '10米风向（°)',
       '温度（K）', '辐照强度（J/m2）', '降水（m）', '100m风速（100m/s）', '100m风向（°)'],
      dtype='object')
  站点编号             时间       气压(Pa）  相对湿度（%）        云量  10米风速（10m/s）  10米风向（°)  \
0   f1  2022-1-3 0:00  102249.6094  74.8513  0.007812        7.7041   26.5195   
1   f1  2022-1-3 0:15  102252.0355  74.7530  0.000924        7.7710   23.5766   
2   f1  2022-1-3 0:30  102248.5900  74.4995  0.003009        7.8272   21.5451   
3   f1  2022-1-3 0:45  102240.4725  74.1432  0.011402        7.8637   20.2394   
4   f1  2022-1-3 1:00  102228.8828  73.7366  0.023438        7.8781   19.4870   

      温度（K）  辐照强度（J/m2）     降水（m）  100m风速（100m/s）  100m风向（°) 出力(MW)  装机容量(MW)  \
0  286.0695         0.0  0.000008          9.0820    27.5093  17.26        48   
1  285.8647         0.0  0.000008          9.1374    24.7151  16.78        48   
2  285.6935         0.0  0.000008          9.1856    22.7445  16.25        48   
3  285.5512         

In [1]:
# List installed packages. %pip runs pip for the interpreter backing this
# kernel, whereas !pip runs whichever pip happens to be first on PATH.
%pip list

Package            Version
------------------ ------------
asttokens          2.4.0
backcall           0.2.0
catboost           1.2.3
chinesecalendar    1.9.1
cnlunar            0.1.3
colorama           0.4.6
comm               0.1.4
contourpy          1.1.1
cycler             0.11.0
debugpy            1.8.0
decorator          5.1.1
executing          1.2.0
filelock           3.9.0
fonttools          4.42.1
fsspec             2023.4.0
graphviz           0.20.1
ipykernel          6.29.3
ipython            8.15.0
ipywidgets         8.1.2
jedi               0.19.0
Jinja2             3.1.2
joblib             1.3.2
jupyter_client     8.3.1
jupyter_core       5.3.1
jupyterlab_widgets 3.0.10
kiwisolver         1.4.5
lightgbm           4.3.0
MarkupSafe         2.1.3
matplotlib         3.8.0
matplotlib-inline  0.1.6
mpmath             1.3.0
nest-asyncio       1.5.8
networkx           3.2.1
numpy              1.26.0
packaging          23.1
pandas             2.1.1
parso              0.8.3
pickle


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip
