In [1]:
import os
import sys
import numpy as np
import pandas as pd
import lightgbm as lgb
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
import gc

## 数据读取

In [2]:
# Read the three competition tables in one pass; the order of the paths
# matches the order of the names on the left-hand side.
train, test, submit = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/2021-sd-power/train.csv',
        '../input/2021-sd-power/test.csv',
        '../input/2021-sd-power/submit_example.csv',
    )
)
test.head(5)

Unnamed: 0,create_time,id,minute
0,2021-06-06 00:00:00,0,0
1,2021-06-06 01:00:00,0,0
2,2021-06-06 02:00:00,0,0
3,2021-06-06 03:00:00,0,0
4,2021-06-06 04:00:00,0,0


In [3]:
# Station / bus metadata table; it is joined onto the load data via '母线id'.
stations = pd.read_csv(filepath_or_buffer='../input/2021-sd-power/btz_mx.csv')
stations.head(5)

Unnamed: 0,变电站id,地区id,变压器电压等级,母线id,变压器低压侧电压等级
0,0,7,1,0,0
1,0,7,1,1,0
2,1,15,1,7,2
3,1,15,1,8,2
4,2,15,1,9,2


In [4]:
# Attach the station metadata to every training row (left join keeps all rows).
train = train.merge(stations, how='left', on='母线id')

# The test file calls the bus key 'id'; expose it as '母线id' (appended as the
# last column, then 'id' is removed) so the same join applies.
test = (
    test.assign(**{'母线id': test['id']})
    .drop(columns='id')
    .merge(stations, how='left', on='母线id')
)

train.head(5)

Unnamed: 0,v00,v05,v10,v15,v20,v25,v30,v35,v40,v45,v50,v55,create_time,母线id,变电站id,地区id,变压器电压等级,变压器低压侧电压等级
0,0.065512,0.064907,0.061876,0.061266,0.064907,0.063086,0.068547,0.066122,0.066727,0.060661,0.060055,0.066122,2019-01-01 00:00:00,0,0,7,1,0
1,0.060661,0.066727,0.060661,0.067942,0.062481,0.064301,0.05702,0.063086,0.064907,0.065512,0.065512,0.066122,2019-01-01 01:00:00,0,0,7,1,0
2,0.065512,0.066122,0.063086,0.063696,0.063086,0.063086,0.062481,0.063086,0.064907,0.063086,0.064301,0.060661,2019-01-01 02:00:00,0,0,7,1,0
3,0.063696,0.064301,0.065512,0.063696,0.067332,0.065512,0.061266,0.058235,0.063086,0.062481,0.062481,0.064907,2019-01-01 03:00:00,0,0,7,1,0
4,0.066122,0.065512,0.067332,0.063086,0.063086,0.066122,0.064301,0.060661,0.060661,0.066122,0.065512,0.061876,2019-01-01 04:00:00,0,0,7,1,0


## 气象特征提取

In [5]:
# Load the raw weather table and cast its date column to datetime64 in one
# chained expression so the .dt accessors work on it afterwards.
weather = (
    pd.read_csv('../input/2021-sd-power/weather.csv')
    .astype({'日期': 'datetime64[ns]'})
)
def make_weather_fetures(data_df):
    """Turn raw daily weather records into numeric features.

    Adds calendar parts, start/end temperature and their difference, and
    label-encoded start/end wind and weather descriptions.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain a datetime column '日期' and the '/'-separated string
        columns '温度变化', '风向风力变化' and '天气'. The derived columns are
        written onto this frame in place.

    Returns
    -------
    pandas.DataFrame
        A new frame with every column of ``data_df`` except the raw source
        columns '日期', '温度变化', '天气' and '风向风力变化'.

    Raises
    ------
    IndexError
        If a raw value does not contain a '/' separator.
    ValueError
        If a temperature half cannot be parsed as an integer.
    """

    def _half(column, position):
        # Raw values look like '<start>/<end>'; pick one side of the slash.
        return data_df[column].apply(lambda text: text.split('/')[position])

    # Work on the frame that was passed in rather than on a module-level
    # global, so the function behaves the same for any caller.
    data_df['year'] = data_df['日期'].dt.year
    data_df['month'] = data_df['日期'].dt.month
    data_df['day'] = data_df['日期'].dt.day

    # Drop the trailing character of each temperature half before parsing
    # (presumably the unit symbol -- confirm against the raw file).
    data_df['start_temp'] = _half('温度变化', 0).str[:-1].astype(int)
    data_df['end_temp'] = _half('温度变化', 1).str[:-1].astype(int)
    data_df['temp_diff'] = data_df['start_temp'] - data_df['end_temp']

    data_df['start_wind'] = _half('风向风力变化', 0)
    data_df['end_wind'] = _half('风向风力变化', 1)
    data_df['start_weather'] = _half('天气', 0)
    data_df['end_weather'] = _half('天气', 1)

    # Label encoding. Each encoder is fitted on BOTH the start and the end
    # column so that a label occurring only as an end value is still known
    # to transform(); start and end share one code table per feature.
    wind_encoder = LabelEncoder()
    wind_encoder.fit(pd.concat([data_df['start_wind'], data_df['end_wind']]))
    data_df['start_wind'] = wind_encoder.transform(data_df['start_wind'])
    data_df['end_wind'] = wind_encoder.transform(data_df['end_wind'])

    weather_encoder = LabelEncoder()
    weather_encoder.fit(pd.concat([data_df['start_weather'], data_df['end_weather']]))
    data_df['start_weather'] = weather_encoder.transform(data_df['start_weather'])
    data_df['end_weather'] = weather_encoder.transform(data_df['end_weather'])

    return data_df.drop(['日期', '温度变化', '天气', '风向风力变化'], axis=1)

In [6]:
# Build the numeric weather feature table from the raw weather frame.
weather_df = make_weather_fetures(data_df=weather)
weather_df.head(5)

Unnamed: 0,地区id,year,month,day,start_temp,end_temp,temp_diff,start_wind,end_wind,start_weather,end_weather
0,7,2018,10,1,23,12,11,20,20,13,13
1,7,2018,10,2,25,13,12,24,24,13,13
2,7,2018,10,3,26,13,13,24,24,13,13
3,7,2018,10,4,27,14,13,24,24,13,13
4,7,2018,10,5,27,13,14,24,24,13,13


## 时间特征提取

In [7]:
# Cast the timestamp column of both frames in place so the .dt accessors
# can be used on it.
for _frame in (train, test):
    _frame['create_time'] = _frame['create_time'].astype('datetime64[ns]')
def make_time_fetures(data_df):
    """Expand 'create_time' into calendar and time-of-day features.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain a datetime64 column 'create_time'. The plain calendar
        columns (year, month, qua, day, hour, weekday, weekend) are written
        onto this frame in place before the one-hot step creates a new frame.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'create_time' and 'weekday', with the added
        columns year, month, qua, day, hour, weekend, weekday_0..weekday_6
        (uint8 indicators) and daytime (float, 1.0 for hours 8..17 else 0.0).
    """
    data_df['year'] = data_df['create_time'].dt.year
    data_df['month'] = data_df['create_time'].dt.month
    data_df['qua'] = data_df['create_time'].dt.quarter
    data_df['day'] = data_df['create_time'].dt.day
    data_df['hour'] = data_df['create_time'].dt.hour
    data_df['weekday'] = data_df['create_time'].dt.weekday
    data_df['weekend'] = data_df['weekday'].isin([5, 6]).astype(int)

    # Declare all seven weekdays as categories so the one-hot block always has
    # weekday_0..weekday_6, even when a frame does not span a full week;
    # otherwise two frames could end up with different columns.
    data_df['weekday'] = pd.Categorical(data_df['weekday'], categories=list(range(7)))
    # Pin the indicator dtype: the get_dummies default differs between pandas
    # versions (uint8 before 2.0, bool afterwards).
    data_df = pd.get_dummies(data_df, columns=['weekday'], dtype=np.uint8)

    # Flag hours 8..17; kept as float to match the original 0.0/1.0 column.
    hours = np.arange(8, 18)
    data_df['daytime'] = data_df['hour'].isin(hours).astype(float)
    return data_df.drop('create_time', axis=1)

In [8]:
# Derive the calendar / time-of-day features for both frames.
train, test = make_time_fetures(train), make_time_fetures(test)
test.shape

(189912, 20)

In [9]:
# Skip the first 12 columns (the readings v00..v55 in the train preview);
# everything after them is treated as a feature column.
fea_cols = train.columns[12:]

## 转为test对应格式

In [10]:
def _long_slice(value_col, minute):
    """Pair one reading column of `train` (renamed to 'v') with the shared
    feature columns and tag every row with its minute offset."""
    piece = pd.concat(
        [pd.DataFrame(train[value_col].values, columns=['v']), train[fea_cols]],
        axis=1,
    )
    piece['minute'] = minute
    return piece


# One frame per 5-minute reading column (v00, v05, ..., v55). The twelve
# names are kept because later cells concatenate and then delete them.
V00, V05, V10, V15, V20, V25, V30, V35, V40, V45, V50, V55 = (
    _long_slice('v%02d' % minute, minute) for minute in range(0, 60, 5)
)

In [11]:
# Stack the twelve per-minute pieces row-wise into one long training frame.
# The original row labels are kept, so index values repeat across pieces.
train_df = pd.concat(
    [V00, V05, V10, V15, V20, V25, V30, V35, V40, V45, V50, V55],
    axis=0,
)
train_df

Unnamed: 0,v,母线id,变电站id,地区id,变压器电压等级,变压器低压侧电压等级,year,month,qua,day,...,weekend,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,daytime,minute
0,0.065512,0,0,7,1,0,2019,1,1,1,...,0,0,1,0,0,0,0,0,0.0,0
1,0.060661,0,0,7,1,0,2019,1,1,1,...,0,0,1,0,0,0,0,0,0.0,0
2,0.065512,0,0,7,1,0,2019,1,1,1,...,0,0,1,0,0,0,0,0,0.0,0
3,0.063696,0,0,7,1,0,2019,1,1,1,...,0,0,1,0,0,0,0,0,0.0,0
4,0.066122,0,0,7,1,0,2019,1,1,1,...,0,0,1,0,0,0,0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1619246,0.368793,82,39,14,0,0,2021,6,2,5,...,1,0,0,0,0,0,1,0,0.0,55
1619247,0.356265,82,39,14,0,0,2021,6,2,5,...,1,0,0,0,0,0,1,0,0.0,55
1619248,0.338686,82,39,14,0,0,2021,6,2,5,...,1,0,0,0,0,0,1,0,0.0,55
1619249,0.301485,82,39,14,0,0,2021,6,2,5,...,1,0,0,0,0,0,1,0,0.0,55


In [12]:
# Drop the wide frame and the per-minute pieces, then ask the garbage
# collector to reclaim the memory right away.
del train
del V00, V05, V10, V15, V20, V25
del V30, V35, V40, V45, V50, V55
gc.collect()

51

In [13]:
# Join the weather features by region id plus calendar date parts; a left
# join keeps every load row even when no weather record matches.
weather_keys = ['地区id', 'year', 'month', 'day']
train_df = train_df.merge(weather_df, how='left', on=weather_keys)
test = test.merge(weather_df, how='left', on=weather_keys)
test.shape

(189912, 27)

## 训练集数据选择

In [14]:
# Train only on rows from year 2021 with month >= 6.
train_df = train_df.loc[
    lambda frame: (frame['year'] == 2021) & (frame['month'] >= 6)
]

In [15]:
# Stack train and test rows into one frame (test rows have no 'v', so it is
# NaN there). DataFrame.append was removed in pandas 2.0; pd.concat is the
# equivalent that works on both old and new versions.
df = pd.concat([train_df, test])
df

Unnamed: 0,v,母线id,变电站id,地区id,变压器电压等级,变压器低压侧电压等级,year,month,qua,day,...,weekday_6,daytime,minute,start_temp,end_temp,temp_diff,start_wind,end_wind,start_weather,end_weather
20253,0.065265,0,0,7,1,0,2021,6,2,1,...,0,0.0,0,30,22,8,7,7,20,4
20254,0.062469,0,0,7,1,0,2021,6,2,1,...,0,0.0,0,30,22,8,7,7,20,4
20255,0.059534,0,0,7,1,0,2021,6,2,1,...,0,0.0,0,30,22,8,7,7,20,4
20256,0.052954,0,0,7,1,0,2021,6,2,1,...,0,0.0,0,30,22,8,7,7,20,4
20257,0.059107,0,0,7,1,0,2021,6,2,1,...,0,0.0,0,30,22,8,7,7,20,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189907,,82,39,14,0,0,2021,6,2,16,...,0,0.0,55,29,23,6,6,6,4,10
189908,,82,39,14,0,0,2021,6,2,16,...,0,0.0,55,29,23,6,6,6,4,10
189909,,82,39,14,0,0,2021,6,2,16,...,0,0.0,55,29,23,6,6,6,4,10
189910,,82,39,14,0,0,2021,6,2,16,...,0,0.0,55,29,23,6,6,6,4,10


In [16]:
# Split the combined frame back by position. The row count is captured first
# so the second slice does not depend on train_df having just been rebound.
n_train_rows = train_df.shape[0]
train_df, test = df.iloc[:n_train_rows], df.iloc[n_train_rows:]

In [17]:
# Model inputs: every column except the first one. This relies on the target
# 'v' being the first column of train_df, as the displayed frame shows.
feat_cols = train_df.columns[1:]
feat_cols

Index(['母线id', '变电站id', '地区id', '变压器电压等级', '变压器低压侧电压等级', 'year', 'month',
       'qua', 'day', 'hour', 'weekend', 'weekday_0', 'weekday_1', 'weekday_2',
       'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'daytime', 'minute',
       'start_temp', 'end_temp', 'temp_diff', 'start_wind', 'end_wind',
       'start_weather', 'end_weather'],
      dtype='object')

## LGBM

In [18]:
# Columns handed to LightGBM as categorical features.
cat_cols = [
    '母线id',
    '变电站id',
    '地区id',
    '变压器电压等级',
    '变压器低压侧电压等级',
    'start_wind',
    'end_wind',
    'start_weather',
    'end_weather',
    'daytime',
]
# Keep only rows that actually have a target value.
train_df = train_df[train_df['v'].notna()]

In [19]:
from sklearn.metrics import *
import gc
import time

# ---- LightGBM 5-fold cross-validation --------------------------------------
# Trains one booster per fold, stores out-of-fold predictions for scoring and
# averages the per-fold predictions on the test rows.
NFOLD = 5
KF = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=42)
params_lgb = {
    'boosting':'gbdt',
    'objective':'regression',
    'metric':'rmse',
    'random_state':42,
    'learning_rate':0.1,
    'colsample_bytree':0.8, 
    'num_leaves':63,
    'n_jobs':-1,
    'verbose': -1,
}
ycol='v'
oof_lgb = np.zeros(len(train_df))          # out-of-fold prediction per training row
predictions_lgb = np.zeros(len(test))      # running mean of the fold predictions on test
categorical_feature = cat_cols

# Five-fold cross-validation; folds are stratified on the substation id so
# each fold sees a similar mix of substations.
for fold_, (trn_idx, val_idx) in enumerate(KF.split(train_df[feat_cols], train_df['变电站id'])):
    print('*'*15,'fold {}'.format(str(fold_+1)),15*'*')

    # Slice each fold once and reuse the slices below.
    trn_part = train_df.iloc[trn_idx]
    val_part = train_df.iloc[val_idx]

    # Declare the categorical columns on the Dataset itself: newer LightGBM
    # releases no longer accept `categorical_feature` in lgb.train(). The
    # validation set picks up the same setup through `reference`.
    trn_data = lgb.Dataset(
        trn_part[feat_cols],
        label=trn_part[ycol],
        categorical_feature=categorical_feature,
    )
    val_data = lgb.Dataset(val_part[feat_cols], label=val_part[ycol], reference=trn_data)

    clf_lgb = lgb.train(
        params=params_lgb,
        train_set=trn_data,
        valid_sets=[trn_data, val_data],
        valid_names=('train', 'val'),
        num_boost_round=30000,
        callbacks=[
            # Stop once the validation metric has not improved for 30 rounds.
            lgb.early_stopping(30, first_metric_only=True),
            lgb.log_evaluation(500),
        ],
    )

    # Predict with the best iteration found by early stopping.
    oof_lgb[val_idx] = clf_lgb.predict(val_part[feat_cols], num_iteration=clf_lgb.best_iteration)
    predictions_lgb[:] += clf_lgb.predict(test[feat_cols], num_iteration=clf_lgb.best_iteration) / NFOLD

# Out-of-fold MAE scaled by 100; arguments are in (y_true, y_pred) order.
print('LGB Model Score:',mean_absolute_error(train_df[ycol].values,oof_lgb)*100)

*************** fold 1 ***************


New categorical_feature is ['daytime', 'end_weather', 'end_wind', 'start_weather', 'start_wind', '变压器低压侧电压等级', '变压器电压等级', '变电站id', '地区id', '母线id']


Training until validation scores don't improve for 30 rounds
[500]	train's rmse: 0.0159123	val's rmse: 0.0179751
[1000]	train's rmse: 0.0133414	val's rmse: 0.0164084
[1500]	train's rmse: 0.0119959	val's rmse: 0.015684
[2000]	train's rmse: 0.0109975	val's rmse: 0.0152534
[2500]	train's rmse: 0.0102869	val's rmse: 0.0149756
[3000]	train's rmse: 0.00964328	val's rmse: 0.0147475
[3500]	train's rmse: 0.00915024	val's rmse: 0.014597
[4000]	train's rmse: 0.00871938	val's rmse: 0.0144614
[4500]	train's rmse: 0.00836698	val's rmse: 0.0143589
Early stopping, best iteration is:
[4665]	train's rmse: 0.00823628	val's rmse: 0.0143264
Evaluated only: rmse
*************** fold 2 ***************
Training until validation scores don't improve for 30 rounds
[500]	train's rmse: 0.0160687	val's rmse: 0.017978
[1000]	train's rmse: 0.0134653	val's rmse: 0.0162254
[1500]	train's rmse: 0.0119369	val's rmse: 0.0153586
[2000]	train's rmse: 0.0109798	val's rmse: 0.0149267
[2500]	train's rmse: 0.0102552	val's rmse

In [20]:
# Build the submission from the example file and the averaged predictions.
submit = pd.read_csv('../input/2021-sd-power/submit_example.csv')
submit['v']=predictions_lgb
# Floor negative predictions at 0. Use a single .loc assignment instead of
# chained indexing (`submit.v[mask] = 0`), which raises SettingWithCopyWarning
# and does not modify the frame when pandas copy-on-write is enabled.
submit.loc[submit['v'] < 0, 'v'] = 0
submit.to_csv('lgb_baseline.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
