In [1]:
import warnings
# Silence every warning for the rest of the notebook so cell output stays compact.
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
# Use the fully-qualified option names: the short forms 'max_columns' / 'max_rows'
# also match the 'styler.render.*' options in recent pandas releases and are then
# rejected as ambiguous (OptionError).
pd.set_option('display.max_columns', None)  # never truncate columns when displaying frames
pd.set_option('display.max_rows', 100)      # show at most 100 rows before truncating
from tqdm.notebook import tqdm

from multiprocessing import Pool
from fbprophet import Prophet

# Folders (relative to the notebook) that hold the training and test CSV files.
train_dir, test_dir = './data/train', './data/test'

In [2]:
# Training table: rename the '母线id' (bus id) column to 'id' and parse the timestamps.
train = pd.read_csv(f'{train_dir}/train.csv').rename(columns={'母线id': 'id'})
train['create_time'] = pd.to_datetime(train['create_time'])

# Test table and the example submission: only the timestamp column needs parsing.
test = pd.read_csv(f'{test_dir}/test.csv')
ss = pd.read_csv(f'{test_dir}/submit_example.csv')
for frame in (test, ss):
    frame['create_time'] = pd.to_datetime(frame['create_time'])

# Auxiliary tables (substation/bus information, weather data) are left disabled:
# info_data = pd.read_csv(f'{train_dir}/变电站与母线信息表.csv')
# weather_data = pd.read_csv(f'{train_dir}/天气数据.csv')

In [3]:
# The twelve 5-minute reading columns of one hourly row, in chronological order.
_SLOT_COLUMNS = ('v00', 'v05', 'v10', 'v15', 'v20', 'v25',
                 'v30', 'v35', 'v40', 'v45', 'v50', 'v55')
# Spacing between two consecutive reading columns, in minutes.
_SLOT_MINUTES = 5


def _hourly_wide_to_long(sample):
    """Flatten hourly rows (twelve 5-minute columns each) into Prophet's ds/y layout."""
    hour_start = pd.to_datetime(sample['create_time']).to_numpy()
    offsets = np.arange(len(_SLOT_COLUMNS)) * np.timedelta64(_SLOT_MINUTES, 'm')
    # Broadcasting (n_hours, 1) + (1, 12) and ravelling row-major keeps the
    # "row by row, then column by column" order of the input rows.
    stamps = (hour_start[:, None] + offsets[None, :]).ravel()
    values = sample[list(_SLOT_COLUMNS)].to_numpy(dtype=float).ravel()
    return pd.DataFrame({'ds': stamps, 'y': values})


def _format_forecast(forecast, mx_id):
    """Turn a Prophet forecast (ds, yhat) into create_time / id / minute / v rows."""
    minute = forecast['ds'].dt.minute
    # Build a brand-new frame instead of adding columns to a slice of `forecast`,
    # which would trigger pandas' SettingWithCopyWarning.
    return pd.DataFrame({
        # Subtracting the minute component moves each timestamp back to its hour start.
        'create_time': forecast['ds'] - pd.to_timedelta(minute, unit='min'),
        'id': mx_id,
        'minute': minute,
        'v': forecast['yhat'],
    })


def run_prophet(train, mx_id, yearly=False, forecast_end='2021-06-16 23:55:00'):
    """Fit a Prophet model on one id's 5-minute series and forecast up to ``forecast_end``.

    Parameters
    ----------
    train : pandas.DataFrame
        Hourly table with columns ``id``, ``create_time`` (datetime) and the twelve
        reading columns ``v00`` ... ``v55``.
    mx_id
        Value of the ``id`` column whose rows are used for fitting.
    yearly : bool, default False
        Passed to Prophet as ``yearly_seasonality``.
    forecast_end : str or datetime-like, default '2021-06-16 23:55:00'
        Last 5-minute timestamp to forecast. The default is the value that used
        to be hard-coded, so existing callers are unaffected.

    Returns
    -------
    pandas.DataFrame
        Columns ``create_time`` (forecast timestamp minus its minute component),
        ``id``, ``minute`` and ``v`` (Prophet's ``yhat``), one row per forecast step.

    Raises
    ------
    KeyError
        If ``train`` holds no rows for ``mx_id``.
    ValueError
        If ``forecast_end`` is not later than the last observed timestamp.
    """
    sample = train[train['id'] == mx_id]
    if sample.empty:
        raise KeyError(f'no training rows found for id {mx_id!r}')

    # prepare data
    df = _hourly_wide_to_long(sample)

    # Number of 5-minute steps between the last observation and the forecast end.
    horizon = pd.Timestamp(forecast_end) - df['ds'].max()
    periods = int(horizon / pd.Timedelta(minutes=_SLOT_MINUTES))
    if periods <= 0:
        # Fail before the (expensive) fit rather than inside Prophet afterwards.
        raise ValueError(
            f'forecast_end={forecast_end!r} is not after the last observation '
            f'({df["ds"].max()}) for id {mx_id!r}'
        )

    # fit
    m = Prophet(yearly_seasonality=yearly,
                weekly_seasonality=True,
                daily_seasonality=True,
                seasonality_mode='multiplicative',
                uncertainty_samples=0)
    m.fit(df)

    # predict
    future = m.make_future_dataframe(freq='5min', periods=periods, include_history=False)
    forecast = m.predict(future)

    # prepare result
    return _format_forecast(forecast, mx_id)

In [4]:
# Start with about one month of data: keep only rows from 2021-05-05 onwards.
train_df = train.loc[train['create_time'] >= '2021-05-05 00:00:00'].copy()

In [5]:
%%time
# Time one run_prophet call (id 0) on the reduced training window before looping over all ids.
ret = run_prophet(train_df, 0, yearly=False)

CPU times: user 20.5 s, sys: 1.53 s, total: 22 s
Wall time: 19.6 s


In [6]:
# One forecast frame per id that appears in the test set; tqdm renders the progress bar.
predictions = [
    run_prophet(train_df, id_, yearly=False)
    for id_ in tqdm(test['id'].unique())
]

HBox(children=(FloatProgress(value=0.0, max=81.0), HTML(value='')))




In [12]:
# Stack the per-id forecast frames into one long table, then inspect its size and first rows.
df_predictions = pd.concat(predictions, axis=0, ignore_index=False)
print(df_predictions.shape)
df_predictions.head(n=20)

(273744, 4)


Unnamed: 0,create_time,id,minute,v
0,2021-06-05 00:00:00,0,0,0.076692
1,2021-06-05 00:00:00,0,5,0.076321
2,2021-06-05 00:00:00,0,10,0.075974
3,2021-06-05 00:00:00,0,15,0.075654
4,2021-06-05 00:00:00,0,20,0.075362
5,2021-06-05 00:00:00,0,25,0.075098
6,2021-06-05 00:00:00,0,30,0.074864
7,2021-06-05 00:00:00,0,35,0.074659
8,2021-06-05 00:00:00,0,40,0.074483
9,2021-06-05 00:00:00,0,45,0.074337


In [15]:
# Replace the placeholder 'v' of the example submission with the Prophet forecasts.
ss = ss.drop(columns=['v'])
print(ss.shape)
# validate='many_to_one' makes pandas raise if a (create_time, id, minute) key occurs
# more than once in df_predictions, which would otherwise silently duplicate rows.
ss = ss.merge(df_predictions, on=['create_time', 'id', 'minute'], how='left',
              validate='many_to_one')
print(ss.shape)
# A left merge leaves NaN wherever no forecast matched; report it instead of letting
# it slip unnoticed into the submission (print, because warnings are globally silenced).
n_missing = int(ss['v'].isna().sum())
if n_missing:
    print(f'WARNING: {n_missing} submission rows have no matching prediction (v is NaN)')
ss.head(20)

(189912, 3)
(189912, 4)


Unnamed: 0,create_time,id,minute,v
0,2021-06-06 00:00:00,0,0,0.058851
1,2021-06-06 01:00:00,0,0,0.05611
2,2021-06-06 02:00:00,0,0,0.056786
3,2021-06-06 03:00:00,0,0,0.0569
4,2021-06-06 04:00:00,0,0,0.053008
5,2021-06-06 05:00:00,0,0,0.047043
6,2021-06-06 06:00:00,0,0,0.045774
7,2021-06-06 07:00:00,0,0,0.054693
8,2021-06-06 08:00:00,0,0,0.072075
9,2021-06-06 09:00:00,0,0,0.089273


In [16]:
# Write the merged submission table to CSV without the DataFrame index column.
ss.to_csv('prophet_1mon_baseline.csv', index=False)