In [1]:
# Directory that holds the CSV files read by this notebook (relative path);
# the loading cell builds every input path from it via os.path.join.
dir_data = '../data'

In [2]:
import sys
sys.path.append('../')

from suncapturer.utils import standardize_time

import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime


In [3]:
# ---------------------------------------------------------------------------
# Load the three base tables (generation, forecasts, site metadata) from
# dir_data, in that order.
# ---------------------------------------------------------------------------
gens, fcst, sites = (
    pd.read_csv(os.path.join(dir_data, csv_name))
    for csv_name in ('gens.csv', 'forecasts1.csv', 'sites.csv')
)

# Optionally append extra generation rows when gens_collected.csv is present;
# the combined frame is re-sorted by (id, time) only in that case.
path_gens_collected = os.path.join(dir_data, 'gens_collected.csv')
if os.path.exists(path_gens_collected):
    gens_collected = pd.read_csv(path_gens_collected)
    gens = (
        pd.concat((gens, gens_collected), ignore_index=True)
        .sort_values(['id', 'time'])
        .reset_index(drop=True)
    )

# Optionally append extra forecast rows when fcst_collected.csv is present
# (no re-sorting here).
path_fcst_collected = os.path.join(dir_data, 'fcst_collected.csv')
if os.path.exists(path_fcst_collected):
    fcst_collected = pd.read_csv(path_fcst_collected)
    fcst = pd.concat((fcst, fcst_collected), ignore_index=True)

# ---------------------------------------------------------------------------
# Preprocessing forecasts1.csv
# ---------------------------------------------------------------------------
# Both timestamp columns go through the project helper standardize_time before
# being parsed; the issue time is additionally rounded to the nearest hour.
fcst['fcst_time'] = pd.to_datetime(
    fcst['fcst_time'].apply(standardize_time)
).dt.round('60min')
fcst['time'] = pd.to_datetime(fcst['time'].apply(standardize_time))

# Lead time in whole hours between the target time and the (rounded) issue
# time; astype('int') truncates any fractional part.
fcst['forecast'] = (
    fcst['time']
    .sub(fcst['fcst_time'])
    .dt.total_seconds()
    .div(60.0)
    .div(60.0)
    .astype('int')
)

# ---------------------------------------------------------------------------
# Preprocessing gens.csv
# ---------------------------------------------------------------------------
gens['time'] = pd.to_datetime(gens['time'].apply(standardize_time))

# ---------------------------------------------------------------------------
# Merging fcst and gens
# ---------------------------------------------------------------------------
# Forecast ids are shifted down by one before the join -- presumably the
# forecast file is 1-based while gens/sites are 0-based; verify against data.
fcst['id'] = fcst['id'].sub(1)
# Inner join: forecast rows without a matching generation record are dropped.
fcst = fcst.merge(gens, on=['id', 'time'], how='inner')

# ---------------------------------------------------------------------------
# Feature engineering
# ---------------------------------------------------------------------------
# 1. Capacity
# The lookup is keyed by the row index of sites.csv, so the shifted id is used
# as a row label; an id with no matching row raises KeyError.
gen2capacity = sites['capacity'].to_dict()
fcst['capacity'] = fcst['id'].apply(lambda site_id: gen2capacity[site_id])

# 2. Relative amount (generated amount divided by site capacity)
fcst['relative_amount'] = fcst['amount'].div(fcst['capacity'])

# 3. Time-related features, taken from the target time
for part in ('hour', 'year', 'month', 'day'):
    fcst[part] = getattr(fcst['time'].dt, part)

# 4. Drop-duplication (fully identical rows only), then renumber the index
fcst = fcst.drop_duplicates().reset_index(drop=True)

In [4]:
# Keep only the forecasts whose (hour-rounded) issue time is 16:00.
fcst_16 = fcst[fcst['fcst_time'].dt.hour == 16].reset_index(drop=True)

# Export the rows with a lead time of 9..32 hours (24 hourly steps: 01:00 on
# the day after issue through 00:00 on the day after that).
# The output path is built from dir_data, like every input path, so changing
# dir_data moves inputs and outputs together instead of leaving this file
# behind at a hard-coded location.
fcst_16[(fcst_16['forecast'] > 8) & (fcst_16['forecast'] < 33)].to_csv(
    os.path.join(dir_data, 'forecasts1_16.csv'), index=False
)

In [5]:
# Disabled: variant that would export forecasts issued at 09:00 with lead times of 16..39 hours.
# fcst_9 = fcst[fcst['fcst_time'].dt.hour == 9].reset_index(drop=True)
# fcst_9[(fcst_9['forecast'] > 15) & (fcst_9['forecast'] < 40)].to_csv('../data/forecasts1_9.csv', index=False)