In [11]:
import pandas as pd
import numpy as np
from datetime import timedelta
from electricity_price_predictor.data import get_shifted_load, get_shifted_price

## Function to retrieve weather data

In [5]:
def get_weather(path='../raw_data/weather_2015_2020.csv',
                start='2015-01-01', end='2020-11-24'):
    """Build an hourly, population-weighted weather frame from a per-city CSV.

    The CSV must contain a ``dt`` column, a ``city_name`` column, the numeric
    weather columns listed in ``numeric_cols`` and every column listed in
    ``to_drop``. For each timestamp the numeric columns are averaged across
    cities, weighted by city population. Every hour of the ``start``..``end``
    range that is absent from the result is then filled with the mean of the
    rows one hour before and one hour after it.

    Parameters
    ----------
    path : str
        Location of the per-city weather CSV.
    start, end : str or datetime-like
        Inclusive bounds of the hourly range that is checked for gaps.

    Returns
    -------
    pandas.DataFrame
        One row per timestamp (index ``dt``, sorted chronologically) with the
        columns of ``numeric_cols``.

    Raises
    ------
    KeyError
        If an expected column is absent, a city has no entry in the
        population table, or a missing hour has no row exactly one hour
        before or after it.
    """
    df = pd.read_csv(path)
    df['dt'] = pd.to_datetime(df.dt)

    # drop unnecessary columns
    to_drop = ['dt_iso', 'timezone', 'lat', 'lon', 'sea_level', 'grnd_level',
               'rain_1h', 'rain_3h', 'pressure', 'snow_1h', 'snow_3h',
               'temp_min', 'temp_max', 'weather_id', 'weather_description',
               'weather_icon']
    df = df.drop(to_drop, axis=1)

    # population of each city in the df
    pop = {'Aarhus': 349_983,
           'Odense': 204_895,
           'Aalborg': 217_075,
           'Esbjerg': 115_748,
           'Vejle': 111_743,
           'Randers': 96_559,
           'Viborg': 93_819,
           'Kolding': 89_412,
           'Silkeborg': 89_328,
           'Herning': 86_348,
           'Horsens': 83_598}

    # a list comprehension (not .map) so an unknown city fails loudly
    df['population'] = [pop[city] for city in df.city_name]

    # numeric weather values as they affect demand or supply
    numeric_cols = ['temp', 'feels_like', 'humidity', 'clouds_all',
                    'wind_speed', 'wind_deg']

    # for each numeric column, average across cities per timestamp using the
    # city population as weight; selecting the two needed columns keeps the
    # grouping key out of the frame handed to the lambda
    by_hour = df.groupby('dt')
    weather_df = pd.DataFrame({
        col: by_hour[[col, 'population']].apply(
            lambda g, col=col: np.average(g[col], weights=g['population']))
        for col in numeric_cols
    })

    # check for missing indices ('h' is the hourly alias; 'H' is deprecated)
    expected_idx = pd.date_range(start=start, end=end, freq='h')
    missing_idx = expected_idx.difference(weather_df.index)

    # impute missing indices with the average of the bounding rows; the sum
    # is parenthesised so that both neighbours are halved, not only the later
    one_hour = timedelta(hours=1)
    for idx in missing_idx:
        weather_df.loc[idx] = (weather_df.loc[idx - one_hour]
                               + weather_df.loc[idx + one_hour]) / 2

    # imputed rows are appended at the end, so restore chronological order
    return weather_df.sort_index()

In [6]:
# Build the population-weighted hourly weather frame from the default CSV path.
df = get_weather()

In [7]:
# Preview the first rows of the aggregated weather frame (indexed by `dt`).
df.head()

Unnamed: 0_level_0,temp,feels_like,humidity,clouds_all,wind_speed,wind_deg
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-01 00:00:00,5.432478,-0.154827,86.7587,83.913979,5.937703,240.418412
2015-01-01 01:00:00,5.488014,0.354168,87.139723,91.950524,5.322343,234.663941
2015-01-01 02:00:00,5.599633,0.307204,87.786899,89.471946,5.605773,237.303816
2015-01-01 03:00:00,6.23438,0.99153,87.241639,88.896068,5.681109,237.16409
2015-01-01 04:00:00,6.305504,1.011242,88.078906,92.107249,5.809829,236.33376


## Test `get_weather` after packaging

In [2]:
from electricity_price_predictor.data import get_weather

In [1]:
import pandas as pd

In [7]:
# Load the past-weather CSV before displaying it, so this cell does not rely
# on `df_past` left over from a cell that appears further down the notebook.
df_past = pd.read_csv('../raw_data/past_weather.csv')
df_past

Unnamed: 0,dt,temp,humidity,wind_speed
0,2020-11-25 01:00:00,9.218136,88.541674,4.046357
1,2020-11-25 02:00:00,8.845358,89.452965,3.966888
2,2020-11-25 03:00:00,8.294952,90.971981,4.041120
3,2020-11-25 04:00:00,7.593209,91.272988,3.463064
4,2020-11-25 05:00:00,7.004944,93.699781,3.255737
...,...,...,...,...
115,2020-11-29 20:00:00,0.718468,89.993295,0.993587
116,2020-11-29 21:00:00,0.671744,89.882188,0.872799
117,2020-11-29 22:00:00,0.667119,89.610830,1.440935
118,2020-11-29 23:00:00,0.484776,89.635635,1.408570


In [6]:
# Read the supplementary past-weather CSV; `dt` is still a plain column here.
df_past = pd.read_csv('../raw_data/past_weather.csv')

In [14]:
# Rebuild the aggregated weather frame using the default CSV path.
# NOTE(review): in document order `get_weather` here resolves to the version
# imported from `electricity_price_predictor.data` — confirm that is intended.
df = get_weather()

In [18]:
# Drop two columns that are absent from `df_past`. errors='ignore' keeps the
# cell idempotent: re-running it after the columns are gone no longer raises.
df = df.drop(columns=['feels_like', 'clouds_all'], errors='ignore')

In [15]:
# Display the past-weather frame as loaded (integer index, `dt` as a column).
df_past

Unnamed: 0,dt,temp,humidity,wind_speed
0,2020-11-25 01:00:00,9.218136,88.541674,4.046357
1,2020-11-25 02:00:00,8.845358,89.452965,3.966888
2,2020-11-25 03:00:00,8.294952,90.971981,4.041120
3,2020-11-25 04:00:00,7.593209,91.272988,3.463064
4,2020-11-25 05:00:00,7.004944,93.699781,3.255737
...,...,...,...,...
115,2020-11-29 20:00:00,0.718468,89.993295,0.993587
116,2020-11-29 21:00:00,0.671744,89.882188,0.872799
117,2020-11-29 22:00:00,0.667119,89.610830,1.440935
118,2020-11-29 23:00:00,0.484776,89.635635,1.408570


In [22]:
# Parse the `dt` strings into timestamps and promote them to the index in a
# single chained expression.
df_past = df_past.assign(dt=pd.to_datetime(df_past['dt'])).set_index('dt')

In [25]:
# Stack the aggregated history and the past-weather rows into one frame;
# both are indexed by `dt` at this point.
trial = pd.concat([df, df_past])

In [26]:
# List every expected hourly timestamp that is absent from the combined frame.
# 'h' is the hourly alias; the upper-case 'H' spelling is deprecated in
# recent pandas releases.
pd.date_range(start='2015-01-01', end='2020-11-29', freq='h').difference(trial.index)

DatetimeIndex(['2020-11-25'], dtype='datetime64[ns]', freq=None)

In [31]:
def get_weather(path='../raw_data/weather_2015_2020.csv',
                past_path='../raw_data/past_weather.csv',
                start='2015-01-01', end='2020-11-29'):
    """Build an hourly, population-weighted weather frame and extend it with
    a supplementary past-weather CSV.

    The main CSV must contain a ``dt`` column, a ``city_name`` column, the
    columns listed in ``numeric_cols`` and every column listed in
    ``to_drop``. For each timestamp the numeric columns are averaged across
    cities, weighted by city population. The rows of ``past_path`` (indexed
    by its ``dt`` column) are appended, and every hour of the
    ``start``..``end`` range still absent is filled with the mean of the rows
    one hour before and one hour after it.

    Parameters
    ----------
    path : str
        Location of the per-city weather CSV.
    past_path : str
        Location of the supplementary CSV that is appended as-is after its
        ``dt`` column becomes the index.
    start, end : str or datetime-like
        Inclusive bounds of the hourly range that is checked for gaps.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by the ``dt`` index.

    Raises
    ------
    KeyError
        If an expected column is absent, a city has no entry in the
        population table, or a missing hour has no row exactly one hour
        before or after it.
    """
    df = pd.read_csv(path)
    df['dt'] = pd.to_datetime(df.dt)

    # drop unnecessary columns
    to_drop = ['dt_iso', 'timezone', 'lat', 'lon', 'sea_level', 'grnd_level',
               'rain_1h', 'rain_3h', 'pressure', 'snow_1h', 'snow_3h',
               'temp_min', 'temp_max', 'weather_id', 'weather_description',
               'weather_icon', 'wind_deg', 'feels_like', 'clouds_all']
    df = df.drop(to_drop, axis=1)

    # population of each city in the df
    pop = {'Aarhus': 349_983,
           'Odense': 204_895,
           'Aalborg': 217_075,
           'Esbjerg': 115_748,
           'Vejle': 111_743,
           'Randers': 96_559,
           'Viborg': 93_819,
           'Kolding': 89_412,
           'Silkeborg': 89_328,
           'Herning': 86_348,
           'Horsens': 83_598}

    # a list comprehension (not .map) so an unknown city fails loudly
    df['population'] = [pop[city] for city in df.city_name]

    # numeric weather values as they affect demand or supply
    numeric_cols = ['temp', 'humidity', 'wind_speed']

    # for each numeric column, average across cities per timestamp using the
    # city population as weight; selecting the two needed columns keeps the
    # grouping key out of the frame handed to the lambda
    by_hour = df.groupby('dt')
    weather_df = pd.DataFrame({
        col: by_hour[[col, 'population']].apply(
            lambda g, col=col: np.average(g[col], weights=g['population']))
        for col in numeric_cols
    })

    # supplementary rows, indexed by their timestamp
    df_past = pd.read_csv(past_path)
    df_past['dt'] = pd.to_datetime(df_past.dt)
    df_past = df_past.set_index('dt')

    # concat data
    weather_df = pd.concat([weather_df, df_past])

    # check for missing indices ('h' is the hourly alias; 'H' is deprecated)
    expected_idx = pd.date_range(start=start, end=end, freq='h')
    missing_idx = expected_idx.difference(weather_df.index)

    # impute missing indices with the average of the bounding rows; the sum
    # is parenthesised so that both neighbours are halved, not only the later
    one_hour = timedelta(hours=1)
    for idx in missing_idx:
        weather_df.loc[idx] = (weather_df.loc[idx - one_hour]
                               + weather_df.loc[idx + one_hour]) / 2

    # imputed rows are appended at the end, so restore chronological order
    return weather_df.sort_index()

In [32]:
# Call the function defined in the cell above; the notebook defines no
# `get_weather_2`, so the previous name raised NameError on a fresh kernel.
test = get_weather()

In [33]:
# Confirm that no hourly timestamp in the expected range is missing from the
# result. 'h' is the hourly alias; the upper-case 'H' spelling is deprecated
# in recent pandas releases.
pd.date_range(start='2015-01-01', end='2020-11-29', freq='h').difference(test.index)

DatetimeIndex([], dtype='datetime64[ns]', freq=None)