In [1]:
import pandas as pd
import numpy as np
import re
import os
from datetime import datetime, timedelta
from sklearn import linear_model as lm
pd.options.mode.chained_assignment = None

In [2]:
def _create_datetime(row):
    """Helper Function

    Parameters
    ----------
    row : TYPE
        Description
    row : pd.Series

    Returns
    -------
    pd.Datetime
    """
    date = row.Datum.strftime("%Y-%m-%d") + " " + row.Uhrzeit
    return date

In [3]:
def read_data_stromfluss(data_dir):
    """Read data from directory, get rid of '-' and NaN values, export a dataframe"""

    dateparse = lambda x: pd.datetime.strptime(x, '%d.%m.%Y')
    numberparse = lambda x: pd.np.float(x.replace(".", "").replace(",", ".")) if x != "-" else np.nan
    convert_thousand = {num: numberparse for num in np.arange(2, 22)}

    import_files = os.listdir(data_dir)

    for i in range(len(import_files)):
        if i == 0:
            df = pd.read_csv(os.path.join(data_dir, import_files[i]), sep=';', decimal=',', thousands='.',
                             parse_dates=['Datum'], date_parser=dateparse, converters=convert_thousand)
        else:
            df = df.append(pd.read_csv(os.path.join(data_dir, import_files[i]), sep=';', decimal=',', thousands='.',
                                       parse_dates=['Datum'], date_parser=dateparse, converters=convert_thousand))

    df.replace('-', 0, inplace=True)
    df.fillna(0, inplace=True)

    return df

In [4]:
def preprocessing_stromfluss(df, basic = False):
    """Preprocessing für Stromfluss Datansatz von Smard

    Parameters
    ----------
    df : pd.DataFrame
        Stromfluss Datensatz von SMARD eingelesen und unbearbeitet

    Returns
    -------
    pd.DataFrame
        Stromfluss Datensatz von SMARD aufbereitet zur weiteren Verwendung
    """

    # Time Formatting
    df["Date"] = df.apply(lambda row: _create_datetime(row), axis=1)
    df["Date"] = pd.to_datetime(df.Date, format="%Y-%m-%d %H:%M")
    df = df.sort_values("Date")

    cols = list(df)
    cols.insert(0, cols.pop(cols.index("Date")))
    df = df.loc[:, cols]

    # Rename columns
    countries = {
        "Niederlande": "NL",
        "Schweiz": "CHE",
        "Dänemark": "DNK",
        "Tschechien": "CZE",
        "Luxemburg": "LUX",
        "Schweden": "SWE",
        "Österreich": "AUT",
        "Frankreich": "FRA",
        "Polen": "PL",
    }

    types = {"Import": "IM", "Export": "EX"}

    type_pattern = r"\((.*?)\)"
    country_pattern = r"(.*?) "

    df.columns = list(df.columns[0:3]) + ['NX'] + [countries.get(re.search(country_pattern, col).group(1))+ "_"+ types.get(re.search(type_pattern, col).group(1)) for col in df.columns[4::]]

    # Netto Export
    df["NX"] = df.iloc[:, 4:].sum(axis = 1)
    
    #Drop columns
    df.drop(df.columns[1:3], axis=1, inplace = True)

    #Export only Datetime and NX for basic analysis
    if basic:
        df = df[['Date', 'NX']]

    return df

In [5]:
df = read_data_stromfluss('data/stromfluss')

In [6]:
df = preprocessing_stromfluss(df, True)

In [7]:
df['Year'] = df['Date'].apply(lambda x: x.year)
df['Month'] = df['Date'].apply(lambda x: x.month)
df['Day'] = df['Date'].apply(lambda x: x.month)
df['Hour'] = df['Date'].apply(lambda x: x.hour)

In [8]:
df.head()

Unnamed: 0,Date,NX,Year,Month,Day,Hour
0,2015-06-01 00:00:00,7279.0,2015,6,6,0
1,2015-06-01 01:00:00,7167.0,2015,6,6,1
2,2015-06-01 02:00:00,6667.0,2015,6,6,2
3,2015-06-01 03:00:00,6413.0,2015,6,6,3
4,2015-06-01 04:00:00,5347.0,2015,6,6,4


In [9]:
def predict_validate(model, df, dateList):
    df['NX P'] = np.nan
    df['AE'] = np.nan
    
    if model == 'lm':
        for date in dateList:
            df = predict_lm(df, date)
            
    else:
        print('AA')
        #TBD
    
    return df

In [10]:
def predict_lm(df, date):
    #Train test split
    X = df[df['Date'] < date]
    Y = df[(df['Date'] >= date) & (df['Date'] < date + timedelta(days=1))]
    
    #Train & test
    tmp = lm.LinearRegression().fit(X[['Year', 'Month', 'Day', 'Hour']], X['NX']).predict(Y[['Year', 'Month', 'Day', 'Hour']])
    Y.at[:,'NX P'] = tmp
    #Average error
    Y.at[:,'AE'] = (Y['NX'] - Y['NX P']).apply(lambda x: abs(x))
    
    #Update df with AE information
    df.update(Y)
    
    print('Mean average error for {} is: {}'.format(date, Y['AE'].mean()))
    
    return df
    

In [11]:
dateList = list()
dateList.append(pd.to_datetime("01/01/2016"))
dateList.append(pd.to_datetime("01/06/2016"))
dateList.append(pd.to_datetime("01/01/2017"))
dateList.append(pd.to_datetime("01/06/2017"))
dateList.append(pd.to_datetime("01/01/2018"))
dateList.append(pd.to_datetime("01/06/2018"))

In [12]:
df_tmp = predict_validate('lm', df, dateList)

  linalg.lstsq(X, y)


Mean average error for 2016-01-01 00:00:00 is: 7425.934319841462
Mean average error for 2016-01-06 00:00:00 is: 1412.829451724887
Mean average error for 2017-01-01 00:00:00 is: 2891.2377567407675
Mean average error for 2017-01-06 00:00:00 is: 1006.0989583333334
Mean average error for 2018-01-01 00:00:00 is: 6536.036014853899
Mean average error for 2018-01-06 00:00:00 is: 2063.423776765548


In [13]:
df_tmp[df_tmp['AE'] > 0]

Unnamed: 0,Date,NX,Year,Month,Day,Hour,NX P,AE
5113,2018-01-01 00:00:00,14551.0,2018.0,1.0,1.0,0.0,6195.075964,8355.924036
5114,2018-01-01 01:00:00,15432.0,2018.0,1.0,1.0,1.0,6150.075964,9281.924036
5115,2018-01-01 02:00:00,15739.0,2018.0,1.0,1.0,2.0,6105.138464,9633.861536
5116,2018-01-01 03:00:00,16078.0,2018.0,1.0,1.0,3.0,6060.138464,10017.861536
5117,2018-01-01 04:00:00,15343.0,2018.0,1.0,1.0,4.0,6015.200964,9327.799036
5118,2018-01-01 05:00:00,15167.0,2018.0,1.0,1.0,5.0,5970.200964,9196.799036
5119,2018-01-01 06:00:00,15281.0,2018.0,1.0,1.0,6.0,5925.263464,9355.736536
5120,2018-01-01 07:00:00,14061.0,2018.0,1.0,1.0,7.0,5880.263464,8180.736536
5121,2018-01-01 08:00:00,13046.0,2018.0,1.0,1.0,8.0,5835.263464,7210.736536
5122,2018-01-01 09:00:00,13440.0,2018.0,1.0,1.0,9.0,5790.325964,7649.674036


In [14]:
df_tmp[df_tmp['AE'] > 0].AE.mean()

3555.926713043321