# Business Data Analytics - Challenge

---

## Package Import

In [2]:
import pandas as pd
import numpy as np
import re
import os


import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter


---

## Data Import

In [3]:
def dateparse(x):
    """Parse German-style dates (``DD.MM.YYYY``) as found in the ``Datum`` column.

    ``pd.datetime`` was only an alias for the standard-library ``datetime``
    class and no longer exists in current pandas releases, so the parsing is
    done with ``pd.to_datetime`` and the same explicit format.
    """
    return pd.to_datetime(x, format='%d.%m.%Y')


def numberparse(x):
    """Convert a German-formatted number string to ``float``.

    "." is stripped as thousands separator and "," becomes the decimal point.
    The placeholder "-" is returned as ``np.nan``.
    """
    if x == "-":
        return np.nan
    # Built-in float replaces ``pd.np.float``: both the ``pd.np`` alias and
    # ``np.float`` have been removed from current pandas / NumPy releases.
    return float(x.replace(".", "").replace(",", "."))


# Apply ``numberparse`` to the columns at positions 2..21 when reading a CSV.
# Keys are plain Python ints rather than NumPy integers.
convert_thousand = {num: numberparse for num in range(2, 22)}

In [4]:
# Read every CSV export in the Stromfluss folder and stack them into one frame.
stromfluss_dir = "data/Stromfluss/"

# os.listdir returns entries in arbitrary order and also lists non-CSV files
# (e.g. editor or OS artefacts), so filter and sort for a reproducible run.
import_files = sorted(name for name in os.listdir(stromfluss_dir)
                      if name.lower().endswith(".csv"))

frames = []
for file in import_files:
    print("Import File: {} ".format(file))
    PATH = stromfluss_dir + file
    frames.append(pd.read_csv(PATH,
                              sep=r";",
                              decimal=r",",
                              thousands=r".",
                              converters=convert_thousand,
                              parse_dates=['Datum'],
                              date_parser=dateparse))

# A single concat replaces DataFrame.append inside the loop: append is gone
# from current pandas and re-copied the accumulated frame on every iteration.
# It also works when the folder holds just one file (the old `del df2`
# raised a NameError in that case).
df = pd.concat(frames)
del frames

Import File: DE_Physikalischer_Stromfluss_201506010000_201706012359_1.csv 
Import File: DE_Physikalischer_Stromfluss_201706020000_201906022359_1.csv 


In [5]:
# Preview the first two rows of the combined frame.
df.head(2)

Unnamed: 0,Datum,Uhrzeit,Physikalischer Nettoexport[MWh],Niederlande (Export)[MWh],Niederlande (Import)[MWh],Schweiz (Export)[MWh],Schweiz (Import)[MWh],Dänemark (Export)[MWh],Dänemark (Import)[MWh],Tschechien (Export)[MWh],...,Luxemburg (Export)[MWh],Luxemburg (Import)[MWh],Schweden (Export)[MWh],Schweden (Import)[MWh],Österreich (Export)[MWh],Österreich (Import)[MWh],Frankreich (Export)[MWh],Frankreich (Import)[MWh],Polen (Export)[MWh],Polen (Import)[MWh]
0,2015-06-01,00:00,,3932.0,-0.0,145.0,-0.0,113.0,-385.0,912.0,...,,,1.0,-0.0,1394.0,-0.0,246.0,-0.0,1279.0,-0.0
1,2015-06-01,01:00,,4014.0,-0.0,261.0,-0.0,63.0,-204.0,839.0,...,,,1.0,-0.0,1355.0,-0.0,66.0,-0.0,1142.0,-0.0


---

## Preprocessing

#### Time

In [6]:
def _create_datetime(row):
    date = row.Datum.strftime("%Y-%m-%d") + " " + row.Uhrzeit
    return date

In [7]:
# Combine calendar day and clock time into one timestamp column and sort by it.
date_strings = df.apply(_create_datetime, axis=1)
df['Date'] = pd.to_datetime(date_strings, format="%Y-%m-%d %H:%M")
df = df.sort_values('Date').reset_index(drop=True)

# Move 'Date' to the front, then drop 'Uhrzeit', whose content is now part of 'Date'.
cols = ['Date'] + [name for name in df.columns if name != 'Date']
df = df.loc[:, cols].drop(columns=['Uhrzeit'])

#### Columns

In [8]:
# Captures the text between the first pair of parentheses,
# e.g. "Export" in "Niederlande (Export)[MWh]".
type_pattern = r"\((.*?)\)"
# Captures everything before the first space,
# e.g. "Niederlande" in "Niederlande (Export)[MWh]".
country_pattern = r"(.*?) "

In [9]:
# German country names as they appear in the CSV headers -> short codes.
countries = {
    'Niederlande':'NL',
    'Schweiz': 'CHE',
    'Dänemark' : 'DNK',
    'Tschechien' : 'CZE',
    'Luxemburg' : 'LUX',
    'Schweden' : 'SWE',
    'Österreich' : 'AUT',
    'Frankreich' : 'FRA',
    'Polen' : 'PL'
}

# Flow direction as written in the CSV headers -> short suffix.
types = {
    'Import' : 'IM',
    'Export' : 'EX'
}


def _short_flow_name(col):
    """Translate a raw header such as 'Niederlande (Export)[MWh]' into 'NL_EX'.

    Raises ValueError when the header does not contain both a country part
    and a parenthesised flow type, and KeyError (naming the unknown value)
    when the country or flow type has no entry in the lookup tables.
    """
    country_match = re.search(country_pattern, col)
    type_match = re.search(type_pattern, col)
    if country_match is None or type_match is None:
        raise ValueError("Unexpected column header: {!r}".format(col))
    # Direct indexing instead of dict.get: an unmapped name fails with a
    # KeyError that shows it, not with "NoneType + str".
    return countries[country_match.group(1)] + "_" + types[type_match.group(1)]


# The first three columns get fixed names; every column after them is
# treated as a per-country flow column and renamed from its header.
new_columns = ['Date', 'Tag', 'NX'] + [_short_flow_name(col) for col in df.columns[3:]]

df.columns = new_columns

#### NX

In [10]:
# Split the column labels by flow direction using their two-letter suffix.
export_columns = [name for name in df.columns if name.endswith('EX')]
import_columns = [name for name in df.columns if name.endswith('IM')]

In [11]:
# Display the column labels after renaming.
df.columns

Index(['Date', 'Tag', 'NX', 'NL_EX', 'NL_IM', 'CHE_EX', 'CHE_IM', 'DNK_EX',
       'DNK_IM', 'CZE_EX', 'CZE_IM', 'LUX_EX', 'LUX_IM', 'SWE_EX', 'SWE_IM',
       'AUT_EX', 'AUT_IM', 'FRA_EX', 'FRA_IM', 'PL_EX', 'PL_IM'],
      dtype='object')

In [12]:
# Row-wise sum over every column from 'NL_EX' through 'PL_IM' (label slice, both ends inclusive).
# NOTE(review): import values appear as non-positive numbers in the previewed rows, so a
# plain sum yields the net export — confirm this sign convention holds for the whole file.
df['NX'] = df.loc[:, 'NL_EX':'PL_IM'].sum(axis=1)

#### Fill Missing Values

In [13]:
# Replace every missing value in the frame with 0.
df = df.fillna(0)

---

# Strompreise

In [59]:
# os.listdir returns entries in arbitrary order and also lists non-CSV files,
# so filter and sort to get the same file list on every machine.
import_files = sorted(name for name in os.listdir("data/Strompreise/")
                      if name.lower().endswith(".csv"))

In [60]:
# Display the file names that will be loaded.
import_files

['DE_Großhandelspreise_201706010000_201906012359_1.csv',
 'DE_Großhandelspreise_201506010000_201706012359_1.csv']

In [61]:
# Read every listed price export and stack them into one frame.
price_frames = []
for file in import_files:
    print("Import File: {} ".format(file))
    PATH = "data/Strompreise/" + file
    price_frames.append(pd.read_csv(PATH,
                                    sep=r";",
                                    decimal=r",",
                                    thousands=r".",
                                    converters=convert_thousand,
                                    parse_dates=['Datum'],
                                    date_parser=dateparse))

# A single concat replaces DataFrame.append inside the loop (append is gone
# from current pandas) and also works with just one file, where the old
# `del df2` raised a NameError.
df_price = pd.concat(price_frames)

# The export file names suggest overlapping ranges (one ends at 201706012359,
# the other starts at 201706010000), which would load that day twice and
# distort the later shift/rolling step. Only rows identical in every column
# are removed. NOTE(review): confirm the overlap against the actual files.
df_price = df_price.drop_duplicates()
del price_frames

Import File: DE_Großhandelspreise_201706010000_201906012359_1.csv 
Import File: DE_Großhandelspreise_201506010000_201706012359_1.csv 


In [62]:
# Combine calendar day and clock time into one timestamp column and sort by it.
price_date_strings = df_price.apply(_create_datetime, axis=1)
df_price['Date'] = pd.to_datetime(price_date_strings, format="%Y-%m-%d %H:%M")
df_price = df_price.sort_values('Date').reset_index(drop=True)

# Move 'Date' to the front, then drop 'Uhrzeit', whose content is now part of 'Date'.
cols = ['Date'] + [name for name in df_price.columns if name != 'Date']
df_price = df_price.loc[:, cols].drop(columns=['Uhrzeit'])

In [63]:
# Captures everything before the first "[" of a header, i.e. the header
# text without its bracketed suffix.
delete_currency = r"(.*?)\["

In [64]:
# Keep fixed names for the first two columns; every later header is reduced
# to its lower-cased text before "[" and prefixed with "price_".
new_columns = ['Date', 'Tag'] + [
    "price_" + re.search(delete_currency, header).group(1).lower()
    for header in df_price.columns[2:]
]

df_price.columns = new_columns

In [65]:
# Fill missing values column by column with that column's mean.
column_means = df_price.mean()
df_price.fillna(value=column_means, inplace=True)

In [66]:
time_shift = 24
rolling_window = 24

# Lag every column from the third one onwards by `time_shift` rows, then
# replace it with the mean over the trailing `rolling_window` rows.
lagged_prices = df_price.iloc[:, 2:].shift(time_shift)
df_price.iloc[:, 2:] = lagged_prices.rolling(rolling_window).mean()

# The leading rows have no complete lagged window and are NaN; drop them
# together with any other row that still holds a missing value.
df_price.dropna(inplace=True)

---

# Realisierter Stromverbrauch

In [89]:
# Read every CSV export in the consumption folder and stack them into one frame.
consumption_dir = "data/Stromverbrauch_real/"

# os.listdir returns entries in arbitrary order and also lists non-CSV files,
# so filter and sort for a reproducible run.
import_files = sorted(name for name in os.listdir(consumption_dir)
                      if name.lower().endswith(".csv"))

consumption_frames = []
for file in import_files:
    print("Import File: {} ".format(file))
    PATH = consumption_dir + file
    consumption_frames.append(pd.read_csv(PATH,
                                          sep=r";",
                                          decimal=r",",
                                          thousands=r".",
                                          converters=convert_thousand,
                                          parse_dates=['Datum'],
                                          date_parser=dateparse))

# A single concat replaces DataFrame.append inside the loop (append is gone
# from current pandas) and also works with just one file, where the old
# `del df2` raised a NameError.
df_consumption = pd.concat(consumption_frames)

# The export file names suggest overlapping ranges (one ends at 201706012345,
# the other starts at 201706010000); loading that day twice would double its
# daily total. Only rows identical in every column are removed.
# NOTE(review): assumes each row also carries its time of day, so distinct
# readings are never identical — confirm against the actual files.
df_consumption = df_consumption.drop_duplicates()
del consumption_frames

Import File: DE_Realisierter Stromverbrauch_201506010000_201706012345_1.csv 
Import File: DE_Realisierter Stromverbrauch_201706010000_201906012345_1.csv 


In [90]:
# Add up all readings of a calendar day.
# numeric_only=True is stated explicitly: older pandas dropped non-numeric
# columns from the sum silently, newer releases no longer do, and the
# single-column rename that follows relies on only numeric columns remaining.
df_consumption = df_consumption.groupby("Datum").sum(numeric_only=True)

In [91]:
# Name the single remaining column; this assignment fails if the frame
# does not have exactly one column at this point.
df_consumption.columns = ["daily_consumption_ger"]

In [92]:
day_shift = 1

# Move the values down by `day_shift` rows (the index stays in place), then
# drop the rows that are left with missing values.
df_consumption = df_consumption.shift(day_shift).dropna()

In [93]:
# Copy the index (the grouped 'Datum' values) into a regular 'Tag' column.
df_consumption['Tag'] = df_consumption.index

---

# Feature Extraction Stromfluss DF

### Vortag stündlicher Nettoexport pro Land 

In [37]:
# Number of rows to shift by.
# NOTE(review): presumably 24 hourly rows = one day — this only holds if the data has no gaps or repeated hours; confirm.
hour_shift = 24

In [40]:
# Sum the columns per prefix before the first "_" (e.g. NL_EX + NL_IM -> NL).
# The slice starts at the third column, so 'NX' forms a group of its own.
# Grouping the transposed frame replaces groupby(..., axis=1), which is
# deprecated in current pandas; the result is transposed back afterwards.
flow_values = df.iloc[:, 2:]
prev_day_nx = flow_values.T.groupby(lambda name: name.split('_')[0]).sum().T

# Lag by `hour_shift` rows, drop the leading rows without a predecessor and
# mark the columns as previous-day features.
prev_day_nx = (prev_day_nx
               .shift(hour_shift)
               .dropna()
               .add_suffix("_nx_prev_day"))

# Keep 'Date' as the last column of the feature frame (assignment aligns on the index).
columns = list(prev_day_nx.columns) + ['Date']
prev_day_nx['Date'] = df['Date']

# Join on the shared row index rather than merging on 'Date': both frames
# come from the same rows, and a merge on 'Date' would multiply rows if a
# timestamp occurred more than once.
# NOTE(review): e.g. a repeated clock-change hour — verify against the data.
df = df.join(prev_day_nx.drop(columns='Date'), how="inner").reset_index(drop=True)

In [56]:
# Preview the first five rows of the frame.
df.head()

Unnamed: 0,Date,Tag,NX,NL_EX,NL_IM,CHE_EX,CHE_IM,DNK_EX,DNK_IM,CZE_EX,...,LUX_EX,LUX_IM,SWE_EX,SWE_IM,AUT_EX,AUT_IM,FRA_EX,FRA_IM,PL_EX,PL_IM
0,2015-06-01 00:00:00,2015-06-01,7279.0,3932.0,-0.0,145.0,-0.0,113.0,-385.0,912.0,...,0.0,0.0,1.0,-0.0,1394.0,-0.0,246.0,-0.0,1279.0,-0.0
1,2015-06-01 01:00:00,2015-06-01,7167.0,4014.0,-0.0,261.0,-0.0,63.0,-204.0,839.0,...,0.0,0.0,1.0,-0.0,1355.0,-0.0,66.0,-0.0,1142.0,-0.0
2,2015-06-01 02:00:00,2015-06-01,6667.0,3864.0,-0.0,117.0,-14.0,206.0,-532.0,751.0,...,0.0,0.0,1.0,-0.0,1574.0,-0.0,0.0,-105.0,1134.0,-0.0
3,2015-06-01 03:00:00,2015-06-01,6413.0,4242.0,-0.0,63.0,-21.0,134.0,-396.0,555.0,...,0.0,0.0,1.0,-0.0,1271.0,-0.0,0.0,-206.0,1090.0,-0.0
4,2015-06-01 04:00:00,2015-06-01,5347.0,4292.0,-0.0,3.0,-130.0,67.0,-649.0,568.0,...,0.0,0.0,1.0,-0.0,1068.0,-0.0,0.0,-549.0,1083.0,-0.0


---