# 1. Importing the required libraries

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import sqlalchemy
import psycopg2
from sql import engine, conn

# 2. Loading the data into data frames

In [3]:
# Load the 50Hertz file: semicolon-separated fields, decimal comma, header in the first row.
# The nested list asks pandas to parse 'date' and 'time' together into one 'date_time' column.
_50Hertz = pd.read_csv(
    "Data/50Hertz.csv",
    sep=';',
    header=0,
    decimal=',',
    parse_dates=[['date', 'time']],
)

In [4]:
# Load the Amprion file with the same layout: ';' as field separator, ',' as decimal mark,
# and 'date' + 'time' parsed jointly into a single 'date_time' column.
amprion = pd.read_csv(
    "Data/Amprion.csv",
    sep=';',
    header=0,
    decimal=',',
    parse_dates=[['date', 'time']],
)

In [5]:
# Load the Tennet file: ';' separates fields, ',' is the decimal mark, and the
# 'date' and 'time' columns are parsed together into one 'date_time' column.
tennet = pd.read_csv(
    "Data/Tennet.csv",
    sep=';',
    header=0,
    decimal=',',
    parse_dates=[['date', 'time']],
)

In [6]:
# Load the TransnetBW file: ';' separates fields, ',' is the decimal mark, and the
# 'date' and 'time' columns are parsed together into one 'date_time' column.
transnetbw = pd.read_csv(
    "Data/TransnetBW.csv",
    sep=';',
    header=0,
    decimal=',',
    parse_dates=[['date', 'time']],
)

# 3. Checking the types of the data and count of observations

In [7]:
# Collect the four loaded frames in one list so they can be processed in a loop.
df_list = [_50Hertz, amprion, tennet, transnetbw]
for df in df_list:
    # DataFrame.info() writes its report to stdout itself and returns None;
    # wrapping it in print() only appended a stray "None" line after each report.
    df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315648 entries, 0 to 315647
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   date_time  315648 non-null  datetime64[ns]
 1   pred       315064 non-null  float64       
 2   act        315585 non-null  float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 7.2 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315648 entries, 0 to 315647
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   date_time  315648 non-null  datetime64[ns]
 1   pred       314864 non-null  float64       
 2   act        315278 non-null  float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 7.2 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315648 entries, 0 to 315647
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype         
---  ------   

# 4. Checking and removing duplicated rows caused by the daylight-saving time change

In [8]:
# For every frame, list all rows whose 'date_time' value occurs more than once.
for df in df_list:
    # keep=False flags every occurrence of a repeated timestamp, not only the later ones.
    repeated = df.duplicated(subset=['date_time'], keep=False)
    print(df[repeated])

                 date_time    pred     act
28900  2012-10-28 02:00:00  457.11  345.00
28901  2012-10-28 02:15:00  437.22  347.50
28902  2012-10-28 02:30:00  427.33  350.00
28903  2012-10-28 02:45:00  417.44  352.50
28904  2012-10-28 02:00:00  407.56  355.00
...                    ...     ...     ...
309127 2020-10-25 02:45:00  219.09  238.68
309128 2020-10-25 02:00:00  220.56  242.86
309129 2020-10-25 02:15:00  222.21  242.04
309130 2020-10-25 02:30:00  223.94  240.07
309131 2020-10-25 02:45:00  225.84  238.07

[72 rows x 3 columns]
                 date_time    pred     act
28900  2012-10-28 02:00:00   94.13  229.70
28901  2012-10-28 02:15:00   94.13  212.01
28902  2012-10-28 02:30:00   94.13  208.50
28903  2012-10-28 02:45:00   94.13  197.11
28904  2012-10-28 02:00:00   81.60  185.00
...                    ...     ...     ...
309127 2020-10-25 02:45:00  246.00     NaN
309128 2020-10-25 02:00:00  246.00  238.00
309129 2020-10-25 02:15:00  247.00  237.00
309130 2020-10-25 02:30:00  247

In [9]:
# Remove the repeated timestamps from each frame in place (the frames are shared via df_list).
# NOTE: keep=False discards every row of a repeated 'date_time', so no copy of it survives.
for df in df_list:
    df.drop_duplicates(subset='date_time', keep=False, inplace=True)

# 5. Checking and handling missing values

In [10]:
# Show every row that contains a NaN in at least one column
for df in df_list:
    has_gap = df.isna().any(axis='columns')
    print(df[has_gap])

                 date_time    pred    act
172032 2016-11-27 00:00:00     NaN  289.0
172033 2016-11-27 00:15:00     NaN  295.0
172034 2016-11-27 00:30:00     NaN  300.0
172035 2016-11-27 00:45:00     NaN  315.0
172036 2016-11-27 01:00:00     NaN  318.0
...                    ...     ...    ...
311991 2020-11-23 21:45:00  130.69    NaN
311993 2020-11-23 22:15:00  124.16    NaN
311995 2020-11-23 22:45:00  118.85    NaN
311997 2020-11-23 23:15:00  114.05    NaN
311999 2020-11-23 23:45:00  109.61    NaN

[647 rows x 3 columns]
                 date_time    pred  act
4532   2012-02-17 05:00:00  516.66  NaN
6295   2012-06-03 13:45:00  161.64  NaN
7750   2012-03-21 17:30:00  127.04  NaN
8006   2012-03-24 09:30:00   44.97  NaN
9004   2012-03-04 20:00:00  118.80  NaN
...                    ...     ...  ...
303303 2020-08-25 10:45:00   40.00  NaN
305457 2020-09-16 21:15:00   69.00  NaN
311250 2020-11-16 04:30:00  254.00  NaN
311251 2020-11-16 04:45:00  252.00  NaN
311252 2020-11-16 05:00:00  251.

In [11]:
# Report, per frame, how many NaN entries each column holds
for df in df_list:
    nan_counts = df.isna().sum()
    print(nan_counts, "\n")

date_time      0
pred         584
act           63
dtype: int64 

date_time      0
pred         772
act          346
dtype: int64 

date_time    0
pred         0
act          0
dtype: int64 

date_time      0
pred         386
act          312
dtype: int64 



In [12]:
# Fill the missing values with the forward fill method: each NaN takes the
# last valid value that precedes it in the same column.
for df in df_list:
    # Assign the filled column back to the frame instead of calling
    # df[col].replace(to_replace=np.nan, method='ffill', inplace=True):
    # the `method` argument of replace() is deprecated, and an in-place change made
    # through a column selection does not reach the parent frame once pandas
    # Copy-on-Write is active.
    df['pred'] = df['pred'].ffill()
    df['act'] = df['act'].ffill()

In [13]:
# Re-check dtypes and non-null counts of every frame after the cleaning steps.
for df in df_list:
    # DataFrame.info() prints its report itself and returns None, so it is called
    # directly; print(df.info()) added a stray "None" line after each report.
    df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 315576 entries, 0 to 315647
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   date_time  315576 non-null  datetime64[ns]
 1   pred       315576 non-null  float64       
 2   act        315576 non-null  float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 9.6 MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 315576 entries, 0 to 315647
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   date_time  315576 non-null  datetime64[ns]
 1   pred       315576 non-null  float64       
 2   act        315576 non-null  float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 9.6 MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 315576 entries, 0 to 315647
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype         
---  ------   

# 6. Setting the DateTime Index

In [14]:
# Make 'date_time' the index of each frame (in place, the column is removed from
# the data columns) and print the resulting frame.
for df in df_list:
    df.set_index(keys='date_time', drop=True, inplace=True)
    print(df)


                        pred     act
date_time                           
2012-01-01 00:00:00  1079.00  978.00
2012-01-01 00:15:00  1084.00  954.00
2012-01-01 00:30:00  1088.00  772.00
2012-01-01 00:45:00  1093.00  779.00
2012-01-01 01:00:00  1096.00  801.00
...                      ...     ...
2020-12-31 22:45:00   103.32   73.00
2020-12-31 23:00:00   100.73   70.11
2020-12-31 23:15:00    99.17   69.04
2020-12-31 23:30:00    97.75   63.21
2020-12-31 23:45:00    96.39   62.04

[315576 rows x 2 columns]
                       pred     act
date_time                          
2012-01-01 00:00:00  1256.0  1450.0
2012-01-01 00:15:00  1256.0  1536.0
2012-01-01 00:30:00  1256.0  1604.0
2012-01-01 00:45:00  1256.0  1576.0
2012-01-01 01:00:00  1219.0  1615.0
...                     ...     ...
2020-12-31 22:45:00    20.0     7.0
2020-12-31 23:00:00    20.0     8.0
2020-12-31 23:15:00    21.0     9.0
2020-12-31 23:30:00    21.0     9.0
2020-12-31 23:45:00    21.0     8.0

[315576 rows x 2 column

# 7. Resampling the data sets to hourly resolution

In [15]:
# Aggregate each frame into hourly bins; every bin holds the sum of the rows
# that fall inside that hour. The names are rebound to the new hourly frames.
_50Hertz, amprion, tennet, transnetbw = (
    frame.resample('H').sum()
    for frame in (_50Hertz, amprion, tennet, transnetbw)
)

# 8. Calculating the Forecast Error

In [16]:
# Rebuild the list from the current frames, then add the forecast error
# (prediction minus actual value) as a new column to each of them.
df_list = [_50Hertz, amprion, tennet, transnetbw]
for df in df_list:
    df['forecast error'] = df['pred'].sub(df['act'])
# `df` is still bound to the last frame of the loop, so only that frame is printed.
print(df)


                      pred     act  forecast error
date_time                                         
2012-01-01 00:00:00  348.0  716.20         -368.20
2012-01-01 01:00:00  274.0  590.70         -316.70
2012-01-01 02:00:00  293.0  431.30         -138.30
2012-01-01 03:00:00  326.0  533.70         -207.70
2012-01-01 04:00:00  338.0  615.30         -277.30
...                    ...     ...             ...
2020-12-31 19:00:00  112.0   92.16           19.84
2020-12-31 20:00:00  112.0  106.32            5.68
2020-12-31 21:00:00  110.0  114.34           -4.34
2020-12-31 22:00:00  108.0  121.78          -13.78
2020-12-31 23:00:00  104.0  108.67           -4.67

[78912 rows x 3 columns]


In [17]:
# Preview Amprion: its first five and last five rows stacked into one frame.
print(pd.concat([amprion.head(5), amprion.tail(5)]))

                       pred     act  forecast error
date_time                                          
2012-01-01 00:00:00  5024.0  6166.0         -1142.0
2012-01-01 01:00:00  4876.0  6191.0         -1315.0
2012-01-01 02:00:00  4944.0  6600.0         -1656.0
2012-01-01 03:00:00  5012.0  6569.0         -1557.0
2012-01-01 04:00:00  5124.0  6837.0         -1713.0
2020-12-31 19:00:00   102.0    31.0            71.0
2020-12-31 20:00:00    88.0    33.0            55.0
2020-12-31 21:00:00    80.0    33.0            47.0
2020-12-31 22:00:00    82.0    25.0            57.0
2020-12-31 23:00:00    83.0    34.0            49.0


# 9. Adding the control zone

In [18]:
# Tag every row with the control zone its frame belongs to.
# The labels are listed in the same order as the frames in df_list.
df_list = [_50Hertz, amprion, tennet, transnetbw]
zone_labels = ('50Hertz', 'Amprion', 'Tennet', 'TransnetBW')
for df, zone_label in zip(df_list, zone_labels):
    df['control_zone'] = zone_label

# 10. Combining the DataFrames

In [19]:
# Stack the four zone frames on top of each other (row-wise); the original
# index values of each frame are kept as they are.
wind = pd.concat(df_list, axis='index')

In [20]:
# Show the first five rows of the combined frame.
wind.head(5)

Unnamed: 0_level_0,pred,act,forecast error,control_zone
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-01-01 00:00:00,4344.0,3483.0,861.0,50Hertz
2012-01-01 01:00:00,4496.0,3319.0,1177.0,50Hertz
2012-01-01 02:00:00,4765.0,3319.0,1446.0,50Hertz
2012-01-01 03:00:00,5004.0,3575.0,1429.0,50Hertz
2012-01-01 04:00:00,5321.0,3820.0,1501.0,50Hertz


# 11. Resetting the Datetime Index

In [21]:
# Move the index back into an ordinary column (reset_index keeps it by default)
# and show the first five rows of the result.
wind = wind.reset_index()
wind.head(5)

Unnamed: 0,date_time,pred,act,forecast error,control_zone
0,2012-01-01 00:00:00,4344.0,3483.0,861.0,50Hertz
1,2012-01-01 01:00:00,4496.0,3319.0,1177.0,50Hertz
2,2012-01-01 02:00:00,4765.0,3319.0,1446.0,50Hertz
3,2012-01-01 03:00:00,5004.0,3575.0,1429.0,50Hertz
4,2012-01-01 04:00:00,5321.0,3820.0,1501.0,50Hertz


# 12. Writing the prepared Data to the database

In [22]:
# Write the combined frame to the database in batches of 5000 rows, passing
# several rows per INSERT; a table that already has this name is replaced.
# The frame's index is not written as a column.
table_name = 'wind_LC'
wind.to_sql(
    name=table_name,
    con=engine,
    if_exists="replace",
    index=False,
    chunksize=5000,
    method='multi',
)
print(f"The {table_name} table was imported successfully.")

The wind_LC table was imported successfully.


In [23]:
# Read back the first five rows of the table that was just written
pd.read_sql_query(sql='SELECT * FROM "wind_LC" LIMIT 5', con=conn)

Unnamed: 0,date_time,pred,act,forecast error,control_zone
0,2012-01-01 00:00:00,4344.0,3483.0,861.0,50Hertz
1,2012-01-01 01:00:00,4496.0,3319.0,1177.0,50Hertz
2,2012-01-01 02:00:00,4765.0,3319.0,1446.0,50Hertz
3,2012-01-01 03:00:00,5004.0,3575.0,1429.0,50Hertz
4,2012-01-01 04:00:00,5321.0,3820.0,1501.0,50Hertz
