# Structure of this Notebook:
For the analysis, the data on actual and forecast wind feed-in was downloaded from the information platform of the four German transmission system operators (TSOs). In Excel, the actual and forecast wind feed-in values were combined into one spreadsheet, broken down by TSO. Irrelevant columns (e.g. "time to") were also removed.

In Section 1 the required libraries are imported. 


In sections 2 to 5, the wind data from the four control areas is prepared for the analysis. 
The following steps each relate to the data sets of the four TSOs. 
In section 6 the four data sets are combined into one data set and subjected to further processing.

# 1. Importing the required libraries

In [None]:
# Importing the required libraries
import pandas as pd
import numpy as np
import seaborn as sns #visualization
import matplotlib.pyplot as plt #visualization
from datetime import datetime , timedelta
import sqlalchemy
import psycopg2
from sql import engine

# 2. Control area: 50Hertz

## 2.1 Loading the data into a data frame

In [None]:
# Read the '50Hertz' sheet of the raw workbook. The first row supplies the
# column names; the 'date' and 'time' columns are combined and parsed into a
# single 'date_time' column.
wind_50Hertz = pd.read_excel(
    "Data/wind_raw_data.xlsx",
    sheet_name='50Hertz',
    header=0,
    parse_dates=[['date', 'time']],
)

## 2.2 Checking the types of the data and count of observations

In [None]:
# Print column dtypes and non-null counts of the freshly loaded 50Hertz frame.
wind_50Hertz.info()

## 2.3 Checking and removing duplicated rows regarding the time change

In [None]:
# Display all rows whose 'date_time' value occurs more than once
# (keep=False marks every occurrence, not just the repeats).
wind_50Hertz.loc[wind_50Hertz.duplicated(subset=['date_time'], keep=False)]

In [None]:
# Remove, in place, every row whose timestamp occurs more than once.
# NOTE(review): keep=False discards all copies of a duplicated timestamp,
# so no row survives for those timestamps - confirm this is intended.
wind_50Hertz.drop_duplicates(subset=['date_time'], keep=False, inplace=True)

## 2.4 Checking and handling missing values

In [None]:
# Print every row that has a missing value in at least one column.
print(wind_50Hertz.loc[wind_50Hertz.isna().any(axis="columns")])

In [None]:
# Report how many values are missing in each column, followed by a blank line.
print(wind_50Hertz.isna().sum(), "\n")

In [None]:
# Fill gaps in the forecast ('pred') and actual ('act') columns with the last
# preceding valid value. Leading gaps with no earlier value stay missing.
# The result is assigned back explicitly: Series.replace(method=...) is
# deprecated in recent pandas, and an in-place call on a selected column does
# not reliably modify the parent frame under Copy-on-Write.
wind_50Hertz[['pred', 'act']] = wind_50Hertz[['pred', 'act']].ffill()

## 2.5 Transforming the data set to hourly resolution

In [None]:
# Index the frame by timestamp and collapse it to one row per hour by summing
# all values that fall into the same hourly bin.
# NOTE(review): sum() adds up the sub-hourly values; if they represent average
# power rather than energy, mean() may be what is wanted - confirm the units.
wind_50Hertz = (
    wind_50Hertz
    .set_index('date_time')
    .resample('H')
    .sum()
)

In [None]:
# Print index range, dtypes and non-null counts of the hourly 50Hertz frame.
wind_50Hertz.info()

## 2.6 Adding the control zone

In [None]:
# Tag every row with its control zone so the source stays identifiable once
# the four frames are concatenated.
wind_50Hertz['control_zone'] = '50Hertz'
# Inspect the 50Hertz frame itself; wind_amprion is only created in section 3
# and would raise a NameError here on a clean run.
wind_50Hertz.info()

# 3. Control area: Amprion

## 3.1 Loading the data into a data frame

In [None]:
# Read the Amprion sheet of the raw workbook. The first row supplies the
# column names; the 'date' and 'time' columns are combined and parsed into a
# single 'date_time' column.
# NOTE(review): the sheet was previously '50Hertz', which loaded the 50Hertz
# data a second time. The name 'Amprion' is assumed by analogy with the
# 'Tennet' and 'TransnetBW' sheets - confirm against the workbook.
wind_amprion = pd.read_excel(
    "Data/wind_raw_data.xlsx",
    sheet_name='Amprion',
    header=0,
    parse_dates=[['date', 'time']],
)

## 3.2 Checking the types of the data and count of observations

In [None]:
# Print column dtypes and non-null counts of the freshly loaded Amprion frame.
wind_amprion.info()

## 3.3 Checking and removing duplicated rows regarding the time change

In [None]:
# Print all rows whose 'date_time' value occurs more than once
# (keep=False marks every occurrence, not just the repeats).
print(wind_amprion.loc[wind_amprion.duplicated(subset=['date_time'], keep=False)])

In [None]:
# Remove, in place, every row whose timestamp occurs more than once.
# NOTE(review): keep=False discards all copies of a duplicated timestamp,
# so no row survives for those timestamps - confirm this is intended.
wind_amprion.drop_duplicates(subset=['date_time'], keep=False, inplace=True)

## 3.4 Checking and handling missing values

In [None]:
# Report how many values are missing in each column, followed by a blank line.
print(wind_amprion.isna().sum(), "\n")

In [None]:
# Fill gaps in the forecast ('pred') and actual ('act') columns with the last
# preceding valid value. Leading gaps with no earlier value stay missing.
# The result is assigned back explicitly: Series.replace(method=...) is
# deprecated in recent pandas, and an in-place call on a selected column does
# not reliably modify the parent frame under Copy-on-Write.
wind_amprion[['pred', 'act']] = wind_amprion[['pred', 'act']].ffill()

In [None]:
# Re-check dtypes and non-null counts after the missing-value handling.
wind_amprion.info()

## 3.5 Transforming the data set to hourly resolution

In [None]:
# Index the frame by timestamp and collapse it to one row per hour by summing
# all values that fall into the same hourly bin.
# NOTE(review): sum() adds up the sub-hourly values; if they represent average
# power rather than energy, mean() may be what is wanted - confirm the units.
wind_amprion = (
    wind_amprion
    .set_index('date_time')
    .resample('H')
    .sum()
)

In [None]:
# Preview the first rows of the hourly Amprion frame.
wind_amprion.head()

## 3.6 Adding the control zone

In [None]:
# Label every row with its control zone, then preview the first rows.
wind_amprion = wind_amprion.assign(control_zone='Amprion')
wind_amprion.head()

# 4. Control area: Tennet

## 4.1 Loading the data into a data frame

In [None]:
# Read the 'Tennet' sheet of the raw workbook. The first row supplies the
# column names; the 'date' and 'time' columns are combined and parsed into a
# single 'date_time' column.
wind_tennet = pd.read_excel(
    "Data/wind_raw_data.xlsx",
    sheet_name='Tennet',
    header=0,
    parse_dates=[['date', 'time']],
)

## 4.2 Checking the types of the data and count of observations

In [None]:
# Print column dtypes and non-null counts of the freshly loaded Tennet frame.
wind_tennet.info()

## 4.3 Checking and removing duplicated rows regarding the time change

In [None]:
# Print all rows whose 'date_time' value occurs more than once
# (keep=False marks every occurrence, not just the repeats).
print(wind_tennet.loc[wind_tennet.duplicated(subset=['date_time'], keep=False)])

In [None]:
# Remove, in place, every row whose timestamp occurs more than once.
# NOTE(review): keep=False discards all copies of a duplicated timestamp,
# so no row survives for those timestamps - confirm this is intended.
wind_tennet.drop_duplicates(subset=['date_time'], keep=False, inplace=True)

## 4.4 Checking and handling missing values

In [None]:
# Report how many values are missing in each column, followed by a blank line.
print(wind_tennet.isna().sum(), "\n")

In [None]:
# Fill gaps in the forecast ('pred') and actual ('act') columns with the last
# preceding valid value. Leading gaps with no earlier value stay missing.
# The result is assigned back explicitly: Series.replace(method=...) is
# deprecated in recent pandas, and an in-place call on a selected column does
# not reliably modify the parent frame under Copy-on-Write.
wind_tennet[['pred', 'act']] = wind_tennet[['pred', 'act']].ffill()

In [None]:
# Re-check dtypes and non-null counts after the missing-value handling.
wind_tennet.info()

## 4.5 Transforming the data set to hourly resolution

In [None]:
# Index the frame by timestamp and collapse it to one row per hour by summing
# all values that fall into the same hourly bin.
# NOTE(review): sum() adds up the sub-hourly values; if they represent average
# power rather than energy, mean() may be what is wanted - confirm the units.
wind_tennet = (
    wind_tennet
    .set_index('date_time')
    .resample('H')
    .sum()
)

In [None]:
# Preview the first rows of the hourly Tennet frame.
wind_tennet.head()

## 4.6 Adding the control zone

In [None]:
# Label every row with its control zone, then preview the first rows.
wind_tennet = wind_tennet.assign(control_zone='Tennet')
wind_tennet.head()

# 5. Control area: TransnetBW

## 5.1 Loading the data into a data frame

In [None]:
# Read the 'TransnetBW' sheet of the raw workbook. The first row supplies the
# column names; the 'date' and 'time' columns are combined and parsed into a
# single 'date_time' column.
wind_transnetbw = pd.read_excel(
    "Data/wind_raw_data.xlsx",
    sheet_name='TransnetBW',
    header=0,
    parse_dates=[['date', 'time']],
)

## 5.2 Checking the types of the data and count of observations

In [None]:
# Print column dtypes and non-null counts of the freshly loaded TransnetBW frame.
wind_transnetbw.info()

## 5.3 Checking and removing duplicated rows regarding the time change

In [None]:
# Print all rows whose 'date_time' value occurs more than once
# (keep=False marks every occurrence, not just the repeats).
print(wind_transnetbw.loc[wind_transnetbw.duplicated(subset=['date_time'], keep=False)])

In [None]:
# Remove, in place, every row whose timestamp occurs more than once.
# NOTE(review): keep=False discards all copies of a duplicated timestamp,
# so no row survives for those timestamps - confirm this is intended.
wind_transnetbw.drop_duplicates(subset=['date_time'], keep=False, inplace=True)

## 5.4 Checking and handling missing values

In [None]:
# Report how many values are missing in each column, followed by a blank line.
print(wind_transnetbw.isna().sum(), "\n")

In [None]:
# Fill gaps in the forecast ('pred') and actual ('act') columns with the last
# preceding valid value. Leading gaps with no earlier value stay missing.
# The result is assigned back explicitly: Series.replace(method=...) is
# deprecated in recent pandas, and an in-place call on a selected column does
# not reliably modify the parent frame under Copy-on-Write.
wind_transnetbw[['pred', 'act']] = wind_transnetbw[['pred', 'act']].ffill()

In [None]:
# Re-check dtypes and non-null counts after the missing-value handling.
wind_transnetbw.info()

## 5.5 Transforming the data set to hourly resolution

In [None]:
# Index the frame by timestamp and collapse it to one row per hour by summing
# all values that fall into the same hourly bin.
# NOTE(review): sum() adds up the sub-hourly values; if they represent average
# power rather than energy, mean() may be what is wanted - confirm the units.
wind_transnetbw = (
    wind_transnetbw
    .set_index('date_time')
    .resample('H')
    .sum()
)

In [None]:
# Preview the first rows of the hourly TransnetBW frame.
wind_transnetbw.head()

## 5.6 Adding the control zone

In [None]:
# Label every row with its control zone, then preview the first rows.
wind_transnetbw = wind_transnetbw.assign(control_zone='TransnetBW')
wind_transnetbw.head()

# 6. Final Data Set

## 6.1 Combining the DataFrames

In [None]:
# Stack the four per-zone frames on top of each other (row-wise). The
# timestamp index is kept as is, so each timestamp can appear once per zone.
wind = pd.concat(
    [
        wind_50Hertz,
        wind_amprion,
        wind_tennet,
        wind_transnetbw,
    ],
    axis="index",
)

## 6.2 Adding more Features

### 6.2.1 Forecast Error

In [None]:
# Forecast error: forecast minus actual feed-in (positive = over-forecast).
wind['delta'] = wind['pred'].sub(wind['act'])

### 6.2.1 Year

In [None]:
# Derive the calendar year from the timestamp index, then preview the result.
wind = wind.assign(year=wind.index.year)
wind.head()

In [None]:
# Print dtypes and non-null counts of the combined frame.
wind.info()

### 6.2.2 Month

In [None]:
# Derive the calendar month (1-12) from the timestamp index.
wind = wind.assign(month=wind.index.month)

### 6.2.3 Hour

In [None]:
# Derive the hour of day (0-23) from the timestamp index.
wind = wind.assign(hour=wind.index.hour)

In [None]:
# Preview the combined frame with the added feature columns.
wind.head()

### 6.2.4 Resetting the Datetime Index

In [None]:
# Move the timestamp index back into an ordinary 'date_time' column
# (reset_index keeps the old index as a column by default) and preview.
wind = wind.reset_index()
wind.head()

# 7. Pushing the prepared data to a table in the database

In [None]:
# Write the prepared frame to the database table, replacing any existing table
# of that name. The DataFrame index is not written; rows are sent in batches
# of 5000 using multi-row INSERT statements.
table = 'wind_LC'
wind.to_sql(
    name=table,
    con=engine,
    index=False,
    if_exists="replace",
    method='multi',
    chunksize=5000,
)
print('Successfully pushed!')