# Data Merging

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
%load_ext autoreload
%autoreload 2

## Fetch data from file path

Get the csv file names from a given directory

In [3]:
def get_file_names(directory):
    """returns the csv files in the given string directory path"""
    file_names = []
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            file_names.append(os.path.join(directory, filename))
    return file_names

Create a dictionary from the files where the keys are the yearly intervals and the values are the respective dataframes.

 ## Create DataFrames from data

In [4]:
def get_dataframes(file_names):
    """Takes in a list of csv file paths.
    Returns a dictionary whose keys are the years
    and values are the corresponding dataframes."""
    df_years = {}
    for file in file_names:
        df_years[file[-29:]] = pd.read_csv(file) # [-29:] Indexes the years.csv eg '201501010000-201601010000.csv'
    return df_years 

## Concatenate dataframes

In [5]:
def concat_dataframes(feat):
    keys = list(feat.keys())
    df = pd.concat([
        feat[keys[0]],  # year 2015
        feat[keys[1]],  # year 2016
        feat[keys[2]],  # year 2017
        feat[keys[3]],  # year 2018
        feat[keys[4]],  # year 2019
        feat[keys[5]]   # year 2020
    ]).reset_index(drop=True)
    return df

## Convert to datetime

On peeking through the dataframes nested in the dictionaries `price` and `load`, the columns showing the time display them as time ranges e.g `31.12.2020 19:00 - 31.12.2020 20:00`. A function is created to make a new column called _time_ that will strip the string down to the initial timestamp e.g `31.12.2020 19:00` and then convert the series to datetime objects

In [10]:
def get_datetime(df):
    try:
        column = df['MTU (CET)'] # column with time values in price df
    except:
        column = df['Time (CET)']  # load time
    
    # create new column 'time' by formatting the original time column to get single timepoint instead of a range
    df['time'] = column.apply(lambda _: _[:16])
    # convert new time column from str to timestamp
    df['time'] = pd.to_datetime(df['time'], format='%d-%m-%Y %H%M', errors='ignore')

        
    return df

In [11]:
df = get_file_names('../raw_data/price')
df = get_dataframes(df)
df = concat_dataframes(df)
df = get_datetime(df)

In [13]:
df.time[0]

'01.01.2015 00:00'

## All together

In [16]:
def fetch_data(path):
    
    files = get_file_names(path)
    df_dict = get_dataframes(files)
    
    df = concat_dataframes(df_dict)
    df = get_datetime(df)
    
    # date up until
    idx = (df[df['time'] == '23.11.2020 23:00'].index)[0] + 1 # valid time frame
    df = df.iloc[:idx]
    
    try:
        df.drop(columns=['MTU (CET)'], inplace=True)
        df = df.rename(columns={'Day-ahead Price [EUR/MWh]':'price'})
        df = df[['time','price']]
    except:
        df.drop(columns=['Time (CET)'], inplace=True)
        df = df.rename(columns={'Actual Total Load [MW] - BZN|DK1':'load'})
        df = df[['time','load']]

    return df

### Test

In [17]:
path = r'../raw_data/price'
path_l = r'../raw_data/load'
price = fetch_data(path) # unpack tuple 
load = fetch_data(path_l)

In [20]:
load.head(40)

Unnamed: 0,time,load
0,01.01.2015 00:00,1877
1,01.01.2015 01:00,1843
2,01.01.2015 02:00,1795
3,01.01.2015 03:00,1745
4,01.01.2015 04:00,1743
5,01.01.2015 05:00,1751
6,01.01.2015 06:00,1767
7,01.01.2015 07:00,1903
8,01.01.2015 08:00,1985
9,01.01.2015 09:00,2015


In [47]:
load.head()

Unnamed: 0,time,load
0,2015-01-01 00:00:00,1877
1,2015-01-01 01:00:00,1843
2,2015-01-01 02:00:00,1795
3,2015-01-01 03:00:00,1745
4,2015-01-01 04:00:00,1743


In [1]:
from electricity_price_predictor.data import fetch_data

In [2]:
price_df = fetch_data('../raw_data/price')

In [3]:
load_df = fetch_data('../raw_data/load')

In [4]:
#price_df.to_csv('../raw_data/final_price.csv', index=False)
#load_df.to_csv('../raw_data/final_load.csv', index=False)