# Save clean oil data

In [4]:
# Import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, date, time, timedelta
from lxml import etree

In [5]:
# Read the "Data 1" worksheet of the raw EIA workbook into a DataFrame
df_eia = pd.read_excel(
    "data/oildata/raw_data/oil_EIA_1987-2025.xls",
    sheet_name="Data 1",
)

In [6]:
# Oil data preparation
# The sheet is loaded with two columns; give them explicit names.
df_eia.columns = ["date", "price"]

# Remove the first two rows, which hold additional information rather than
# observations, and derive typed columns. `assign` returns a new frame, so
# nothing is written into a slice of the raw frame (assigning columns directly
# on `df_eia.iloc[2:]` triggers pandas' SettingWithCopyWarning).
df_eia = (
    df_eia.iloc[2:]
    .assign(
        datetime=lambda frame: pd.to_datetime(frame["date"]),
        oilprice=lambda frame: frame["price"].astype(float),
    )
    # Resample to one row per calendar day; days without a quote take the
    # most recent earlier price (forward fill).
    .set_index("datetime")["oilprice"]
    .resample("1d")
    .ffill()
    .reset_index()
)
df_eia = df_eia.assign(
    month=lambda frame: frame["datetime"].dt.month,
    year=lambda frame: frame["datetime"].dt.year,
)

# Average oil price per (year, month), rounded to two decimals.
# NOTE: the mean is taken over the daily, forward-filled series, so filled-in
# days contribute to the average as well.
month_avg_eia = (
    df_eia.groupby(["year", "month"])["oilprice"]
    .mean()
    .round(2)
    .rename("oilprice_monthly_avg")
    .reset_index()
)

# Attach the monthly average to every daily row, then drop the helper keys.
# `validate` asserts that each (year, month) key occurs only once on the
# right-hand side, so the merge cannot duplicate daily rows.
df_eia_final = df_eia.merge(
    month_avg_eia, on=["year", "month"], validate="many_to_one"
).drop(columns=["year", "month"])


In [7]:
# Preview the first five rows of the cleaned oil price table
df_eia_final.head(n=5)

Unnamed: 0,datetime,oilprice,oilprice_monthly_avg
0,1987-05-20,18.63,18.58
1,1987-05-21,18.45,18.58
2,1987-05-22,18.55,18.58
3,1987-05-23,18.55,18.58
4,1987-05-24,18.55,18.58


In [8]:
# Persist the cleaned oil price table as a parquet file
oil_parquet_path = "data/oildata/oil_EIA.parquet"
df_eia_final.to_parquet(oil_parquet_path)

# Save clean exchange rate data

In [9]:
# Load xml
tree = etree.parse("data/exchange_rate/raw_data/usd.xml")
root = tree.getroot()

# Namespace prefixes used in the element lookups below
ns = {
    'mes': 'http://www.SDMX.org/resources/SDMXML/schemas/v2_0/message',
    'exr': 'http://www.ecb.europa.eu/vocabulary/stats/exr/1'
}

# Find the first series element anywhere in the document
series = root.find('.//exr:Series', namespaces=ns)
if series is None:
    # Fail with a clear message instead of an AttributeError on `None` below
    raise ValueError("No exr:Series element found in data/exchange_rate/raw_data/usd.xml")

# Extract all observations as (TIME_PERIOD, OBS_VALUE) pairs.
# An observation without a value is stored as NaN rather than crashing on
# float(None); the preparation step's dropna() then removes such rows.
data = []
for obs in series.findall('exr:Obs', namespaces=ns):
    period = obs.attrib.get('TIME_PERIOD')
    raw_value = obs.attrib.get('OBS_VALUE')
    rate = float(raw_value) if raw_value else float("nan")
    data.append((period, rate))

df_ex = pd.DataFrame(data, columns=["date", "rate"])

In [10]:
# Prepare data: drop incomplete observations, parse the date strings, and
# expand to one row per calendar day, forward-filling the rate across gaps.
df_ex = (
    df_ex.dropna()
    .assign(
        datetime=lambda frame: pd.to_datetime(frame["date"]),
        exchange_rate=lambda frame: frame["rate"].astype(float),
    )
    .set_index("datetime")["exchange_rate"]
    .resample("1d")
    .ffill()
    .reset_index()
)

In [11]:
# Preview the last five rows of the daily exchange rate table
df_ex.tail(n=5)

Unnamed: 0,datetime,exchange_rate
9639,2025-05-26,1.1381
9640,2025-05-27,1.1356
9641,2025-05-28,1.1317
9642,2025-05-29,1.1281
9643,2025-05-30,1.1339


In [12]:
# Persist the daily exchange rate table as a parquet file
exchange_parquet_path = "data/exchange_rate/exchange_rate.parquet"
df_ex.to_parquet(exchange_parquet_path)