In [0]:
# File: 250504-final_code_with_web_data.ipynb
# Created with assistance of ChatGPT (OpenAI) – reviewed on 2025-05-04
# Author: Maria Heinrich


In [0]:
import time

# Wall-clock start of the run; the last cell subtracts this value from the
# time at the end of the notebook to report the total execution time.
start_time = time.time()

In [0]:
!pip install Faker

In [0]:
from datetime import datetime, timedelta
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from faker import Faker

# Exchange rate table

In [0]:
# Build a gap-free daily exchange-rate table from the OeNB download and save
# it as 'exchange_rates.csv' with the columns Date, ExchangeRate, CurrencyCode.

DATE_FORMAT = '%Y-%m-%d'

# Reporting period. It must match the start/end query parameters of the URL.
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 6, 30)  # Adjust the end date as needed

# Fetch data from the URL
url = "https://www.oenb.at/oearb/zinssatzwechselkurse/download-zeitreihe?start=2023-01-01&end=2023-06-30&codes=AUD,BGN,BRL,CAD,CHF,CNY,CZK,DKK,GBP,HKD,HUF,IDR,ILS,INR,ISK&format=CSV"
# The timeout keeps the notebook from hanging on a stalled connection and
# raise_for_status() stops an HTTP error page from being parsed as CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Read the CSV data (semicolon separated, 'Datum' plus one column per currency)
df = pd.read_csv(StringIO(response.text), delimiter=';')

# Move the date of the first row back by one day.
# NOTE(review): presumably the first row is the first fixing of the period
# (2023-01-02), so that the shifted row supplies a value for 2023-01-01 from
# which the forward fill below can start - confirm against the downloaded file.
first_date = datetime.strptime(df.at[0, 'Datum'], DATE_FORMAT)
df.at[0, 'Datum'] = (first_date - timedelta(days=1)).strftime(DATE_FORMAT)

# Reshape from wide (one column per currency) to long format
df = (
    df.melt(id_vars=["Datum"], var_name="CurrencyCode", value_name="ExchangeRate")
      .rename(columns={"Datum": "Date"})
)
df['Date'] = pd.to_datetime(df['Date'], format=DATE_FORMAT)

# Convert the rates to floats. Text values are assumed to use '.' as thousands
# separator and ',' as decimal separator. regex=False makes '.' a literal dot
# on every pandas version instead of relying on the version-dependent default.
rates = df['ExchangeRate']
if not pd.api.types.is_numeric_dtype(rates):
    rates = (
        rates.str.replace('.', '', regex=False)
             .str.replace(',', '.', regex=False)
    )
df['ExchangeRate'] = rates.astype(float)

# Within each currency, replace missing values by the previous row's rate
df['ExchangeRate'] = df.groupby('CurrencyCode')['ExchangeRate'].ffill()

# One row per calendar day of the reporting period (both ends included)
date_df = pd.DataFrame({'Date': pd.date_range(start_date, end_date, freq='D')})

# Expand every currency to the full calendar. The frames are collected in a
# list and concatenated once instead of growing a DataFrame inside the loop.
daily_frames = []
for currency_code in df['CurrencyCode'].unique():
    rates_by_date = (
        df.loc[df['CurrencyCode'] == currency_code]
          .groupby('Date')['ExchangeRate']
          .first()
          .reset_index()
    )

    # The left join keeps every calendar day; days without a published rate
    # (e.g. weekends) carry the most recent earlier rate forward. Assigning the
    # result back (instead of an in-place fill on the selected column) also
    # works when pandas Copy-on-Write is enabled.
    daily_df = date_df.merge(rates_by_date, on='Date', how='left')
    daily_df['ExchangeRate'] = daily_df['ExchangeRate'].ffill()
    daily_df['CurrencyCode'] = currency_code
    daily_frames.append(daily_df)

# Stable sort: rows of one day keep the order in which the currencies appear
result_df = (
    pd.concat(daily_frames)
      .sort_values(by='Date', kind='stable')
      .reset_index(drop=True)
)

# Format ExchangeRate as text with 4 decimal places
result_df['ExchangeRate'] = result_df['ExchangeRate'].map(lambda x: f"{x:.4f}")

print(result_df.head(25))
print(result_df.tail(25))
result_df.to_csv('exchange_rates.csv', index=False)

# Fake accounting data

In [0]:
# Generate a reproducible synthetic accounting journal: one row per posting
# with a timestamp, a free text, an account number, an amount and a currency.

N_ROWS = 500_000   # number of postings to generate
SEED = 42          # fixed seed so that every run produces the same data
ACCOUNTING_CSV = Path('Accounting_Data.csv')

# Seed both random sources (NumPy for the numbers, Faker for the texts)
rng = np.random.default_rng(SEED)
faker = Faker()
faker.seed_instance(SEED)

# Random posting timestamps (whole seconds) within the reporting period.
# The upper bound of integers() is exclusive, so one day is added to make
# postings on 2023-06-30 possible.
start_ts = int(pd.Timestamp('2023-01-01').timestamp())
end_ts = int((pd.Timestamp('2023-06-30') + pd.Timedelta(days=1)).timestamp())
random_timestamps = rng.integers(start_ts, end_ts, size=N_ROWS)
random_dates = pd.to_datetime(random_timestamps, unit='s')

# Random amounts between 0.01 and 150,000, rounded to 2 decimal places
random_figures = np.round(rng.uniform(0.01, 150000, size=N_ROWS), 2)

# Random text strings limited to 50 characters
random_texts = [faker.text(max_nb_chars=50) for _ in range(N_ROWS)]

# Random currency codes from the currencies requested in the rate download
currency_codes = ['AUD', 'BGN', 'BRL', 'CAD', 'CHF', 'CNY', 'CZK', 'DKK', 'GBP', 'HKD', 'HUF', 'IDR', 'ILS', 'INR', 'ISK']
random_currency_codes = rng.choice(currency_codes, size=N_ROWS)

# Random account numbers between 100 and 99999 (upper bound exclusive)
random_account_numbers = rng.integers(100, 100000, size=N_ROWS)

# Create a DataFrame with the specified column order
df = pd.DataFrame({
    'Date': random_dates,
    'Text': random_texts,
    'Account_Number': random_account_numbers,
    'Amount': random_figures,
    'CurrencyCode': random_currency_codes,
})

print("Shape of the DataFrame:", df.shape)
print("Head of the DataFrame:")
print(df.head())

# The next cell reads 'Accounting_Data.csv'. Write it when it does not exist
# yet so that "Restart & Run All" works on a fresh checkout; an existing file
# is never overwritten.
if not ACCOUNTING_CSV.exists():
    df.to_csv(ACCOUNTING_CSV, index=False)



# Exchange rate calculation

In [0]:
# Convert every accounting entry to EUR with the exchange rate of its posting
# day and save the result as a new CSV file.

OUTPUT_CSV = 'Accounting_Data_with_EUR_final.csv'

# Load the accounting entries and the daily exchange rates
df = pd.read_csv('Accounting_Data.csv')
exchange_rates = pd.read_csv('exchange_rates.csv', decimal='.', parse_dates=['Date'])

# Convert the 'Date' column in the 'df' DataFrame to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# The rates are parsed as dates without a time of day (midnight), while a
# posting date may carry one. Join on a temporary day-level key so that such
# postings still find the rate of their day; 'Date' itself stays unchanged.
# validate='many_to_one' raises if the rate table holds more than one row per
# currency and day, which would otherwise silently duplicate postings.
merged_df = pd.merge(
    df.assign(_rate_date=df['Date'].dt.normalize()),
    exchange_rates.rename(columns={'Date': '_rate_date'}),
    how='left',
    on=['CurrencyCode', '_rate_date'],
    validate='many_to_one',
)

# Make unmatched postings visible instead of passing NaN amounts on silently
missing_rates = int(merged_df['ExchangeRate'].isna().sum())
if missing_rates:
    print(f"Warning: {missing_rates} rows have no exchange rate; their EUR_Amount is empty")

# Calculate 'Amount' in EUR, rounded to 2 decimal places
merged_df['EUR_Amount'] = (merged_df['Amount'] / merged_df['ExchangeRate']).round(2)

# Reorder columns (this also drops the temporary join key)
merged_df = merged_df[['Date', 'Text', 'Account_Number', 'Amount', 'CurrencyCode', 'EUR_Amount', 'ExchangeRate']]

# Save the updated DataFrame to a new CSV file
merged_df.to_csv(OUTPUT_CSV, index=False)

print(f"DataFrame with EUR amount rounded to 2 decimal places saved to '{OUTPUT_CSV}'")

In [0]:
# Show the merged accounting data; as the last expression of the cell the
# DataFrame is rendered by the notebook.
merged_df

In [0]:
# Report the wall-clock time elapsed since start_time was recorded in the
# first code cell of the notebook.
end_time = time.time()
print(f"Execution time: {end_time - start_time:.2f} seconds")