In [1]:
import requests
import pandas as pd
from constants import taxud_mapping, years_filtered, headers
from utils import process_calendar_year, process_crop_year  # Import functions

# Fetch weekly TAXUD import data for every product in taxud_mapping, convert
# each response to a monthly series and merge the series on 'date'.

# One monthly DataFrame per successfully fetched product.
dfs_TAXUD = []

# Upper bound (seconds) on each API call so an unresponsive server cannot hang
# the notebook; requests has no default timeout.
TAXUD_REQUEST_TIMEOUT = 30

for product in taxud_mapping:
    sector = product["sector"]
    year_type = product['year']
    HS_codes = product["HS codes"]  # appended verbatim as the tail of the query string
    product_name = product["name"]

    # Spaces inside the category names are percent-encoded consistently.
    url = (
        "https://www.ec.europa.eu/agrifood/api/taxud/weeklyData/import"
        "?importCategories=Import%20-%20preferential"
        "&importCategories=Import%20-%20most%20favoured%20nation"
        f"&sectors={sector}&{HS_codes}"
    )

    data = None
    try:
        response = requests.get(url, headers=headers, timeout=TAXUD_REQUEST_TIMEOUT)
        if response.status_code == 200:
            data = response.json()
        else:
            print(f"Failed to retrieve data for {product_name} (HTTP {response.status_code})")
    except (requests.RequestException, ValueError) as exc:
        # Connection problem, timeout, or a body that is not valid JSON:
        # skip this product and carry on with the remaining ones.
        print(f"Failed to retrieve data for {product_name}: {exc}")

    if data:
        df = pd.DataFrame(data)

        # Products are aggregated with the helper matching their 'year' type.
        if year_type == 'calendar':
            monthly_data = process_calendar_year(df, product_name)
        else:
            monthly_data = process_crop_year(df, product_name)

        # Name the value column after the product so series stay distinguishable after merging.
        monthly_data = monthly_data.rename(columns={'kgEquivalent': product_name})
        dfs_TAXUD.append(monthly_data)
    else:
        print("No data to convert to DataFrame")

# Merge all DataFrames on 'date'
if dfs_TAXUD:
    merged_df = dfs_TAXUD[0]
    for df in dfs_TAXUD[1:]:
        # dropna() after each outer merge discards every row that has a missing value.
        merged_df = pd.merge(merged_df, df, on='date', how='outer').dropna()

    # Capitalise the date column for display.
    merged_df = merged_df.rename(columns={'date': 'Date'})

    # Preview the merged DataFrame
    print(merged_df.head(10))
else:
    print("No dataframes to merge")


        Date  TAXUD_SMP
0 2011-01-01   18.60145
1 2011-02-01   11.52825
2 2011-03-01    0.90160
3 2011-04-01    0.81960
4 2011-05-01    3.51860
5 2011-06-01    1.58327
6 2011-07-01    0.27955
7 2011-08-01   13.80390
8 2011-09-01    1.70283
9 2011-10-01   20.44796


In [2]:
# main_script.py
import os
from database_connection import query_data


# Connection settings for the SSH tunnel and the MySQL database behind it.
#
# Secrets and infrastructure addresses are read from the environment (or from a
# local .env file) instead of being stored in the notebook. Recognised variables:
#   VESPER_SSH_HOST, VESPER_SSH_USER, VESPER_SSH_PORT, VESPER_SSH_PRIVATE_KEY,
#   VESPER_SQL_HOSTNAME, VESPER_SQL_USERNAME, VESPER_SQL_PASSWORD,
#   VESPER_SQL_DATABASE, VESPER_SQL_PORT
# NOTE(review): the password that used to be hardcoded here has been exposed
# wherever this notebook was shared and should be rotated.

try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv is unavailable: rely on variables already exported to the kernel.
    pass
else:
    # Load .env here so the values exist before they are read below.
    load_dotenv()

a_ssh_host = os.getenv('VESPER_SSH_HOST')
a_ssh_user = os.getenv('VESPER_SSH_USER', 'forge')
a_ssh_port = int(os.getenv('VESPER_SSH_PORT', '22'))
# Default to the current user's key rather than one developer's absolute path.
a_ssh_private_key = os.getenv('VESPER_SSH_PRIVATE_KEY', os.path.expanduser('~/.ssh/id_rsa'))
a_sql_hostname = os.getenv('VESPER_SQL_HOSTNAME')
a_sql_username = os.getenv('VESPER_SQL_USERNAME', 'read_only')
a_sql_password = os.getenv('VESPER_SQL_PASSWORD')
a_sql_database = os.getenv('VESPER_SQL_DATABASE', 'vesper')
a_sql_port = int(os.getenv('VESPER_SQL_PORT', '3306'))

# Point out missing settings now; otherwise the failure only shows up later
# as an obscure connection error.
_missing_settings = [
    env_name
    for env_name, value in (
        ('VESPER_SSH_HOST', a_ssh_host),
        ('VESPER_SQL_HOSTNAME', a_sql_hostname),
        ('VESPER_SQL_PASSWORD', a_sql_password),
    )
    if not value
]
if _missing_settings:
    print("Missing connection settings, set these environment variables: " + ", ".join(_missing_settings))

In [3]:
from constants import GTT_mapping
import pandas as pd
import os
from dotenv import load_dotenv
from database_connection import ssh_tunnel, db_connection, query_data

# Pull variables from a local .env file into the process environment.
load_dotenv()

# One monthly DataFrame per entry of GTT_mapping.
dfs_GTT = []

# Open the SSH tunnel first, then the database connection through its local port;
# both are closed again when the block exits.
with ssh_tunnel(a_ssh_host, a_ssh_port, a_ssh_user, a_ssh_private_key, a_sql_hostname, a_sql_port) as local_port, \
        db_connection(local_port, a_sql_username, a_sql_password, a_sql_database) as conn:
    for entry in GTT_mapping:
        GTT_code = entry['GTT_code']
        name = entry['name']

        # Monthly figures for one HS code from 2011 onwards, oldest first.
        query = f'''SELECT date, amount
                        FROM vesper.total_import_export_figures
                        WHERE country_id = 200 AND hs_code = {GTT_code} AND data_interval = 'monthly' AND type = 1 AND date > '2010-12-31'
                        ORDER BY date ASC'''

        gtt_frame = query_data(conn, query)
        gtt_frame['date'] = pd.to_datetime(gtt_frame['date'])

        # Label the value column with its source and product name.
        dfs_GTT.append(gtt_frame.rename(columns={'amount': f'GTT_{name}'}))

if not dfs_GTT:
    print("No dataframes to merge")
else:
    merged_df = dfs_GTT[0]
    for frame in dfs_GTT[1:]:
        # Outer merge followed by dropna(): rows with any missing value are discarded.
        merged_df = merged_df.merge(frame, on='date', how='outer').dropna()

    merged_df = merged_df.rename(columns={'date': 'Date'})
    print(merged_df.head(10))

        Date  GTT_SMP
0 2011-01-01  1534.11
1 2011-02-01  1306.29
2 2011-03-01  1423.56
3 2011-04-01   632.95
4 2011-05-01  1797.71
5 2011-06-01  2211.28
6 2011-07-01   908.22
7 2011-08-01   557.45
8 2011-09-01  1263.04
9 2011-10-01  2429.61


In [4]:
# Put the TAXUD and GTT frames in one list so they can be merged together.
all_dfs = [*dfs_TAXUD, *dfs_GTT]

def merge_dfs_on_date(df_list):
    """Outer-merge a collection of DataFrames on their shared 'date' column.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Frames to combine. Each one must have a 'date' column.

    Returns
    -------
    pandas.DataFrame
        All frames merged left to right with ``how='outer'``, so every date
        found in any frame is kept and absent values are NaN. When only one
        frame is given, that same frame object is returned unchanged.

    Raises
    ------
    ValueError
        If ``df_list`` contains no frames.
    """
    frames = list(df_list)
    if not frames:
        # Fail with an explicit message instead of an IndexError from frames[0].
        raise ValueError("merge_dfs_on_date needs at least one DataFrame, got none")

    merged_df = frames[0]
    for df in frames[1:]:
        merged_df = pd.merge(merged_df, df, on='date', how='outer')
    return merged_df

# Merge every series on 'date', drop rows with any missing value,
# and wrap the result in a fresh DataFrame.
merged_df = pd.DataFrame(merge_dfs_on_date(all_dfs).dropna())

print(merged_df.head(5))

        date  TAXUD_SMP  GTT_SMP
0 2011-01-01   18.60145  1534.11
1 2011-02-01   11.52825  1306.29
2 2011-03-01    0.90160  1423.56
3 2011-04-01    0.81960   632.95
4 2011-05-01    3.51860  1797.71


In [5]:
import matplotlib.pyplot as plt


# Tag every row with its calendar year so the monthly values can be rolled up.
merged_df['year'] = merged_df['date'].dt.year

# Sum only the TAXUD_* / GTT_* value columns. Summing the whole frame would
# also try to add up the datetime 'date' column (a TypeError on pandas >= 2.0)
# and any helper columns that other cells have added to merged_df.
value_columns = [col for col in merged_df.columns if col.startswith(('TAXUD_', 'GTT_'))]
annual_data = merged_df.groupby('year')[value_columns].sum().reset_index()

# Prefix the summed columns with 'annual_' to tell them apart from the monthly ones.
annual_data.columns = ['year'] + [f'annual_{col}' for col in value_columns]

pd.set_option('display.float_format', '{:.2f}'.format)


def percentage_difference(col1, col2):
    """Return how far col1 deviates from col2, as a percentage of col2.

    Works on plain numbers and element-wise on pandas Series.
    """
    gap = col1 - col2
    return gap / col2 * 100

# Product suffixes, i.e. the part after the 'TAXUD_' / 'GTT_' prefix.
# Splitting on the first underscore only keeps product names that contain
# underscores intact; sorting makes the column order reproducible between
# runs (iteration order of a set of strings is not).
suffixes = sorted({
    col.split('_', 1)[1]
    for col in merged_df.columns
    if col.startswith(('TAXUD_', 'GTT_'))
})

yearly_stats_list = []

for suffix in suffixes:
    taxud_col = f'annual_TAXUD_{suffix}'
    gtt_col = f'annual_GTT_{suffix}'
    # Only products available from both sources can be compared.
    if taxud_col in annual_data.columns and gtt_col in annual_data.columns:
        diff_col = f'%_diff_{suffix}'
        # Yearly TAXUD total relative to the yearly GTT total, in percent.
        # NOTE(review): differences close to -100% may mean the two sources
        # report in different units - confirm before interpreting.
        annual_data[diff_col] = percentage_difference(annual_data[taxud_col], annual_data[gtt_col])

        yearly_stats_list.append(annual_data[['year', diff_col]])

if yearly_stats_list:
    # Combine the per-product difference columns into one table keyed by year.
    annual = yearly_stats_list[0]
    for df in yearly_stats_list[1:]:
        annual = pd.merge(annual, df, on='year')

    pd.set_option('display.width', 1000)
    pd.set_option('display.max_columns', None)
    print(annual)
else:
    print("No dataframes to merge")




    year  %_diff_SMP
0   2011      -99.51
1   2012      -98.98
2   2013      -99.39
3   2014      -97.03
4   2015      -96.03
5   2016      -98.61
6   2017      -99.73
7   2018      -97.60
8   2019      -99.29
9   2020      -98.21
10  2021      -59.76
11  2022      -51.58
12  2023      -42.35
13  2024      -45.13


In [6]:
import pandas as pd

def percentage_difference(col1, col2):
    """Express the gap between col1 and col2 as a percentage of col2.

    Accepts scalars or pandas Series (evaluated element-wise).
    """
    relative_change = (col1 - col2) / col2
    return relative_change * 100


# Product suffixes, i.e. the part after the 'TAXUD_' / 'GTT_' prefix.
# Splitting on the first underscore only keeps product names that contain
# underscores intact; sorting makes the output column order reproducible.
suffixes = sorted({
    col.split('_', 1)[1]
    for col in merged_df.columns
    if col.startswith(('TAXUD_', 'GTT_'))
})

merged_df['year'] = merged_df['date'].dt.year
merged_df['month'] = merged_df['date'].dt.month

# Restrict to the years of interest. The explicit copy makes this an
# independent frame, so adding columns below does not raise SettingWithCopyWarning.
filtered_df_1 = merged_df[merged_df['year'].isin(years_filtered)].copy()
monthly_stats_list = []

for suffix in suffixes:
    taxud_col = f'TAXUD_{suffix}'
    gtt_col = f'GTT_{suffix}'
    # Only products available from both sources can be compared.
    if taxud_col not in merged_df.columns or gtt_col not in merged_df.columns:
        continue

    diff_col = f'%_diff_{suffix}'
    merged_df[diff_col] = percentage_difference(merged_df[taxud_col], merged_df[gtt_col])
    # Index-aligned assignment: only the rows kept by the year filter receive a value.
    filtered_df_1[diff_col] = merged_df[diff_col]

    avg_col = f'average_percentage_difference_{suffix}'
    min_col = f'min_percentage_difference_{suffix}'
    max_col = f'max_percentage_difference_{suffix}'
    error_col = f'maximum_error_{suffix}'

    # Mean, smallest and largest percentage difference per calendar month.
    monthly_stats = filtered_df_1.groupby('month')[diff_col].agg(['mean', 'min', 'max']).reset_index()
    monthly_stats.columns = ['month', avg_col, min_col, max_col]

    # Largest absolute deviation observed in the month minus the absolute
    # monthly average, computed column-wise instead of row by row.
    monthly_stats[error_col] = (
        monthly_stats[[min_col, max_col]].abs().max(axis=1) - monthly_stats[avg_col].abs()
    )

    monthly_stats_list.append(monthly_stats[['month', error_col]])

if monthly_stats_list:
    # Combine the per-product error columns into one table keyed by month.
    monthly = monthly_stats_list[0]
    for df in monthly_stats_list[1:]:
        monthly = pd.merge(monthly, df, on='month', how='outer')

    pd.set_option('display.width', 1000)
    pd.set_option('display.max_columns', None)
    print(monthly)
else:
    print("No dataframes to merge")


    month  maximum_error_SMP
0       1              11.00
1       2              14.15
2       3              16.34
3       4              15.87
4       5              23.38
5       6              25.58
6       7              20.88
7       8              25.10
8       9              18.66
9      10              25.39
10     11              20.02
11     12              13.09


Probability of worst case scenario