In [1]:
import getpass
import io
import os
from collections import defaultdict
from urllib.parse import quote_plus

import pandas as pd
import requests
from pymongo import MongoClient

In [3]:
# Eurostat SDMX 3.0 request for dataflow gov_10a_exp. The query string is
# assembled from one literal per filter so each dimension can be read (and
# edited) on its own line; the years are generated newest-first, 2023 -> 1990.
url = (
    "https://ec.europa.eu/eurostat/api/dissemination/sdmx/3.0/data/dataflow/"
    "ESTAT/gov_10a_exp/1.0/*.*.*.*.*.*"
    "?c[freq]=A"
    "&c[unit]=PC_GDP"
    "&c[sector]=S13"
    "&c[cofog99]=GF09,GF1002"
    "&c[na_item]=P2_D29_D5_D8,TE"
    "&c[geo]=EU27_2020,ES"
    "&c[TIME_PERIOD]=" + ",".join(str(year) for year in range(2023, 1989, -1)) +
    "&compress=true"
    "&format=csvdata"
    "&formatVersion=2.0"
    "&lang=en"
    "&labels=both"
)

In [5]:
# Download the extract exactly once and parse it from memory. Previously the
# URL was fetched with requests and then fetched a second time by
# pd.read_csv(url), and df_raw briefly held a Response instead of a DataFrame.
response = requests.get(url, timeout=60)  # a timeout keeps the cell from hanging forever
response.raise_for_status()
payload = response.content

# The URL asks for compress=true, but requests may already have undone a
# transport-level gzip encoding. Check the gzip magic number instead of
# assuming which form the bytes arrived in.
is_gzip = payload[:2] == b"\x1f\x8b"

df_raw = pd.read_csv(
    io.BytesIO(payload),
    sep=',',
    compression='gzip' if is_gzip else None,
    dtype=str,        # keep every column as text (or pass your own dtype mapping)
    na_values=[':']   # treat ':' as a missing value (NaN)
)

In [6]:
# Show the downloaded frame as the cell output to check columns and values.
df_raw

Unnamed: 0,STRUCTURE,STRUCTURE_ID,freq: Time frequency,unit: Unit of measure,sector: Sector,cofog99: Classification of the functions of government (COFOG 1999),na_item: National accounts indicator (ESA 2010),geo: Geopolitical entity (reporting),TIME_PERIOD: Time,OBS_VALUE: Observation value,OBS_FLAG: Observation status (Flag) V2 structure,CONF_STATUS: Confidentiality status (flag)
0,dataflow,ESTAT:GOV_10A_EXP(1.0),A: Annual,PC_GDP: Percentage of gross domestic product (...,S13: General government,GF09: Education,P2_D29_D5_D8: Intermediate consumption; other ...,ES: Spain,1995,0.3,,
1,dataflow,ESTAT:GOV_10A_EXP(1.0),A: Annual,PC_GDP: Percentage of gross domestic product (...,S13: General government,GF09: Education,P2_D29_D5_D8: Intermediate consumption; other ...,ES: Spain,1996,0.3,,
2,dataflow,ESTAT:GOV_10A_EXP(1.0),A: Annual,PC_GDP: Percentage of gross domestic product (...,S13: General government,GF09: Education,P2_D29_D5_D8: Intermediate consumption; other ...,ES: Spain,1997,0.3,,
3,dataflow,ESTAT:GOV_10A_EXP(1.0),A: Annual,PC_GDP: Percentage of gross domestic product (...,S13: General government,GF09: Education,P2_D29_D5_D8: Intermediate consumption; other ...,ES: Spain,1998,0.3,,
4,dataflow,ESTAT:GOV_10A_EXP(1.0),A: Annual,PC_GDP: Percentage of gross domestic product (...,S13: General government,GF09: Education,P2_D29_D5_D8: Intermediate consumption; other ...,ES: Spain,1999,0.3,,
...,...,...,...,...,...,...,...,...,...,...,...,...
163,dataflow,ESTAT:GOV_10A_EXP(1.0),A: Annual,PC_GDP: Percentage of gross domestic product (...,S13: General government,GF1002: Old age,TE: Total general government expenditure,EU27_2020: European Union - 27 countries (from...,2019,10.4,,
164,dataflow,ESTAT:GOV_10A_EXP(1.0),A: Annual,PC_GDP: Percentage of gross domestic product (...,S13: General government,GF1002: Old age,TE: Total general government expenditure,EU27_2020: European Union - 27 countries (from...,2020,11.3,,
165,dataflow,ESTAT:GOV_10A_EXP(1.0),A: Annual,PC_GDP: Percentage of gross domestic product (...,S13: General government,GF1002: Old age,TE: Total general government expenditure,EU27_2020: European Union - 27 countries (from...,2021,10.7,,
166,dataflow,ESTAT:GOV_10A_EXP(1.0),A: Annual,PC_GDP: Percentage of gross domestic product (...,S13: General government,GF1002: Old age,TE: Total general government expenditure,EU27_2020: European Union - 27 countries (from...,2022,10.3,,


In [8]:
# Long format: keep only Spain's annual total general-government expenditure
# as a share of GDP for the two COFOG functions of interest. The comparisons
# use the full "code: label" strings found in each column. The selected rows
# then get short column names and numeric year/value columns.
df = (
    df_raw
    .loc[lambda raw: (
        raw['geo: Geopolitical entity (reporting)'].eq('ES: Spain')
        & raw['cofog99: Classification of the functions of government (COFOG 1999)'].isin(
            ['GF09: Education', 'GF1002: Old age']
        )
        & raw['na_item: National accounts indicator (ESA 2010)'].eq(
            'TE: Total general government expenditure'
        )
        & raw['sector: Sector'].eq('S13: General government')
        & raw['unit: Unit of measure'].eq('PC_GDP: Percentage of gross domestic product (GDP)')
        & raw['freq: Time frequency'].eq('A: Annual')
    )]
    .rename(columns={
        'TIME_PERIOD: Time': 'year',
        'cofog99: Classification of the functions of government (COFOG 1999)': 'cofog',
        'OBS_VALUE: Observation value': 'value',
    })
    .assign(
        year=lambda frame: frame['year'].astype(int),
        # Anything that cannot be parsed as a number becomes NaN.
        value=lambda frame: pd.to_numeric(frame['value'], errors='coerce'),
    )
)

# Wide format: one row per year, one column per COFOG function.
df_wide = df.pivot(index='year', columns='cofog', values='value').reset_index()
df_wide.columns.name = None  # drop the leftover 'cofog' label on the column axis
df_wide = df_wide.rename(
    columns={
        'GF09: Education': 'education_pct_gdp',
        'GF1002: Old age': 'pensions_pct_gdp',
    }
)


In [10]:
# Show the first rows of the reshaped (one row per year) table.
df_wide.head()

Unnamed: 0,year,education_pct_gdp,pensions_pct_gdp
0,1995,4.3,6.6
1,1996,4.3,6.7
2,1997,4.2,6.6
3,1998,4.1,6.5
4,1999,4.2,6.4


In [11]:
# Configure the MongoDB connection.
# Settings come from environment variables so that no secret is stored in the
# notebook. User, host and port fall back to the previous local defaults; the
# password has no default and is prompted for when MONGO_PASSWORD is unset.
usuario = os.environ.get("MONGO_USER", "jalope")
contrasena = os.environ.get("MONGO_PASSWORD") or getpass.getpass("MongoDB password: ")
host = os.environ.get("MONGO_HOST", "127.0.0.1")
puerto = os.environ.get("MONGO_PORT", "27250")

# quote_plus escapes characters in the credentials that are reserved in a URI.
uri = f"mongodb://{quote_plus(usuario)}:{quote_plus(contrasena)}@{host}:{puerto}/?directConnection=true"
client = MongoClient(uri)
db = client["tfm_db"]

In [12]:
# Replace the contents of the target collection with the rows of df_wide.
coll = db['EUROSTAT_GOV10A_EXP_GDP']

# Build the documents first: if the frame is empty, stop before dropping the
# collection instead of wiping it and then failing on the insert.
records = df_wide.to_dict('records')
if not records:
    raise ValueError("df_wide is empty; refusing to drop and reload the collection")

coll.drop()
coll.insert_many(records)

# Sanity check: both counts should match (the label now names the frame that
# is actually measured, df_wide).
print("Registros en la colección: ", coll.count_documents({}))
print("Número de filas de df_wide: ", len(df_wide))

Registros en la colección:  29
Número de filas de df_raw:  29
