In [1]:
import io
import os
from collections import defaultdict
from urllib.parse import quote_plus

import pandas as pd
import requests
from pymongo import MongoClient

In [2]:
# MongoDB connection settings.
# Each value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook; the fallbacks keep
# the previous local-development values, so the cell behaves as before when
# no variable is set.
usuario = os.environ.get("MONGO_USER", "jalope")
contrasena = os.environ.get("MONGO_PASSWORD", "admin")
host = os.environ.get("MONGO_HOST", "127.0.0.1")
puerto = os.environ.get("MONGO_PORT", "27250")

# quote_plus percent-encodes reserved characters (e.g. '@', ':', '/') that
# would otherwise break the user:password section of the connection string.
uri = f"mongodb://{quote_plus(usuario)}:{quote_plus(contrasena)}@{host}:{puerto}/?directConnection=true"
client = MongoClient(uri)
db = client["tfm_db"]

In [3]:
# Load the Spanish VPO series; the file is semicolon-separated.
df = pd.read_csv("../data/VPOs/vpo_spain.csv", sep=";")

In [4]:
# Preview the first rows to check how the CSV was parsed.
df.head()

Unnamed: 0,total,year
0,44514,01/01/1991
1,35695,01/01/1992
2,45795,01/01/1993
3,67639,01/01/1994
4,71141,01/01/1995


In [8]:
# Parse the `year` strings as dates using an explicit day/month/year format,
# so the values are not left to pandas' format inference.
df['year'] = pd.to_datetime(df['year'], format='%d/%m/%Y')

In [9]:
# Inspect the column dtypes after the date conversion.
df.dtypes

total             int64
year     datetime64[ns]
dtype: object

In [10]:
# Rebuild the MINISTERIO_VPO collection from scratch: drop whatever is there,
# insert one document per DataFrame row, then print the stored document count
# next to the row count so a partial load is easy to spot.
coll = db['MINISTERIO_VPO']
coll.drop()
registros = df.to_dict('records')
coll.insert_many(registros)
total_docs = coll.count_documents({})
print("Registros en la colección: ", total_docs)
print("Número de filas de df: ", len(df))

Registros en la colección:  34
Número de filas de df:  34


In [11]:
# Eurostat SDMX 3.0 query against the gov_10a_exp dataflow, filtered to
# cofog99=GF06, unit=PC_GDP, sector=S13 and na_item=TE for the EU27_2020, EA20
# and EA19 geo codes, requested as a compressed CSV export.
# The TIME_PERIOD filter lists every year from 2023 down to 1990.
url = (
    "https://ec.europa.eu/eurostat/api/dissemination/sdmx/3.0/data/dataflow/ESTAT/gov_10a_exp/1.0/*.*.*.*.*.*"
    "?c[freq]=A&c[unit]=PC_GDP&c[sector]=S13&c[cofog99]=GF06&c[na_item]=TE"
    "&c[geo]=EU27_2020,EA20,EA19"
    "&c[TIME_PERIOD]=" + ",".join(str(year) for year in range(2023, 1989, -1))
    + "&compress=true&format=csvdata&formatVersion=2.0&lang=en&labels=name"
)

In [12]:
# Download the Eurostat CSV export once and parse it from memory.
# The previous version fetched the URL twice (requests.get and then
# pd.read_csv(url)), threw the first response away and briefly bound the
# HTTP response to `df_raw`.
response = requests.get(url, timeout=60)  # a timeout keeps the cell from hanging forever
response.raise_for_status()
payload = response.content

# requests transparently decodes a gzip Content-Encoding, whereas a gzip file
# sent as the body arrives untouched. Sniff the gzip magic number so both
# cases parse correctly.
is_gzip = payload[:2] == b"\x1f\x8b"

df_raw = pd.read_csv(
    io.BytesIO(payload),
    sep=',',
    compression='gzip' if is_gzip else None,
    dtype=str,        # read everything as text (or pass your own dtype mapping)
    na_values=[':'],  # treat ':' as NaN
)

In [13]:
# Preview the first rows of the raw Eurostat export.
df_raw.head()

Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,freq,Time frequency,unit,Unit of measure,sector,Sector,cofog99,...,geo,Geopolitical entity (reporting),TIME_PERIOD,Time,OBS_VALUE,Observation value,OBS_FLAG,Observation status (Flag) V2 structure,CONF_STATUS,Confidentiality status (flag)
0,dataflow,ESTAT:GOV_10A_EXP(1.0),General government expenditure by function (CO...,A,Annual,PC_GDP,Percentage of gross domestic product (GDP),S13,General government,GF06,...,EA19,Euro area - 19 countries (2015-2022),1995,,1.1,,,,,
1,dataflow,ESTAT:GOV_10A_EXP(1.0),General government expenditure by function (CO...,A,Annual,PC_GDP,Percentage of gross domestic product (GDP),S13,General government,GF06,...,EA19,Euro area - 19 countries (2015-2022),1996,,0.7,,,,,
2,dataflow,ESTAT:GOV_10A_EXP(1.0),General government expenditure by function (CO...,A,Annual,PC_GDP,Percentage of gross domestic product (GDP),S13,General government,GF06,...,EA19,Euro area - 19 countries (2015-2022),1997,,0.8,,,,,
3,dataflow,ESTAT:GOV_10A_EXP(1.0),General government expenditure by function (CO...,A,Annual,PC_GDP,Percentage of gross domestic product (GDP),S13,General government,GF06,...,EA19,Euro area - 19 countries (2015-2022),1998,,0.8,,,,,
4,dataflow,ESTAT:GOV_10A_EXP(1.0),General government expenditure by function (CO...,A,Annual,PC_GDP,Percentage of gross domestic product (GDP),S13,General government,GF06,...,EA19,Euro area - 19 countries (2015-2022),1999,,0.8,,,,,


In [14]:
# List the raw columns and their dtypes (the export was read with dtype=str).
df_raw.dtypes

STRUCTURE                                                     object
STRUCTURE_ID                                                  object
STRUCTURE_NAME                                                object
freq                                                          object
Time frequency                                                object
unit                                                          object
Unit of measure                                               object
sector                                                        object
Sector                                                        object
cofog99                                                       object
Classification of the functions of government (COFOG 1999)    object
na_item                                                       object
National accounts indicator (ESA 2010)                        object
geo                                                           object
Geopolitical entity (reporting)   

In [16]:
# Keep only the geo / period / value columns, rename the period and value
# columns to `year` and `pct_gdp`, and cast each column to its final dtype.
# Values that cannot be parsed as numbers become NaN (errors='coerce').
df_clean = (
    df_raw[['geo', 'TIME_PERIOD', 'OBS_VALUE']]
    .rename(columns={'TIME_PERIOD': 'year', 'OBS_VALUE': 'pct_gdp'})
    .assign(
        year=lambda frame: frame['year'].astype(int),
        pct_gdp=lambda frame: pd.to_numeric(frame['pct_gdp'], errors='coerce'),
        geo=lambda frame: frame['geo'].astype(str),
    )
)

In [17]:
# Check the dtypes produced by the casts above.
df_clean.dtypes

geo         object
year         int64
pct_gdp    float64
dtype: object

In [19]:
# Display the cleaned frame (pandas truncates long frames in the rendered output).
df_clean

Unnamed: 0,geo,year,pct_gdp
0,EA19,1995,1.1
1,EA19,1996,0.7
2,EA19,1997,0.8
3,EA19,1998,0.8
4,EA19,1999,0.8
...,...,...,...
82,EU27_2020,2019,0.6
83,EU27_2020,2020,0.6
84,EU27_2020,2021,0.8
85,EU27_2020,2022,1.0


In [20]:
# Rebuild the EUROSTAT_GOV10aEXP_VIVIENDA collection from the cleaned frame:
# drop it, insert one document per row, then print the stored document count
# next to the row count as a sanity check.
coll = db['EUROSTAT_GOV10aEXP_VIVIENDA']
coll.drop()
registros = df_clean.to_dict('records')
coll.insert_many(registros)
total_docs = coll.count_documents({})
print("Registros en la colección: ", total_docs)
print("Número de filas de df_clean: ", len(df_clean))

Registros en la colección:  87
Número de filas de df_clean:  87


In [21]:
# Same Eurostat SDMX 3.0 query against gov_10a_exp (cofog99=GF06, unit=PC_GDP,
# sector=S13, na_item=TE), this time restricted to geo=ES.
# The TIME_PERIOD filter lists every year from 2023 down to 1990.
url = (
    "https://ec.europa.eu/eurostat/api/dissemination/sdmx/3.0/data/dataflow/ESTAT/gov_10a_exp/1.0/*.*.*.*.*.*"
    "?c[freq]=A&c[unit]=PC_GDP&c[sector]=S13&c[cofog99]=GF06&c[na_item]=TE"
    "&c[geo]=ES"
    "&c[TIME_PERIOD]=" + ",".join(str(year) for year in range(2023, 1989, -1))
    + "&compress=true&format=csvdata&formatVersion=2.0&lang=en&labels=name"
)

In [22]:
# Download the Eurostat CSV export once and parse it from memory.
# The previous version fetched the URL twice (requests.get and then
# pd.read_csv(url)), threw the first response away and briefly bound the
# HTTP response to `df_raw`.
response = requests.get(url, timeout=60)  # a timeout keeps the cell from hanging forever
response.raise_for_status()
payload = response.content

# requests transparently decodes a gzip Content-Encoding, whereas a gzip file
# sent as the body arrives untouched. Sniff the gzip magic number so both
# cases parse correctly.
is_gzip = payload[:2] == b"\x1f\x8b"

df_raw = pd.read_csv(
    io.BytesIO(payload),
    sep=',',
    compression='gzip' if is_gzip else None,
    dtype=str,        # read everything as text (or pass your own dtype mapping)
    na_values=[':'],  # treat ':' as NaN
)

In [23]:
# Keep only the geo / period / value columns, rename the period and value
# columns to `year` and `pct_gdp`, and cast each column to its final dtype.
# Values that cannot be parsed as numbers become NaN (errors='coerce').
df = (
    df_raw[['geo', 'TIME_PERIOD', 'OBS_VALUE']]
    .rename(columns={'TIME_PERIOD': 'year', 'OBS_VALUE': 'pct_gdp'})
    .assign(
        year=lambda frame: frame['year'].astype(int),
        pct_gdp=lambda frame: pd.to_numeric(frame['pct_gdp'], errors='coerce'),
        geo=lambda frame: frame['geo'].astype(str),
    )
)

In [24]:
# Display the frame built from the geo=ES query.
df

Unnamed: 0,geo,year,pct_gdp
0,ES,1995,1.0
1,ES,1996,1.0
2,ES,1997,1.0
3,ES,1998,1.1
4,ES,1999,1.1
5,ES,2000,1.2
6,ES,2001,1.0
7,ES,2002,1.0
8,ES,2003,1.1
9,ES,2004,0.8


In [28]:
# Make sure both tables share the same column dtypes before they are
# combined: integer year, string geo, numeric pct_gdp (unparseable -> NaN).
for frame in (df_clean, df):
    frame['year'] = frame['year'].astype(int)
    frame['geo'] = frame['geo'].astype(str)
    frame['pct_gdp'] = pd.to_numeric(frame['pct_gdp'], errors='coerce')

In [33]:
# Stack df_clean and df vertically; ignore_index=True discards both original
# indexes and renumbers the rows from 0.
df_combined = pd.concat([df_clean, df], ignore_index=True)

In [34]:
# Display the combined frame (pandas truncates long frames in the rendered output).
df_combined

Unnamed: 0,geo,year,pct_gdp
0,EA19,1995,1.1
1,EA19,1996,0.7
2,EA19,1997,0.8
3,EA19,1998,0.8
4,EA19,1999,0.8
...,...,...,...
111,ES,2019,0.4
112,ES,2020,0.5
113,ES,2021,0.5
114,ES,2022,0.5


In [36]:
# Rebuild the EUROSTAT_GOV10aEXP_VIVIENDA collection from the combined frame:
# drop the existing collection, insert one document per row, then print the
# stored document count next to the row count as a sanity check.
coll = db['EUROSTAT_GOV10aEXP_VIVIENDA']
coll.drop()
coll.insert_many(df_combined.to_dict('records'))
print("Registros en la colección: ", coll.count_documents({}))
# The label now names the frame actually measured (it previously said df_clean).
print("Número de filas de df_combined: ", len(df_combined))

Registros en la colección:  116
Número de filas de df_clean:  116
