In [21]:
import os
from collections import defaultdict
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
import requests
from pymongo import MongoClient

In [22]:
# INE "wstempus" DATOS_TABLA endpoint; the trailing path segment is presumably
# the numeric id of the table being requested.
INE_TABLE_URL = "https://servicios.ine.es/wstempus/js/ES/DATOS_TABLA/{table_id}"

# One table per year: 59777 is used for 2023 and 33525 for 2019.
url_2023 = INE_TABLE_URL.format(table_id=59777)
url_2019 = INE_TABLE_URL.format(table_id=33525)

In [23]:
# Seconds to wait for the server before giving up. Without an explicit timeout
# `requests` waits indefinitely, which would hang the kernel on a stalled connection.
REQUEST_TIMEOUT_S = 30

# Download the 2023 table; raise_for_status() turns HTTP 4xx/5xx into an exception
# so an error page is never parsed as data.
resp_2023 = requests.get(url_2023, timeout=REQUEST_TIMEOUT_S)
resp_2023.raise_for_status()
data_2023 = resp_2023.json()

# Same for the 2019 table.
resp_2019 = requests.get(url_2019, timeout=REQUEST_TIMEOUT_S)
resp_2019.raise_for_status()
data_2019 = resp_2019.json()

In [24]:
# 1. Keep the text before the first comma of every series name (the age range)
age_labels_2019 = [entry['Nombre'].partition(',')[0].strip() for entry in data_2019]
age_labels_2023 = [entry['Nombre'].partition(',')[0].strip() for entry in data_2023]

# 2. Merge both years into one sorted list without duplicates
unique_ages = sorted({*age_labels_2019, *age_labels_2023})

# 3. Print one bullet per label; repr() makes case/whitespace differences visible
print("Valores únicos de rango de edad en 2019 y 2023:")
for label in unique_ages:
    print(f"- {label!r}")

Valores únicos de rango de edad en 2019 y 2023:
- '18 y 19'
- '20 a 24'
- '25 a 29'
- '30 a 34'
- '35 a 39'
- '40 a 44'
- '45 a 49'
- '50 a 54'
- '55 a 59'
- '60 a 64'
- '65 a 69'
- '70 a 74'
- '75 a 79'
- '80 a 84'
- '85 y más'
- 'TOTAL'
- 'Total'


In [51]:
# Maps the sex label found in a series name to a one-letter code
sex_map = {'Mujeres':'F', 'Hombres':'M', 'Total':'T'}

def process_censo_json(censo_json: list[dict], year: int) -> pd.DataFrame:
    """Flatten an INE census payload into one row per (age range, sex).

    Each item is expected to carry a ``'Nombre'`` of the form
    ``"<age range>, <census type>, <sex>"`` (e.g. ``"18 y 19, CER, Total"``)
    and a ``'Data'`` list whose first element holds the figure under ``'Valor'``.

    Items are skipped when the name does not split into exactly three
    comma-separated parts, when the census type is not ``CER``
    (case-insensitive), or when the sex label is not a key of ``sex_map``.

    Args:
        censo_json: Parsed JSON list of series.
        year: Value stored in the ``year`` column of every row.

    Returns:
        DataFrame with columns ``year``, ``age_range`` (categorical,
        title-cased), ``sex`` (``'F'``/``'M'``/``'T'``) and ``census``.
        The columns are present even when no item passes the filters.
        ``census`` is missing (NaN/None) for a kept item that has no data
        point.
    """
    columns = ['year', 'age_range', 'sex', 'census']
    rows = []
    for item in censo_json:
        parts = [x.strip() for x in item['Nombre'].split(',')]
        if len(parts) != 3:
            continue
        age_range, censo_type, sexo_texto = parts

        # Only CER series are kept
        if censo_type.upper() != 'CER':
            continue

        # Normalise the sex label ("TOTAL" -> "Total") before the lookup
        sexo_key = sexo_texto.capitalize()
        if sexo_key not in sex_map:
            continue

        # Read the value only for rows that are kept, so a discarded series
        # with an empty 'Data' list cannot abort the whole run.
        data_points = item.get('Data') or []
        valor = data_points[0].get('Valor') if data_points else None

        rows.append({
            'year':      year,
            'age_range': age_range,      # the range is kept as a label
            'sex':       sex_map[sexo_key],
            'census':    valor,
        })

    # Passing `columns` guarantees the schema even when `rows` is empty;
    # otherwise the 'age_range' lookup below would raise KeyError.
    df = pd.DataFrame(rows, columns=columns)

    # Title-casing merges 'TOTAL' and 'Total' into a single label. It also
    # upper-cases connectors ('18 y 19' -> '18 Y 19'); kept as-is so stored
    # labels do not change.
    df['age_range'] = df['age_range'].str.title().astype('category')
    return df

In [52]:
# Build one frame per year (2019 first, then 2023); age ranges are kept as
# labels rather than expanded.
df19, df23 = (
    process_censo_json(payload, census_year)
    for payload, census_year in ((data_2019, 2019), (data_2023, 2023))
)

In [53]:
# Distinct age-range labels present in the 2019 frame
df19['age_range'].unique()

['Total', '18 Y 19', '20 A 24', '25 A 29', '30 A 34', ..., '65 A 69', '70 A 74', '75 A 79', '80 A 84', '85 Y Más']
Length: 16
Categories (16, object): ['18 Y 19', '20 A 24', '25 A 29', '30 A 34', ..., '75 A 79', '80 A 84', '85 Y Más', 'Total']

In [54]:
# Distinct age-range labels present in the 2023 frame; they must match the
# 2019 labels for the concatenated column to stay categorical.
df23['age_range'].unique()

['Total', '18 Y 19', '20 A 24', '25 A 29', '30 A 34', ..., '65 A 69', '70 A 74', '75 A 79', '80 A 84', '85 Y Más']
Length: 16
Categories (16, object): ['18 Y 19', '20 A 24', '25 A 29', '30 A 34', ..., '75 A 79', '80 A 84', '85 Y Más', 'Total']

In [55]:
# Stack both years into a single frame with a fresh 0..n-1 index
df_censo = pd.concat([df19, df23], ignore_index=True)

# pd.concat keeps the categorical dtype only when both inputs have identical
# categories; otherwise the column silently becomes object and `.cat` below
# raises AttributeError. Re-casting leaves an already-categorical column
# unchanged and restores the dtype in the other case.
df_censo['age_range'] = df_censo['age_range'].astype('category')

# Verification
print(df_censo.dtypes)
print(df_censo['age_range'].cat.categories)

year            int64
age_range    category
sex            object
census        float64
dtype: object
Index(['18 Y 19', '20 A 24', '25 A 29', '30 A 34', '35 A 39', '40 A 44',
       '45 A 49', '50 A 54', '55 A 59', '60 A 64', '65 A 69', '70 A 74',
       '75 A 79', '80 A 84', '85 Y Más', 'Total'],
      dtype='object')


In [56]:
df_censo.head()

Unnamed: 0,year,age_range,sex,census
0,2019,Total,F,17996382.0
1,2019,Total,M,16874100.0
2,2019,Total,T,34870482.0
3,2019,18 Y 19,F,406085.0
4,2019,18 Y 19,M,429591.0


In [57]:
# Connection settings.
# Credentials are not stored in the notebook: every value can be supplied through
# an environment variable (MONGO_USER, MONGO_PASSWORD, MONGO_HOST, MONGO_PORT).
# The password has no default; when MONGO_PASSWORD is unset or empty it is asked
# for interactively, so it never ends up in the saved file.
usuario = os.environ.get("MONGO_USER", "jalope")
contrasena = os.environ.get("MONGO_PASSWORD") or getpass("MongoDB password: ")
host = os.environ.get("MONGO_HOST", "127.0.0.1")
puerto = os.environ.get("MONGO_PORT", "27250")

# quote_plus() escapes characters such as '@' or ':' that would otherwise break the URI
uri = f"mongodb://{quote_plus(usuario)}:{quote_plus(contrasena)}@{host}:{puerto}/?directConnection=true"
client = MongoClient(uri)
db = client["tfm_db"]

In [58]:
coll = db['INE_59777_33525_CENSO_RAW']

# Materialise the documents first: an empty frame must not wipe the existing
# collection (insert_many also rejects an empty list, but only after drop()).
records = df_censo.to_dict('records')
if not records:
    raise ValueError("df_censo está vacío; no se reemplaza la colección")

# Full reload: discard whatever the collection held and insert the new rows
coll.drop()
coll.insert_many(records)

# Both numbers should be equal
print("Registros en la colección: ", coll.count_documents({}))
print("Número de filas de df_censo: ", len(df_censo))

Registros en la colección:  96
Número de filas de df_raw:  96
