In [1]:
import pandas as pd
import requests
import urllib
import json
import time

# 1. IMPORT DATA

### En primer lugar, importamos el archivo que contiene los datos necesarios para entrenar los distintos modelos de Machine Learning que en un futuro utilizaremos.

In [2]:
# Load the historical traffic measurements; the file is semicolon-delimited.
historic_traffic_data = pd.read_csv('data/04_2025.csv', delimiter=';')
historic_traffic_data.head()

Unnamed: 0,id,fecha,tipo_elem,intensidad,ocupacion,carga,vmed,error,periodo_integracion
0,1001,01/04/2025 9:45,C30,2260,8,0,57.0,N,3
1,1001,01/04/2025 10:00,C30,2172,7,0,59.0,N,5
2,1001,01/04/2025 10:15,C30,1680,6,0,54.0,N,2
3,1002,01/04/2025 0:00,C30,0,0,0,0.0,N,5
4,1002,01/04/2025 0:15,C30,0,0,0,0.0,N,5


### A continuación, importamos el .csv que contiene los datos de los diferentes puntos de medida del nivel de tráfico en Madrid. De dicho fichero extraemos dos columnas: 'id', que sirve como clave de unión, y 'distrito', que añadiremos al primer DataFrame.

In [3]:
# Only these two columns of the measurement-point file are loaded.
columns = ['distrito', 'id']
measure_points_data = pd.read_csv(
    'data/pmed_ubicacion_04_2025.csv',
    delimiter=';',
    usecols=columns,
    encoding='utf-8',
)
measure_points_data.head()

Unnamed: 0,distrito,id
0,8.0,6639
1,8.0,3797
2,5.0,6640
3,5.0,6642
4,5.0,6643


### Antes de añadir dichas columnas al DataFrame principal, estudiamos la presencia de valores nulos o en blanco en measure_points_data:

In [4]:
# Print the dtype and non-null count of each column to spot missing values.
measure_points_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4973 entries, 0 to 4972
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   distrito  4968 non-null   float64
 1   id        4973 non-null   int64  
dtypes: float64(1), int64(1)
memory usage: 77.8 KB


### La columna 'distrito' posee 5 valores nulos, por lo que procederemos a eliminar dichas filas.

In [5]:
# Discard every row that contains a missing value, then print the summary again
# to confirm that no nulls remain.
measure_points_data = measure_points_data.dropna()
measure_points_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4968 entries, 0 to 4972
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   distrito  4968 non-null   float64
 1   id        4968 non-null   int64  
dtypes: float64(1), int64(1)
memory usage: 116.4 KB


### Unimos ambos DataFrames mediante la columna 'id', que representa lo mismo en los dos. Añadir la columna 'distrito' al primer DataFrame nos ayudará posteriormente a realizar agrupaciones o a ver en qué distrito se genera una mayor cantidad de atascos.

In [41]:
# Inner-join every traffic record with the district of its measurement point,
# using the 'id' column that both frames share as the key.
data = pd.merge(historic_traffic_data, measure_points_data, how='inner', on='id')
data

Unnamed: 0,id,fecha,tipo_elem,intensidad,ocupacion,carga,vmed,error,periodo_integracion,distrito
0,1001,01/04/2025 9:45,C30,2260,8,0,57.0,N,3,10.0
1,1001,01/04/2025 10:00,C30,2172,7,0,59.0,N,5,10.0
2,1001,01/04/2025 10:15,C30,1680,6,0,54.0,N,2,10.0
3,1002,01/04/2025 0:00,C30,0,0,0,0.0,N,5,10.0
4,1002,01/04/2025 0:15,C30,0,0,0,0.0,N,5,10.0
...,...,...,...,...,...,...,...,...,...,...
1048570,3805,07/04/2025 12:45,URB,707,2,29,0.0,N,15,13.0
1048571,3805,07/04/2025 13:00,URB,703,3,32,0.0,N,15,13.0
1048572,3805,07/04/2025 13:15,URB,822,4,35,0.0,N,15,13.0
1048573,3805,07/04/2025 13:30,URB,679,3,35,0.0,N,15,13.0


### Al DataFrame principal vamos a añadirle ahora datos sobre las precipitaciones que tuvieron lugar cada día. Para ello, hacemos uso de la API de AEMET.

In [7]:
import os
from dotenv import load_dotenv

def get_data_from_aemet(idema: str = '3195') -> pd.DataFrame:
    """Download daily climatological records from the AEMET OpenData API.

    The API is queried once per pair of consecutive entries in the ``dates``
    list below. Each query returns a JSON document whose ``datos`` field is a
    URL; that URL is downloaded in a second request to obtain the records.

    The API key is read from the ``AEMET_API_KEY`` environment variable after
    ``load_dotenv()`` has been called. A window that fails (network error,
    invalid JSON, or a response without ``datos``) is reported on stdout and
    skipped, so one bad window does not discard the others.

    Args:
        idema: Station identifier placed in the request URL. Defaults to
            '3195', the station this notebook queries.

    Returns:
        A DataFrame with the records of every window that succeeded, with
        fully duplicated rows removed and a fresh 0..n-1 index. Missing values
        are left as NaN. An empty DataFrame is returned if no window succeeded.
    """
    load_dotenv()
    api_key = os.getenv('AEMET_API_KEY')

    # Loop-invariant request settings, built once.
    base_url = 'https://opendata.aemet.es/opendata'
    headers = {
        "Accept": "application/json",
        "api_key": api_key
    }

    # The maximum time span that can be requested at once is 6 months, so the
    # full period is cut into consecutive windows delimited by these dates.
    dates = ['2013-07-01T00:00:00UTC', '2014-01-01T00:00:00UTC', '2014-07-01T00:00:00UTC', '2015-01-01T00:00:00UTC', '2015-07-01T00:00:00UTC',
            '2016-01-01T00:00:00UTC', '2016-07-01T00:00:00UTC', '2017-01-01T00:00:00UTC', '2017-07-01T00:00:00UTC', '2018-01-01T00:00:00UTC',
            '2018-07-01T00:00:00UTC', '2019-01-01T00:00:00UTC', '2019-07-01T00:00:00UTC', '2020-01-01T00:00:00UTC', '2020-07-01T00:00:00UTC',
            '2021-01-01T00:00:00UTC', '2021-07-01T00:00:00UTC', '2022-01-01T00:00:00UTC', '2022-07-01T00:00:00UTC', '2023-01-01T00:00:00UTC',
            '2023-07-01T00:00:00UTC', '2024-01-01T00:00:00UTC', '2024-07-01T00:00:00UTC', '2025-01-01T00:00:00UTC', '2025-04-30T00:00:00UTC']

    # One DataFrame per successful window; concatenated once after the loop
    # instead of growing a DataFrame inside it.
    window_frames = []

    for fechaIniStr, fechaFinStr in zip(dates, dates[1:]):
        endpoint = f"/api/valores/climatologicos/diarios/datos/fechaini/{fechaIniStr}/fechafin/{fechaFinStr}/estacion/{idema}"
        url = base_url + endpoint

        try:
            response = requests.get(url, headers=headers, timeout=15)
            metadata = response.json()

            if metadata.get('datos') is not None:
                # 'datos' holds the URL of the actual records. Fetching it with
                # requests gives the download a timeout and releases the
                # connection, and the body is decoded with the charset the
                # server declares.
                datos_response = requests.get(metadata['datos'], timeout=15)
                datos_response.raise_for_status()
                window_frames.append(pd.DataFrame(datos_response.json()))
            else:
                print(f"No se encontró la clave 'datos' en la respuesta para fechas {fechaIniStr} a {fechaFinStr}")
                print(f"Error {response.status_code}: {response.text}")
        except Exception as e:
            # Deliberately broad: report the failed window and keep going.
            print(f"Error durante la solicitud para {fechaIniStr} a {fechaFinStr}: {e}")

        # Pause between windows whether or not the request succeeded, so that
        # failing windows are not requested back-to-back.
        time.sleep(1)

    if not window_frames:
        return pd.DataFrame()

    # Consecutive windows share their boundary date, so that day can be
    # returned twice; drop the repeated rows.
    return (
        pd.concat(window_frames, ignore_index=True)
        .drop_duplicates()
        .reset_index(drop=True)
    )

In [8]:
# Fetch the weather records; this call performs HTTP requests to the AEMET API.
aemet_dataframe = get_data_from_aemet()
aemet_dataframe

Unnamed: 0,fecha,indicativo,nombre,provincia,altitud,tmed,prec,tmin,horatmin,tmax,...,horaracha,presMax,horaPresMax,presMin,horaPresMin,hrMedia,hrMax,horaHrMax,hrMin,horaHrMin
0,2013-07-01,3195,"MADRID, RETIRO",MADRID,667,274,00,205,Varias,342,...,17:05,9402,07,9366,19,37,50,07:00,26,15:30
1,2013-07-02,3195,"MADRID, RETIRO",MADRID,667,262,00,200,06:15,323,...,15:00,9411,09,9378,18,39,58,07:00,24,17:00
2,2013-07-03,3195,"MADRID, RETIRO",MADRID,667,266,00,186,04:40,347,...,23:30,9431,24,9402,00,37,62,04:40,26,Varias
3,2013-07-04,3195,"MADRID, RETIRO",MADRID,667,284,00,214,06:15,355,...,22:40,9451,24,9419,18,39,63,05:00,26,Varias
4,2013-07-05,3195,"MADRID, RETIRO",MADRID,667,296,00,220,06:10,372,...,00:10,9462,07,9430,18,35,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4317,2025-04-26,3195,"MADRID, RETIRO",MADRID,667,167,00,118,05:20,216,...,21:50,9439,23,9372,02,48,78,05:00,31,18:00
4318,2025-04-27,3195,"MADRID, RETIRO",MADRID,667,152,00,90,05:40,214,...,11:50,9446,09,9418,18,47,71,Varias,34,14:50
4319,2025-04-28,3195,"MADRID, RETIRO",MADRID,667,171,00,107,06:00,235,...,23:00,9434,00,9389,18,45,73,05:40,28,14:20
4320,2025-04-29,3195,"MADRID, RETIRO",MADRID,667,170,00,118,06:00,223,...,14:10,9409,00,9360,16,52,69,06:40,38,14:10


### Los dos DataFrames tienen la columna 'fecha' en común, por lo que la utilizaremos como clave para unirlos. Antes de realizar dicha unión, tenemos que llevar a cabo ciertas transformaciones en ambos.

In [42]:
# DATAFRAME DATA -> split 'fecha' into a date column and a time column.
#
# The raw 'fecha' values look like '01/04/2025 9:45'. They are parsed once and
# written back as a 'dd/mm/YYYY' string in 'fecha' plus a datetime.time value
# in the new 'hora' column.
#
# The guard makes the cell safe to re-run: once 'hora' exists, 'fecha' no
# longer contains the time part and parsing it again would fail.
if 'hora' not in data.columns:
    timestamps = pd.to_datetime(data['fecha'], format='%d/%m/%Y %H:%M')
    data['fecha'] = timestamps.dt.strftime('%d/%m/%Y')
    data['hora'] = timestamps.dt.time
data


Unnamed: 0,id,fecha,tipo_elem,intensidad,ocupacion,carga,vmed,error,periodo_integracion,distrito,hora
0,1001,01/04/2025,C30,2260,8,0,57.0,N,3,10.0,09:45:00
1,1001,01/04/2025,C30,2172,7,0,59.0,N,5,10.0,10:00:00
2,1001,01/04/2025,C30,1680,6,0,54.0,N,2,10.0,10:15:00
3,1002,01/04/2025,C30,0,0,0,0.0,N,5,10.0,00:00:00
4,1002,01/04/2025,C30,0,0,0,0.0,N,5,10.0,00:15:00
...,...,...,...,...,...,...,...,...,...,...,...
1048570,3805,07/04/2025,URB,707,2,29,0.0,N,15,13.0,12:45:00
1048571,3805,07/04/2025,URB,703,3,32,0.0,N,15,13.0,13:00:00
1048572,3805,07/04/2025,URB,822,4,35,0.0,N,15,13.0,13:15:00
1048573,3805,07/04/2025,URB,679,3,35,0.0,N,15,13.0,13:30:00


In [None]:
# DATAFRAME AEMET -> rewrite 'fecha' from 'YYYY-MM-DD' to a 'dd/mm/YYYY' string.
#
# The input format is stated explicitly. Without it, re-running this cell on
# the already converted 'dd/mm/YYYY' strings could be parsed month-first and
# silently swap day and month; with it, a second run fails loudly instead.
aemet_dataframe['fecha'] = pd.to_datetime(aemet_dataframe['fecha'], format='%Y-%m-%d').dt.strftime('%d/%m/%Y')
aemet_dataframe

Unnamed: 0,fecha,indicativo,nombre,provincia,altitud,tmed,prec,tmin,horatmin,tmax,...,horaracha,presMax,horaPresMax,presMin,horaPresMin,hrMedia,hrMax,horaHrMax,hrMin,horaHrMin
0,01/07/2013,3195,"MADRID, RETIRO",MADRID,667,274,00,205,Varias,342,...,17:05,9402,07,9366,19,37,50,07:00,26,15:30
1,02/07/2013,3195,"MADRID, RETIRO",MADRID,667,262,00,200,06:15,323,...,15:00,9411,09,9378,18,39,58,07:00,24,17:00
2,03/07/2013,3195,"MADRID, RETIRO",MADRID,667,266,00,186,04:40,347,...,23:30,9431,24,9402,00,37,62,04:40,26,Varias
3,04/07/2013,3195,"MADRID, RETIRO",MADRID,667,284,00,214,06:15,355,...,22:40,9451,24,9419,18,39,63,05:00,26,Varias
4,05/07/2013,3195,"MADRID, RETIRO",MADRID,667,296,00,220,06:10,372,...,00:10,9462,07,9430,18,35,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4317,26/04/2025,3195,"MADRID, RETIRO",MADRID,667,167,00,118,05:20,216,...,21:50,9439,23,9372,02,48,78,05:00,31,18:00
4318,27/04/2025,3195,"MADRID, RETIRO",MADRID,667,152,00,90,05:40,214,...,11:50,9446,09,9418,18,47,71,Varias,34,14:50
4319,28/04/2025,3195,"MADRID, RETIRO",MADRID,667,171,00,107,06:00,235,...,23:00,9434,00,9389,18,45,73,05:40,28,14:20
4320,29/04/2025,3195,"MADRID, RETIRO",MADRID,667,170,00,118,06:00,223,...,14:10,9409,00,9360,16,52,69,06:40,38,14:10


In [47]:
# Keep only the 'fecha' and 'prec' columns of the weather frame.
# NOTE(review): 'prec' is taken as delivered by the API; confirm whether it
# still needs converting to a numeric dtype before modelling.
required_columns = aemet_dataframe.loc[:, ['fecha', 'prec']]
required_columns

Unnamed: 0,fecha,prec
0,01/07/2013,00
1,02/07/2013,00
2,03/07/2013,00
3,04/07/2013,00
4,05/07/2013,00
...,...,...
4317,26/04/2025,00
4318,27/04/2025,00
4319,28/04/2025,00
4320,29/04/2025,00


In [48]:
# Attach the 'prec' value of each date to every traffic record of that date.
# validate='many_to_one' makes pandas raise a MergeError if a 'fecha' value is
# repeated in required_columns, instead of silently duplicating traffic rows.
data = data.merge(required_columns, on='fecha', validate='many_to_one')
data

Unnamed: 0,id,fecha,tipo_elem,intensidad,ocupacion,carga,vmed,error,periodo_integracion,distrito,hora,prec
0,1001,01/04/2025,C30,2260,8,0,57.0,N,3,10.0,09:45:00,00
1,1001,01/04/2025,C30,2172,7,0,59.0,N,5,10.0,10:00:00,00
2,1001,01/04/2025,C30,1680,6,0,54.0,N,2,10.0,10:15:00,00
3,1002,01/04/2025,C30,0,0,0,0.0,N,5,10.0,00:00:00,00
4,1002,01/04/2025,C30,0,0,0,0.0,N,5,10.0,00:15:00,00
...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,3805,07/04/2025,URB,707,2,29,0.0,N,15,13.0,12:45:00,00
1048571,3805,07/04/2025,URB,703,3,32,0.0,N,15,13.0,13:00:00,00
1048572,3805,07/04/2025,URB,822,4,35,0.0,N,15,13.0,13:15:00,00
1048573,3805,07/04/2025,URB,679,3,35,0.0,N,15,13.0,13:30:00,00
