## Data cleaning

### Set up

We first import the different libraries that we will be using for this project

In [169]:
import pandas as pd
from datetime import datetime 
from unidecode import unidecode


We import our dataset

In [170]:
data_path="data/fires-all.csv"
try:
    fires=pd.read_csv(data_path)
except Exception as error:
    print(f"Error while importing the excel file: {error}")
fires.head()

Unnamed: 0,id,superficie,fecha,lat,lng,latlng_explicit,idcomunidad,idprovincia,idmunicipio,municipio,...,causa_supuesta,causa_desc,muertos,heridos,time_ctrl,time_ext,personal,medios,gastos,perdidas
0,1968290001,14.0,1968-01-01,,,0,4,29,0,INDETERMINADO,...,1,40,0,0,0,360,0,0,0,0
1,1968430003,3.0,1968-01-03,,,0,2,43,0,INDETERMINADO,...,1,0,0,0,0,60,0,0,0,0
2,1968290006,2.0,1968-01-06,,,0,4,29,0,INDETERMINADO,...,1,0,0,0,0,120,0,0,0,0
3,1968430016,600.0,1968-01-07,,,0,2,43,0,INDETERMINADO,...,1,20,0,0,0,1440,35,1,0,0
4,1968120007,8.2,1968-01-07,,,0,9,12,0,INDETERMINADO,...,1,20,0,0,0,120,0,0,0,0


We analyze the data and observe the type of data on each column and how many nulls values we have

In [171]:
fires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284589 entries, 0 to 284588
Data columns (total 21 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               284589 non-null  int64  
 1   superficie       284589 non-null  float64
 2   fecha            284589 non-null  object 
 3   lat              230154 non-null  float64
 4   lng              230154 non-null  float64
 5   latlng_explicit  284589 non-null  int64  
 6   idcomunidad      284589 non-null  int64  
 7   idprovincia      284589 non-null  int64  
 8   idmunicipio      284589 non-null  int64  
 9   municipio        284576 non-null  object 
 10  causa            284589 non-null  int64  
 11  causa_supuesta   284589 non-null  int64  
 12  causa_desc       284589 non-null  int64  
 13  muertos          284589 non-null  int64  
 14  heridos          284589 non-null  int64  
 15  time_ctrl        284589 non-null  int64  
 16  time_ext         284589 non-null  int6

### Remove unnecesary columns

In [172]:
#drop
try:
    fires.drop(["id","causa","causa_supuesta","causa_desc",
                "muertos","heridos","time_ctrl","time_ext",
                "personal","medios","gastos","perdidas",
                "latlng_explicit"],axis=1, inplace=True)
except Exception as error:
    print(f"ERROR while droping the columns {error}")
fires.head()

Unnamed: 0,superficie,fecha,lat,lng,idcomunidad,idprovincia,idmunicipio,municipio
0,14.0,1968-01-01,,,4,29,0,INDETERMINADO
1,3.0,1968-01-03,,,2,43,0,INDETERMINADO
2,2.0,1968-01-06,,,4,29,0,INDETERMINADO
3,600.0,1968-01-07,,,2,43,0,INDETERMINADO
4,8.2,1968-01-07,,,9,12,0,INDETERMINADO


### Remove unnecesary rows

In [173]:
#We only want the fires of the comunidad 3 (Galicia)
fires=fires[fires["idcomunidad"]==3]
#We drop null values on lat and lng, beacsue we cannot find where the fire happend,
#  and it is older data from  1968
fires=fires.dropna(subset=["lat","lng"])
#We remove the column idcomunidad as it is no longer need it
fires.drop(["idcomunidad"],axis=1,inplace=True)
#print how many null values are for each column
fires.isna().sum(),
#TODO: from the coordinates lat and lng obtain the blanks municipio

(superficie     0
 fecha          0
 lat            0
 lng            0
 idprovincia    0
 idmunicipio    0
 municipio      5
 dtype: int64,)

In [174]:
fires.head()

Unnamed: 0,superficie,fecha,lat,lng,idprovincia,idmunicipio,municipio
39703,20.0,1980-09-18,42.428281,-6.914337,32,9,"BARCO DE VALDEORRAS, O"
54407,2.0,1983-01-16,42.542185,-8.449205,36,12,COTOBADE
54410,6.0,1983-01-16,42.102572,-8.41592,36,34,"NEVES, AS"
54415,3.0,1983-01-17,43.629834,-7.367642,27,19,FOZ
54417,40.0,1983-01-18,43.018968,-7.408954,27,11,CASTROVERDE


In [175]:
#We save the data for the time series prediction
data_path="data/fires-time-series.xlsx"
try:
    fires.to_excel(data_path,index=False)
except Exception as error:
    print(f"Error while exporting the data to the excel file: {error}")

Error while exporting the data to the excel file: No module named 'openpyxl'


In [176]:
data_path="data/fires-time-series.xlsx"
try:
    fires_time_series=pd.read_excel(data_path)
except Exception as error:
    print(f"Error while importing the excel file: {error}")
fires.head()

Error while importing the excel file: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.


Unnamed: 0,superficie,fecha,lat,lng,idprovincia,idmunicipio,municipio
39703,20.0,1980-09-18,42.428281,-6.914337,32,9,"BARCO DE VALDEORRAS, O"
54407,2.0,1983-01-16,42.542185,-8.449205,36,12,COTOBADE
54410,6.0,1983-01-16,42.102572,-8.41592,36,34,"NEVES, AS"
54415,3.0,1983-01-17,43.629834,-7.367642,27,19,FOZ
54417,40.0,1983-01-18,43.018968,-7.408954,27,11,CASTROVERDE


### Añadir el tiempo para la prediccion con variables exogenas

#### Filtramos las fechas que sean mas antiguas del 2005

In [177]:
#Solo tenemos informacion a  partir del 2005 de meteorologia
#convertimos las fechas a tipo datetime (para una mejor compatibilidad)
fires['fecha'] = pd.to_datetime(fires['fecha'], errors='coerce')
#filtrar fechas que empiezen a partir del 2005 hasta el 2018
fires = fires.loc[fires['fecha'] >= '2009-01-01']
fires.head()

Unnamed: 0,superficie,fecha,lat,lng,idprovincia,idmunicipio,municipio
244077,1.0,2009-01-11,42.314177,-8.198452,32,18,CARBALLEDA DE AVIA
244079,5.0,2009-01-11,41.996375,-7.170321,32,34,"GUDIÑA, A"
244082,2.7,2009-01-11,42.333021,-8.314144,36,13,COVELO
244090,2.0,2009-01-12,41.995624,-7.174073,32,34,"GUDIÑA, A"
244094,11.1,2009-01-12,42.290851,-7.620661,32,43,MACEDA


In [178]:
valores_unicos = fires['idprovincia'].unique()
print(valores_unicos)
for valor in valores_unicos:
    print((fires['idprovincia'] == valor).sum())




[32 36 27 15]
4221
1349
1159
1608


In [179]:
fires[fires['idprovincia']==32].head()

Unnamed: 0,superficie,fecha,lat,lng,idprovincia,idmunicipio,municipio
244077,1.0,2009-01-11,42.314177,-8.198452,32,18,CARBALLEDA DE AVIA
244079,5.0,2009-01-11,41.996375,-7.170321,32,34,"GUDIÑA, A"
244090,2.0,2009-01-12,41.995624,-7.174073,32,34,"GUDIÑA, A"
244094,11.1,2009-01-12,42.290851,-7.620661,32,43,MACEDA
244227,1.5,2009-02-15,42.005325,-7.009491,32,48,"MEZQUITA, A"


In [180]:
#Juntar por semanas en la provincia de Ourense (id32)
# Convertir 'fecha' a datetime
fires['fecha'] = pd.to_datetime(fires['fecha'])

# Filtrar por la provincia 32
fires = fires[fires['idprovincia'] == 32]

fires.drop(["lat","lng","idmunicipio","municipio","idprovincia"],axis=1,inplace=True)

fires.head()

Unnamed: 0,superficie,fecha
244077,1.0,2009-01-11
244079,5.0,2009-01-11
244090,2.0,2009-01-12
244094,11.1,2009-01-12
244227,1.5,2009-02-15


In [181]:
# Convertir 'fecha' a formato datetime
fires["fecha"] = pd.to_datetime(fires["fecha"], format="%Y-%m-%d")

# Crear nuevas columnas para el año y la semana
fires['Anno'] = fires['fecha'].dt.year
fires['Semana'] = fires['fecha'].dt.isocalendar().week

fires = fires.groupby(["Anno", "Semana"]).agg({
    'superficie': 'sum',           # Suma de superficie
    'fecha': 'count'               # Cuenta de filas agrupadas (número de incendios)
}).rename(columns={'fecha': 'numero_incendios'}).round(2)

fires= fires.reset_index()
fires.head()

Unnamed: 0,Anno,Semana,superficie,numero_incendios
0,2009,2,6.0,2
1,2009,3,13.1,2
2,2009,7,1.5,1
3,2009,8,96.85,25
4,2009,9,227.66,64


In [182]:
fires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 342 entries, 0 to 341
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Anno              342 non-null    int32  
 1   Semana            342 non-null    UInt32 
 2   superficie        342 non-null    float64
 3   numero_incendios  342 non-null    int64  
dtypes: UInt32(1), float64(1), int32(1), int64(1)
memory usage: 8.5 KB


In [183]:
def normalize_header(header):
    replacements = {
        'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u',
        'ü': 'u', 'ñ': 'n', 'Á': 'A', 'É': 'E', 'Í': 'I',
        'Ó': 'O', 'Ú': 'U', 'Ü': 'U', 'Ñ': 'N'
    }
    for spanish_char, english_char in replacements.items():
        header = header.replace(spanish_char, english_char)
    return header

#### Juntamos todos los datos en un unico csv por estacion meteo


In [184]:
#Definimos las estaciones de meteorologia
stations=["C01_A Capela_","C02_Boimorto_","LU01_Castro de Rei_","LU02_Monforte de Lemos_"]
# stations=["C01_A Capela_"]
station_data = {station: pd.DataFrame() for station in stations}

dataset_all = pd.DataFrame()

#Iteramos por todos los años que tenemos disponibles de datos meteorologicos
for year in range(2009,2019):
    #Actualizamos el data path
    data_path=f"data/{year}_{year+1}/"
    #Por cada estacion lo abrimos y lo juntamos
    for station in stations:
        try:
            station_path=data_path+f"{station}01_01_{year}_01_01_{year+1}.csv"
            aux = pd.read_csv(station_path, encoding="utf-16")

            aux.columns = [unidecode(col) for col in aux.columns]
            aux = aux[["Fecha", "Temp Media (oC)", "Temp Max (oC)", "Temp Minima (oC)" , "Humedad Media (%)", "Humedad Max (%)", "Humedad Min (%)", "Velviento (m/s)","DirViento (o)", "VelVientoMax (m/s)", "Radiacion (MJ/m2)", "Precipitacion (mm)"]]

            # dataset_all = pd.concat([dataset_all, aux], ignore_index=True)

            #Guardamos en station data
            station_data[station]=pd.concat([station_data[station],aux],ignore_index=True)
        except pd.errors.ParserError as parse_error:
            print(f"[ERROR]: Parser error when reading {station_path}: {parse_error}")
        except FileNotFoundError as file_error:
            print(f"[ERROR]: File not found: {station_path}")
        except Exception as general_error:
            print(f"[ERROR]: General error occurred while reading the file: {general_error}")
    print(f"Datos cargados para del año {year}")



Datos cargados para del año 2009
Datos cargados para del año 2010
Datos cargados para del año 2011
Datos cargados para del año 2012
Datos cargados para del año 2013
Datos cargados para del año 2014
Datos cargados para del año 2015
Datos cargados para del año 2016
Datos cargados para del año 2017
Datos cargados para del año 2018


In [185]:
# Imputacion 

aux = ["Temp Media (oC)", "Temp Max (oC)", "Temp Minima (oC)" , "Humedad Media (%)", "Humedad Max (%)", "Humedad Min (%)", "Velviento (m/s)","DirViento (o)", "VelVientoMax (m/s)", "Radiacion (MJ/m2)", "Precipitacion (mm)"]
#  Iteramos sobre cada DataFrame en el diccionario `station_data`
for aux_value in aux :
    for station, data in station_data.items():
        try:
            # Verificamos si el DataFrame no está vacío
            if not data.empty:
                
                 # Reemplazamos valores vacíos ("" o None) con NaN
                data.replace(["", None], pd.NA, inplace=True)
                if aux_value in data.columns:
                    # Calculamos la media ignorando los valores NaN
                    column_mean = data[aux_value].mean(skipna=True).round(4)

                    # Imputamos los valores vacíos (NaN) con la media calculada
                    data[aux_value] = data[aux_value].fillna(column_mean)

                    # Guardamos el DataFrame modificado de vuelta en el diccionario
                    station_data[station] = data

            else:
                print(f"El DataFrame de la estación {station} está vacío.")

        except Exception as error:
            print(f"[ERROR]: Error al procesar la estación {station}: {error}")

print("Imputación completada.")

Imputación completada.


In [None]:
# Iteramos sobre cada estación y sus respectivos datos
for station, data in station_data.items():
    try:
        # Convertimos la columna 'Fecha' a formato datetime
        data["Fecha"] = pd.to_datetime(data["Fecha"], format="%d/%m/%Y")
                
        # Creamos nuevas columnas para el año y la semana
        data['Anno'] = data['Fecha'].dt.year
        data['Semana'] = data['Fecha'].dt.isocalendar().week
        
        # Agrupamos por año y semana y calculamos la media y suvarianza
        weekly_data = data.groupby(['Anno', 'Semana']).agg({
            "Temp Media (oC)": ['mean', 'var'],
            "Temp Max (oC)": ['mean', 'var'],
            "Temp Minima (oC)": ['mean', 'var'],
            "Humedad Media (%)": ['mean', 'var'],
            "Humedad Max (%)": ['mean', 'var'],
            "Humedad Min (%)": ['mean', 'var'],
            "Velviento (m/s)": ['mean', 'var'],
            "DirViento (o)": ['mean', 'var'],
            "VelVientoMax (m/s)": ['mean', 'var'],
            "Precipitacion (mm)": ['mean', 'var']
        }).round(4)

        # Aplanamos los nombres de columnas
        weekly_data.columns = ['_'.join(col).strip() for col in weekly_data.columns.values]

        # Reset index para tener un DataFrame estándar
        weekly_data = weekly_data.reset_index()
        
        for column in weekly_data.columns:
            if column not in ['Anno', 'Semana']:
                weekly_data[f'{column}_Semana_Pasada'] = weekly_data.groupby('Anno')[column].shift(1)

        # Guardar datos semanales en el diccionario
        station_data[station] = weekly_data

        #station_data[station]['Valor_Anterior'] = station_data[station]['Temp Media (oC)_mean'].shift(1)

        print(f"Procesamiento semanal completo para {station}")
    
    except Exception as error:
        print(f"[ERROR]: Error al procesar los datos semanales para {station}: {error}")


Procesamiento semanal completo para C01_A Capela_
Procesamiento semanal completo para C02_Boimorto_
Procesamiento semanal completo para LU01_Castro de Rei_
Procesamiento semanal completo para LU02_Monforte de Lemos_


In [187]:

# Ahora, para cada estación, concatenamos todos los DataFrames y los guardamos en un archivo CSV único
# Iteramos sobre cada estación y sus respectivos datos en el diccionario station_data
for station, data in station_data.items():
    try:
        # Definimos el nombre del archivo de salida
        output_filename = f"data/estaciones/{station}01_01_2009_01_01_2019.csv"
        
        # Guardamos los datos concatenados en un archivo CSV
        data.to_csv(output_filename, index=False, encoding="utf-16")
        print(f"Archivo guardado para {station}: {output_filename}")
    
    except Exception as error:
        # Manejo de errores al guardar los datos
        print(f"[ERROR]: Error al guardar los datos para {station}: {error}")


Archivo guardado para C01_A Capela_: data/estaciones/C01_A Capela_01_01_2009_01_01_2019.csv
Archivo guardado para C02_Boimorto_: data/estaciones/C02_Boimorto_01_01_2009_01_01_2019.csv
Archivo guardado para LU01_Castro de Rei_: data/estaciones/LU01_Castro de Rei_01_01_2009_01_01_2019.csv
Archivo guardado para LU02_Monforte de Lemos_: data/estaciones/LU02_Monforte de Lemos_01_01_2009_01_01_2019.csv


In [188]:
weather_fires = pd.merge(station_data['LU02_Monforte de Lemos_'], fires, on=['Anno', 'Semana'], how='inner')

In [189]:
weather_fires.head()

Unnamed: 0,Anno,Semana,Temp Media (oC)_mean,Temp Media (oC)_var,Temp Max (oC)_mean,Temp Max (oC)_var,Temp Minima (oC)_mean,Temp Minima (oC)_var,Humedad Media (%)_mean,Humedad Media (%)_var,...,DirViento (o)_mean_Semana_Pasada,DirViento (o)_var_Semana_Pasada,VelVientoMax (m/s)_mean_Semana_Pasada,VelVientoMax (m/s)_var_Semana_Pasada,Radiacion (MJ/m2)_mean_Semana_Pasada,Radiacion (MJ/m2)_var_Semana_Pasada,Precipitacion (mm)_mean_Semana_Pasada,Precipitacion (mm)_var_Semana_Pasada,superficie,numero_incendios
0,2009,2,-0.01,5.6949,6.0243,1.2704,-4.7957,7.627,84.9143,64.8848,...,265.225,1311.6225,2.225,0.4458,3.8575,1.3088,2.15,3.2367,6.0,2
1,2009,3,5.2371,9.2971,10.8743,6.9375,0.2771,23.3576,83.9286,22.9157,...,168.9271,7319.5652,3.3657,2.5485,5.18,4.3407,0.0286,0.0057,13.1,2
2,2009,7,5.6329,6.489,14.1614,4.7573,0.7,8.8114,85.1714,32.3357,...,246.6571,2146.7429,5.58,1.953,4.6,2.8064,5.0286,68.859,1.5,1
3,2009,8,4.8671,0.5325,16.6,3.1034,-2.0043,1.3417,80.6857,16.3514,...,175.3143,2610.2414,3.78,7.1619,8.48,11.9917,1.3714,7.179,96.85,25
4,2009,9,7.7414,2.1202,18.7829,13.3328,0.7271,8.2046,77.8057,61.9749,...,148.2,806.8433,2.6914,0.3075,11.5143,3.4459,0.0857,0.0114,227.66,64


In [190]:
weather_fires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 342 entries, 0 to 341
Data columns (total 48 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Anno                                   342 non-null    int32  
 1   Semana                                 342 non-null    UInt32 
 2   Temp Media (oC)_mean                   342 non-null    float64
 3   Temp Media (oC)_var                    342 non-null    float64
 4   Temp Max (oC)_mean                     342 non-null    float64
 5   Temp Max (oC)_var                      342 non-null    float64
 6   Temp Minima (oC)_mean                  342 non-null    float64
 7   Temp Minima (oC)_var                   342 non-null    float64
 8   Humedad Media (%)_mean                 342 non-null    float64
 9   Humedad Media (%)_var                  342 non-null    float64
 10  Humedad Max (%)_mean                   342 non-null    float64
 11  Humeda

In [191]:
#We save the data with the weather
data_path="data/fires-weather.csv"
try:
    weather_fires.to_csv(data_path,index=False)
except Exception as error:
    print(f"ERror while exporting the data to the excel file: {error}")

### Split train and test data

In [220]:

# Dividir los datos en train y test
df_train = weather_fires[weather_fires['Anno'] <= 2015].reset_index(drop=True) #Modificados
df_test = weather_fires[weather_fires['Anno'] >= 2016].reset_index(drop=True)

df_train = pd.DataFrame(df_train)
df_test = pd.DataFrame(df_test,)


# Calcular tamaños de train y test
train_size = len(df_train)
test_size = len(df_test)
total_size = len(weather_fires)

# Calcular porcentajes
train_percentage = (train_size / total_size) * 100
test_percentage = (test_size / total_size) * 100

# Mostrar resultados
print(f"\nTamaño total: {total_size}")
print(f"Train: {train_size} filas ({train_percentage:.2f}%)")
print(f"Test: {test_size} filas ({test_percentage:.2f}%)")

# Verificar los resultados
print("Train:")
print(df_train.head(), "\n--------------\n", df_train.tail())
print("\nTest:")
print(df_test.head(), "\n--------------\n", df_test.tail()) # No se usará hasta el final

x_train = df_train.drop(columns=['superficie', 'numero_incendios'])
y_train = df_train['superficie']

x_test = df_test.drop(columns=['superficie', 'numero_incendios'])
y_test = df_test['superficie']

x_train.head()


Tamaño total: 342
Train: 242 filas (70.76%)
Test: 100 filas (29.24%)
Train:
   Anno  Semana  Temp Media (oC)_mean  Temp Media (oC)_var  \
0  2009       2               -0.0100               5.6949   
1  2009       3                5.2371               9.2971   
2  2009       7                5.6329               6.4890   
3  2009       8                4.8671               0.5325   
4  2009       9                7.7414               2.1202   

   Temp Max (oC)_mean  Temp Max (oC)_var  Temp Minima (oC)_mean  \
0              6.0243             1.2704                -4.7957   
1             10.8743             6.9375                 0.2771   
2             14.1614             4.7573                 0.7000   
3             16.6000             3.1034                -2.0043   
4             18.7829            13.3328                 0.7271   

   Temp Minima (oC)_var  Humedad Media (%)_mean  Humedad Media (%)_var  ...  \
0                7.6270                 84.9143                64.88

Unnamed: 0,Anno,Semana,Temp Media (oC)_mean,Temp Media (oC)_var,Temp Max (oC)_mean,Temp Max (oC)_var,Temp Minima (oC)_mean,Temp Minima (oC)_var,Humedad Media (%)_mean,Humedad Media (%)_var,...,Velviento (m/s)_mean_Semana_Pasada,Velviento (m/s)_var_Semana_Pasada,DirViento (o)_mean_Semana_Pasada,DirViento (o)_var_Semana_Pasada,VelVientoMax (m/s)_mean_Semana_Pasada,VelVientoMax (m/s)_var_Semana_Pasada,Radiacion (MJ/m2)_mean_Semana_Pasada,Radiacion (MJ/m2)_var_Semana_Pasada,Precipitacion (mm)_mean_Semana_Pasada,Precipitacion (mm)_var_Semana_Pasada
0,2009,2,-0.01,5.6949,6.0243,1.2704,-4.7957,7.627,84.9143,64.8848,...,0.4225,0.0305,265.225,1311.6225,2.225,0.4458,3.8575,1.3088,2.15,3.2367
1,2009,3,5.2371,9.2971,10.8743,6.9375,0.2771,23.3576,83.9286,22.9157,...,0.5871,0.1133,168.9271,7319.5652,3.3657,2.5485,5.18,4.3407,0.0286,0.0057
2,2009,7,5.6329,6.489,14.1614,4.7573,0.7,8.8114,85.1714,32.3357,...,1.11,0.3937,246.6571,2146.7429,5.58,1.953,4.6,2.8064,5.0286,68.859
3,2009,8,4.8671,0.5325,16.6,3.1034,-2.0043,1.3417,80.6857,16.3514,...,0.77,0.4966,175.3143,2610.2414,3.78,7.1619,8.48,11.9917,1.3714,7.179
4,2009,9,7.7414,2.1202,18.7829,13.3328,0.7271,8.2046,77.8057,61.9749,...,0.44,0.0045,148.2,806.8433,2.6914,0.3075,11.5143,3.4459,0.0857,0.0114


## Train models

In [208]:

!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ------ --------------------------------- 1.8/11.0 MB 10.1 MB/s eta 0:00:01
   -------------- ------------------------- 3.9/11.0 MB 9.8 MB/s eta 0:00:01
   --------------------- ------------------ 6.0/11.0 MB 9.7 MB/s eta 0:00:01
   ---------------------------- ----------- 7.9/11.0 MB 9.4 MB/s eta 0:00:01
   --------------------------------

### Analyze train data set

In [221]:

from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn import metrics

x_train = x_train.dropna()
y_train = y_train[x_train.index]

x_test = x_test.dropna()
y_test = y_test[x_test.index]
# Data split
inner_cv = TimeSeriesSplit(n_splits=3)

param_grid = {
    'knn__n_neighbors': [1,5,7,9,11,13,15],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['minkowski', 'euclidean', 'manhattan'] 
}

# MinMaxScaler----------------------------

pipeline_minmax = Pipeline([
    ('scaler', MinMaxScaler()),
    ('knn', KNeighborsRegressor())
])

grid_minmax = GridSearchCV(pipeline_minmax,param_grid,cv=inner_cv, n_jobs=-1, scoring='neg_mean_absolute_error')

grid_minmax.fit(x_train,y_train)

print("Minmax: " + str(-grid_minmax.best_score_)+ " | Mejor hiperparámetro: "+ str(grid_minmax.best_params_))


# StandardScaler----------------------------

pipeline_std = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor())
])

grid_std = GridSearchCV(pipeline_std,param_grid,cv=inner_cv, n_jobs=-1,
                        scoring='neg_mean_absolute_error')

grid_std.fit(x_train,y_train)

print("Standard: " + str(-grid_std.best_score_)+ " | Mejor hiperparámetro: "+ str(grid_std.best_params_))


# RobustScaler----------------------------

pipeline_robust = Pipeline([
    ('scaler', RobustScaler()),
    ('knn', KNeighborsRegressor())
])

grid_robust = GridSearchCV(pipeline_robust,param_grid,cv=inner_cv, n_jobs=-1,
                            scoring='neg_mean_absolute_error')

grid_robust.fit(x_train,y_train)

print("Robust: " + str(-grid_robust.best_score_)+ " | Mejor hiperparámetro: "+ str(grid_robust.best_params_))

Minmax: 404.53768494983007 | Mejor hiperparámetro: {'knn__metric': 'manhattan', 'knn__n_neighbors': 15, 'knn__weights': 'distance'}
Standard: 390.4780790960452 | Mejor hiperparámetro: {'knn__metric': 'manhattan', 'knn__n_neighbors': 11, 'knn__weights': 'uniform'}
Robust: 378.85429378531074 | Mejor hiperparámetro: {'knn__metric': 'manhattan', 'knn__n_neighbors': 7, 'knn__weights': 'uniform'}


In [224]:
import time
knn_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('knn', KNeighborsRegressor())]
)

# KNN con hiperparámetros por omisión ---------------------------------------

start_time = time.time()

knn = cross_val_score(knn_pipe,x_train,y_train,scoring="neg_mean_absolute_error", cv=inner_cv)

end_time = time.time()

print(f"Average cross validation MAE is (omisión): {-knn.mean():.2f} ± {knn.std():.2f}")
print(f"Tiempo de ejecución: {(end_time - start_time):.4f} segundos.\n")

# KNN con ajuste de hiperparámetros ---------------------------------------
param_grid = {
    'knn__n_neighbors': [3,5,7,11,15,19,23,25],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['minkowski', 'euclidean', 'manhattan'] 
}

start_time = time.time()

knn_hpo = GridSearchCV(knn_pipe, param_grid, scoring='neg_mean_absolute_error', cv=inner_cv, n_jobs=-1, verbose=1)
knn_hpo.fit(x_train, y_train)

end_time = time.time()


print(f"Mejor puntuación MAE: ", -knn_hpo.best_score_)
print("Mejores hiperparámetros:", knn_hpo.best_params_)
print(f"Tiempo de ejecución: {(end_time - start_time):.4f} segundos.")

Average cross validation MAE is (omisión): 428.98 ± 192.96
Tiempo de ejecución: 0.0685 segundos.

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Mejor puntuación MAE:  378.85429378531074
Mejores hiperparámetros: {'knn__metric': 'manhattan', 'knn__n_neighbors': 7, 'knn__weights': 'uniform'}
Tiempo de ejecución: 0.1726 segundos.


In [225]:
from sklearn.tree import DecisionTreeRegressor

tree_pipe = Pipeline([
    ('tree', DecisionTreeRegressor(random_state=100472313))
])

# Árbol de regresión por omisión --------------------------

start_time = time.time()
tree = cross_val_score(tree_pipe,x_train,y_train,scoring="neg_mean_absolute_error", cv=inner_cv)
end_time= time.time()

print(f"Average cross validation MAE is (omisión): {-tree.mean():.2f} ± {tree.std():.2f}")
print(f"Tomó {(end_time - start_time):.4f} segundos.")



#Árboles de regresión con HPO -----------------------------

param_grid = {
    'tree__criterion':['squared_error', 'friedman_mse', 'absolute_error'],
    'tree__max_depth': [7,9,11],
    'tree__min_samples_split': [40 ,50, 70, 90, 110, 130],
}

start_time = time.time()

tree_hpo = GridSearchCV(tree_pipe, param_grid, scoring='neg_mean_absolute_error', cv=inner_cv, n_jobs=-1, verbose=1)
tree_hpo.fit(x_train, y_train)

end_time = time.time()

print(f"Mejor puntuación MAE: ", -tree_hpo.best_score_)
print("Mejores hiperparámetros:", tree_hpo.best_params_)
print(f"Tiempo de ejecución: {(end_time - start_time):.4f} segundos.")


Average cross validation MAE is (omisión): 638.22 ± 162.10
Tomó 0.0481 segundos.
Fitting 3 folds for each of 54 candidates, totalling 162 fits
Mejor puntuación MAE:  320.1911581920904
Mejores hiperparámetros: {'tree__criterion': 'absolute_error', 'tree__max_depth': 7, 'tree__min_samples_split': 130}
Tiempo de ejecución: 0.4492 segundos.


### Time series

In [196]:
#TODO

### Con variables exogenas

In [197]:
#TODO

## Performances

In [198]:
#TODO

## Test Model


In [199]:
#TODO