## Data cleaning

### Set up

We first import the different libraries that we will be using for this project

In [15]:
import pandas as pd
from datetime import datetime 
from unidecode import unidecode


We import our dataset

In [2]:
data_path="data/fires-all.csv"
try:
    fires=pd.read_csv(data_path)
except Exception as error:
    print(f"Error while importing the excel file: {error}")
fires.head()

Unnamed: 0,id,superficie,fecha,lat,lng,latlng_explicit,idcomunidad,idprovincia,idmunicipio,municipio,...,causa_supuesta,causa_desc,muertos,heridos,time_ctrl,time_ext,personal,medios,gastos,perdidas
0,1968290001,14.0,1968-01-01,,,0,4,29,0,INDETERMINADO,...,1,40,0,0,0,360,0,0,0,0
1,1968430003,3.0,1968-01-03,,,0,2,43,0,INDETERMINADO,...,1,0,0,0,0,60,0,0,0,0
2,1968290006,2.0,1968-01-06,,,0,4,29,0,INDETERMINADO,...,1,0,0,0,0,120,0,0,0,0
3,1968430016,600.0,1968-01-07,,,0,2,43,0,INDETERMINADO,...,1,20,0,0,0,1440,35,1,0,0
4,1968120007,8.2,1968-01-07,,,0,9,12,0,INDETERMINADO,...,1,20,0,0,0,120,0,0,0,0


We analyze the data and observe the type of data on each column and how many nulls values we have

In [3]:
fires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284589 entries, 0 to 284588
Data columns (total 21 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               284589 non-null  int64  
 1   superficie       284589 non-null  float64
 2   fecha            284589 non-null  object 
 3   lat              230154 non-null  float64
 4   lng              230154 non-null  float64
 5   latlng_explicit  284589 non-null  int64  
 6   idcomunidad      284589 non-null  int64  
 7   idprovincia      284589 non-null  int64  
 8   idmunicipio      284589 non-null  int64  
 9   municipio        284576 non-null  object 
 10  causa            284589 non-null  int64  
 11  causa_supuesta   284589 non-null  int64  
 12  causa_desc       284589 non-null  int64  
 13  muertos          284589 non-null  int64  
 14  heridos          284589 non-null  int64  
 15  time_ctrl        284589 non-null  int64  
 16  time_ext         284589 non-null  int6

### Remove unnecesary columns

In [4]:
#drop
try:
    fires.drop(["id","causa","causa_supuesta","causa_desc",
                "muertos","heridos","time_ctrl","time_ext",
                "personal","medios","gastos","perdidas",
                "latlng_explicit"],axis=1, inplace=True)
except Exception as error:
    print(f"ERROR while droping the columns {error}")
fires.head()

Unnamed: 0,superficie,fecha,lat,lng,idcomunidad,idprovincia,idmunicipio,municipio
0,14.0,1968-01-01,,,4,29,0,INDETERMINADO
1,3.0,1968-01-03,,,2,43,0,INDETERMINADO
2,2.0,1968-01-06,,,4,29,0,INDETERMINADO
3,600.0,1968-01-07,,,2,43,0,INDETERMINADO
4,8.2,1968-01-07,,,9,12,0,INDETERMINADO


### Remove unnecesary rows

In [5]:
#We only want the fires of the comunidad 3 (Galicia)
fires=fires[fires["idcomunidad"]==3]
#We drop null values on lat and lng, beacsue we cannot find where the fire happend,
#  and it is older data from  1968
fires=fires.dropna(subset=["lat","lng"])
#We remove the column idcomunidad as it is no longer need it
fires.drop(["idcomunidad"],axis=1,inplace=True)
#print how many null values are for each column
fires.isna().sum(),
#TODO: from the coordinates lat and lng obtain the blanks municipio

(superficie     0
 fecha          0
 lat            0
 lng            0
 idprovincia    0
 idmunicipio    0
 municipio      5
 dtype: int64,)

In [6]:
fires.head()

Unnamed: 0,superficie,fecha,lat,lng,idprovincia,idmunicipio,municipio
39703,20.0,1980-09-18,42.428281,-6.914337,32,9,"BARCO DE VALDEORRAS, O"
54407,2.0,1983-01-16,42.542185,-8.449205,36,12,COTOBADE
54410,6.0,1983-01-16,42.102572,-8.41592,36,34,"NEVES, AS"
54415,3.0,1983-01-17,43.629834,-7.367642,27,19,FOZ
54417,40.0,1983-01-18,43.018968,-7.408954,27,11,CASTROVERDE


In [7]:
#We save the data for the time series prediction
data_path="data/fires-time-series.xlsx"
try:
    fires.to_excel(data_path,index=False)
except Exception as error:
    print(f"Error while exporting the data to the excel file: {error}")

Error while exporting the data to the excel file: No module named 'openpyxl'


In [8]:
data_path="data/fires-time-series.xlsx"
try:
    fires_time_series=pd.read_excel(data_path)
except Exception as error:
    print(f"Error while importing the excel file: {error}")
fires.head()

Error while importing the excel file: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.


Unnamed: 0,superficie,fecha,lat,lng,idprovincia,idmunicipio,municipio
39703,20.0,1980-09-18,42.428281,-6.914337,32,9,"BARCO DE VALDEORRAS, O"
54407,2.0,1983-01-16,42.542185,-8.449205,36,12,COTOBADE
54410,6.0,1983-01-16,42.102572,-8.41592,36,34,"NEVES, AS"
54415,3.0,1983-01-17,43.629834,-7.367642,27,19,FOZ
54417,40.0,1983-01-18,43.018968,-7.408954,27,11,CASTROVERDE


### Añadir el tiempo para la prediccion con variables exogenas

#### Filtramos las fechas que sean mas antiguas del 2005

In [9]:
#Solo tenemos informacion a  partir del 2005 de meteorologia
#convertimos las fechas a tipo datetime (para una mejor compatibilidad)
fires['fecha'] = pd.to_datetime(fires['fecha'], errors='coerce')
#filtrar fechas que empiezen a partir del 2005 hasta el 2018
fires = fires.loc[fires['fecha'] >= '2005-01-01']
fires.head()

Unnamed: 0,superficie,fecha,lat,lng,idprovincia,idmunicipio,municipio
221698,5.0,2005-01-06,42.011805,-8.026783,32,41,LOBEIRA
221705,1.0,2005-01-06,42.525157,-8.123135,32,35,"IRIXO, O"
221709,1.0,2005-01-06,42.33853,-6.787814,32,17,CARBALLEDA DE VALDEORRAS
221713,1.2,2005-01-07,42.113543,-8.180216,32,56,PADRENDA
221718,1.5,2005-01-07,42.304143,-8.241844,32,46,MELÓN


In [54]:
def normalize_header(header):
    replacements = {
        'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u',
        'ü': 'u', 'ñ': 'n', 'Á': 'A', 'É': 'E', 'Í': 'I',
        'Ó': 'O', 'Ú': 'U', 'Ü': 'U', 'Ñ': 'N'
    }
    for spanish_char, english_char in replacements.items():
        header = header.replace(spanish_char, english_char)
    return header

#### Juntamos todos los datos en un unico csv por estacion meteo


In [None]:
#Definimos las estaciones de meteorologia
# stations=["C01_A Capela_","C02_Boimorto_","LU01_Castro de Rei_","LU02_Monforte de Lemos_"]
stations=["C01_A Capela_"]
station_data = {station: pd.DataFrame() for station in stations}

dataset_all = pd.DataFrame()

#Iteramos por todos los años que tenemos disponibles de datos meteorologicos
for year in range(2009,2019):
    #Actualizamos el data path
    data_path=f"data/{year}_{year+1}/"
    #Por cada estacion lo abrimos y lo juntamos
    for station in stations:
        try:
            station_path=data_path+f"{station}01_01_{year}_01_01_{year+1}.csv"
            # if year in [2005, 2006, 2007, 2008]:
            #     with open(station_path, 'r', encoding='utf-8') as file:
            #         headers = file.readline().strip().split(',')
            #         normalized_headers = [normalize_header(header) for header in headers]
            aux = pd.read_csv(station_path, encoding="utf-16")
            aux.columns = [unidecode(col) for col in aux.columns]
            # aux = aux[["Fecha", "Temp Media (ºC)", "Temp Max (ºC)", "Temp Mínima (ºC)" , "Humedad Media (%)", "Humedad Max (%)", "Humedad Min (%)", "Velviento (m/s)","DirViento (º)", "VelVientoMax (m/s)", "Radiación (MJ/m2)", "Precipitación (mm)"]]

            dataset_all = pd.concat([dataset_all, aux], ignore_index=True)

            #Guardamos en station data
            station_data[station]=pd.concat([station_data[station],aux],ignore_index=True)
            
        except pd.errors.ParserError as parse_error:
            print(f"Parser error when reading {station_path}: {parse_error}")
        except FileNotFoundError as file_error:
            print(f"File not found: {station_path}")
        except Exception as general_error:
            print(f"General error occurred while reading the file: {general_error}")
    print(f"Datos cargados para del año {year}")



General error occurred while reading the file: "['Temp Media (ºC)', 'Temp Max (ºC)', 'Temp Mínima (ºC)', 'DirViento (º)', 'Radiación (MJ/m2)', 'Precipitación (mm)'] not in index"
Datos cargados para del año 2009
General error occurred while reading the file: "['Temp Media (ºC)', 'Temp Max (ºC)', 'Temp Mínima (ºC)', 'DirViento (º)', 'Radiación (MJ/m2)', 'Precipitación (mm)'] not in index"
Datos cargados para del año 2010
General error occurred while reading the file: "['Temp Media (ºC)', 'Temp Max (ºC)', 'Temp Mínima (ºC)', 'DirViento (º)', 'Radiación (MJ/m2)', 'Precipitación (mm)'] not in index"
Datos cargados para del año 2011
General error occurred while reading the file: "['Temp Media (ºC)', 'Temp Max (ºC)', 'Temp Mínima (ºC)', 'DirViento (º)', 'Radiación (MJ/m2)', 'Precipitación (mm)'] not in index"
Datos cargados para del año 2012
General error occurred while reading the file: "['Temp Media (ºC)', 'Temp Max (ºC)', 'Temp Mínima (ºC)', 'DirViento (º)', 'Radiación (MJ/m2)', 'Precipit

In [58]:
dataset_all.to_csv("archivo_exportado.csv", index=False)

In [40]:
station_data["C01_A Capela_"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3643 entries, 0 to 3642
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   IdProvincia             3643 non-null   int64  
 1   IdEstacion              3643 non-null   int64  
 2   Fecha                   3643 non-null   object 
 3   Ano                     3643 non-null   int64  
 4   Dia                     3643 non-null   int64  
 5   Temp Media (oC)         3638 non-null   float64
 6   Temp Max (oC)           3634 non-null   float64
 7   Hora Temp Max           3635 non-null   object 
 8   Temp Minima (oC)        3631 non-null   float64
 9   Hora Temp Min           3635 non-null   object 
 10  Humedad Media (%)       3627 non-null   float64
 11  Humedad Max (%)         3623 non-null   float64
 12  Hora Hum Max            3626 non-null   object 
 13  Humedad Min (%)         3619 non-null   float64
 14  Hora Hum Min            3626 non-null   

In [None]:

# Ahora, para cada estación, concatenamos todos los DataFrames y los guardamos en un archivo CSV único
for station, dataframes in station_data.items():
    try:
        # Concatenar todos los DataFrames de la estación
        compiled_df = pd.concat(dataframes, ignore_index=True)
        
        # Guardar el archivo final concatenado para la estación
        output_filename = f"data/estaciones/{station}01_01_2005_01_01_2019.csv"
        compiled_df.to_csv(output_filename, index=False)
        print(f"Archivo guardado para {station}: {output_filename}")
    
    except Exception as error:
        print(f"Error al guardar los datos para {station}: {error}")

Archivo guardado para C01_A Capela_: data/estaciones/C01_A Capela_01_01_2005_01_01_2019.csv
Archivo guardado para C02_Boimorto_: data/estaciones/C02_Boimorto_01_01_2005_01_01_2019.csv
Archivo guardado para LU01_Castro de Rei_: data/estaciones/LU01_Castro de Rei_01_01_2005_01_01_2019.csv
Archivo guardado para LU02_Monforte de Lemos_: data/estaciones/LU02_Monforte de Lemos_01_01_2005_01_01_2019.csv


In [None]:
fires.isna().sum()

superficie     0
fecha          0
lat            0
lng            0
idprovincia    0
idmunicipio    0
municipio      5
dtype: int64

In [None]:
fires.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15085 entries, 221698 to 281994
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   superficie   15085 non-null  float64       
 1   fecha        15085 non-null  datetime64[ns]
 2   lat          15085 non-null  float64       
 3   lng          15085 non-null  float64       
 4   idprovincia  15085 non-null  int64         
 5   idmunicipio  15085 non-null  int64         
 6   municipio    15080 non-null  object        
dtypes: datetime64[ns](1), float64(3), int64(2), object(1)
memory usage: 942.8+ KB


In [None]:
#We save the data with the weather
data_path="data/fires-weather.xlsx"
try:
    fires.to_excel(data_path,index=False)
except Exception as error:
    print(f"ERror while exporting the data to the excel file: {error}")

In [None]:
#This si done to avoid repeating teh fetching of weather that is an intensive task
data_path="data/fires-weather.xlsx"
try:
    fires=pd.read_excel(data_path)
except Exception as error:
    print(f"Error while importing the excel file: {error}")
fires.head()

Unnamed: 0,superficie,fecha,lat,lng,idprovincia,idmunicipio,municipio
0,5.0,2005-01-06,42.011805,-8.026783,32,41,LOBEIRA
1,1.0,2005-01-06,42.525157,-8.123135,32,35,"IRIXO, O"
2,1.0,2005-01-06,42.33853,-6.787814,32,17,CARBALLEDA DE VALDEORRAS
3,1.2,2005-01-07,42.113543,-8.180216,32,56,PADRENDA
4,1.5,2005-01-07,42.304143,-8.241844,32,46,MELÓN


### Group by weeks

In [None]:
#TODO 

### Split train and test data

In [None]:
#We split 80% of the data for training the model
train_split=80


## Train models

### Analyze train data set

In [None]:
#TODO

### Time series

In [None]:
#TODO

### Con variables exogenas

In [None]:
#TODO

## Performances

In [None]:
#TODO

## Test Model


In [None]:
#TODO