## Data cleaning

### Set up

We first import the different libraries that we will be using for this project

In [1]:
import pandas as pd
from datetime import datetime 
from meteostat import Point, Daily, Stations

We import our dataset

In [2]:
data_path="data/fires-all.csv"
try:
    fires=pd.read_csv(data_path)
except Exception as error:
    print(f"Error while importing the excel file: {error}")
fires.head()

Unnamed: 0,id,superficie,fecha,lat,lng,latlng_explicit,idcomunidad,idprovincia,idmunicipio,municipio,...,causa_supuesta,causa_desc,muertos,heridos,time_ctrl,time_ext,personal,medios,gastos,perdidas
0,1968290001,14.0,1968-01-01,,,0,4,29,0,INDETERMINADO,...,1,40,0,0,0,360,0,0,0,0
1,1968430003,3.0,1968-01-03,,,0,2,43,0,INDETERMINADO,...,1,0,0,0,0,60,0,0,0,0
2,1968290006,2.0,1968-01-06,,,0,4,29,0,INDETERMINADO,...,1,0,0,0,0,120,0,0,0,0
3,1968430016,600.0,1968-01-07,,,0,2,43,0,INDETERMINADO,...,1,20,0,0,0,1440,35,1,0,0
4,1968120007,8.2,1968-01-07,,,0,9,12,0,INDETERMINADO,...,1,20,0,0,0,120,0,0,0,0


We analyze the data and observe the type of data on each column and how many nulls values we have

In [3]:
fires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284589 entries, 0 to 284588
Data columns (total 21 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               284589 non-null  int64  
 1   superficie       284589 non-null  float64
 2   fecha            284589 non-null  object 
 3   lat              230154 non-null  float64
 4   lng              230154 non-null  float64
 5   latlng_explicit  284589 non-null  int64  
 6   idcomunidad      284589 non-null  int64  
 7   idprovincia      284589 non-null  int64  
 8   idmunicipio      284589 non-null  int64  
 9   municipio        284576 non-null  object 
 10  causa            284589 non-null  int64  
 11  causa_supuesta   284589 non-null  int64  
 12  causa_desc       284589 non-null  int64  
 13  muertos          284589 non-null  int64  
 14  heridos          284589 non-null  int64  
 15  time_ctrl        284589 non-null  int64  
 16  time_ext         284589 non-null  int6

### Remove unnecesary columns

In [4]:
#drop
try:
    fires.drop(["id","causa","causa_supuesta","causa_desc",
                "muertos","heridos","time_ctrl","time_ext",
                "personal","medios","gastos","perdidas",
                "latlng_explicit"],axis=1, inplace=True)
except Exception as error:
    print(f"ERROR while droping the columns {error}")
fires.head()

Unnamed: 0,superficie,fecha,lat,lng,idcomunidad,idprovincia,idmunicipio,municipio
0,14.0,1968-01-01,,,4,29,0,INDETERMINADO
1,3.0,1968-01-03,,,2,43,0,INDETERMINADO
2,2.0,1968-01-06,,,4,29,0,INDETERMINADO
3,600.0,1968-01-07,,,2,43,0,INDETERMINADO
4,8.2,1968-01-07,,,9,12,0,INDETERMINADO


### Remove unnecesary rows

In [5]:
#We only want the fires of the comunidad 3 (Galicia)
fires=fires[fires["idcomunidad"]==3]
#We drop null values on lat and lng, beacsue we cannot find where the fire happend,
#  and it is older data from  1968
fires=fires.dropna(subset=["lat","lng"])
#We remove the column idcomunidad as it is no longer need it
fires.drop(["idcomunidad"],axis=1,inplace=True)
#print how many null values are for each column
fires.isna().sum(),
#TODO: from the coordinates lat and lng obtain the blanks municipio

(superficie     0
 fecha          0
 lat            0
 lng            0
 idprovincia    0
 idmunicipio    0
 municipio      5
 dtype: int64,)

In [6]:
fires.head()

Unnamed: 0,superficie,fecha,lat,lng,idprovincia,idmunicipio,municipio
39703,20.0,1980-09-18,42.428281,-6.914337,32,9,"BARCO DE VALDEORRAS, O"
54407,2.0,1983-01-16,42.542185,-8.449205,36,12,COTOBADE
54410,6.0,1983-01-16,42.102572,-8.41592,36,34,"NEVES, AS"
54415,3.0,1983-01-17,43.629834,-7.367642,27,19,FOZ
54417,40.0,1983-01-18,43.018968,-7.408954,27,11,CASTROVERDE


In [None]:
#We save the data for the time series prediction
data_path="data/fires-time-series.xlsx"
try:
    fires.to_excel(data_path,index=False)
except Exception as error:
    print(f"Error while exporting the data to the excel file: {error}")

In [8]:
#This si done to avoid repeating the fetching of weather that is an intensive task
data_path="data/fires-time-series.xlsx"
try:
    fires_time_series=pd.read_excel(data_path)
except Exception as error:
    print(f"Error while importing the excel file: {error}")
fires.head()

Unnamed: 0,superficie,fecha,lat,lng,idprovincia,idmunicipio,municipio
39703,20.0,1980-09-18,42.428281,-6.914337,32,9,"BARCO DE VALDEORRAS, O"
54407,2.0,1983-01-16,42.542185,-8.449205,36,12,COTOBADE
54410,6.0,1983-01-16,42.102572,-8.41592,36,34,"NEVES, AS"
54415,3.0,1983-01-17,43.629834,-7.367642,27,19,FOZ
54417,40.0,1983-01-18,43.018968,-7.408954,27,11,CASTROVERDE


### Añadir el tiempo para la prediccion con variables exogenas

#### Filtramos las fechas que sean mas antiguas que el 2005

In [None]:
#Solo tenemos informacion a  partir del 2005 de meteorologia
#convertimos las fechas a tipo datetime (para una mejor compatibilidad)
fires['fecha'] = pd.to_datetime(fires['fecha'], errors='coerce')
#filtrar fechas que empiezen a partir del 2005 hasta el 2018
fires = fires.loc[fires['fecha'] >= '2005-01-01']

Unnamed: 0,superficie,fecha,lat,lng,idprovincia,idmunicipio,municipio
14157,17.00,2017-01-01,42.184347,-6.966490,32,86,VIANA DO BOLO
14158,1.83,2017-01-01,42.076545,-7.235691,32,92,VILARIÑO DE CONSO
14159,9.21,2017-01-01,42.557333,-8.176331,32,35,"IRIXO, O"
14160,2.00,2017-01-01,42.329344,-7.349575,32,29,CHANDREXA DE QUEIXA
14161,5.00,2017-01-01,42.184953,-6.963056,32,86,VIANA DO BOLO
...,...,...,...,...,...,...,...
15080,8.04,2018-10-28,41.888386,-7.378586,32,91,VILARDEVÓS
15081,3.00,2018-11-17,42.106457,-7.395366,32,39,LAZA
15082,3.09,2018-12-09,42.005198,-7.011511,32,48,"MEZQUITA, A"
15083,36.04,2018-12-30,42.168767,-7.011685,32,86,VIANA DO BOLO


In [10]:
"""from tqdm import tqdm
#It iterates each row, but it also adds a progress bar to know in what percentage we are
for i, row in tqdm(fires.head(20).iterrows(), total=fires.shape[0], desc="Fetching weather data", unit="row"):
#for i, row in fires.iterrows():
    # Set up the location (lat, lng) and date based on each row's data
    location = Point(row['lat'], row['lng'])

    date = row['fecha']
    #stations = Stations().nearby(row['lat'], row['lng'])
    #stations = stations.fetch(1)

    
    # Get daily weather data for the specific date and location
    weather_data = Daily(location, date, date)
    weather_data = weather_data.fetch()
    
    # Check if weather data is available for the date and location
    if not weather_data.empty:
        # Populate the row with weather data (using iloc[0] to get the first row of the result)
        fires.at[i, 'wind_speed'] = weather_data['wspd'].mean()
        fires.at[i, 'wind_gust'] = weather_data['wpgt'].mean() if 'wpgt' in weather_data else None
        fires.at[i, 'wind_direction'] = weather_data['wdir'].mean()
        fires.at[i, 'temperature'] = weather_data['tavg'].mean()
        fires.at[i, 'pressure'] = weather_data['pres'].mean()
        fires.at[i, 'precipitation'] = weather_data['prcp'].mean()

# Display the DataFrame with added weather columns
fires.head()"""


'from tqdm import tqdm\n#It iterates each row, but it also adds a progress bar to know in what percentage we are\nfor i, row in tqdm(fires.head(20).iterrows(), total=fires.shape[0], desc="Fetching weather data", unit="row"):\n#for i, row in fires.iterrows():\n    # Set up the location (lat, lng) and date based on each row\'s data\n    location = Point(row[\'lat\'], row[\'lng\'])\n\n    date = row[\'fecha\']\n    #stations = Stations().nearby(row[\'lat\'], row[\'lng\'])\n    #stations = stations.fetch(1)\n\n    \n    # Get daily weather data for the specific date and location\n    weather_data = Daily(location, date, date)\n    weather_data = weather_data.fetch()\n    \n    # Check if weather data is available for the date and location\n    if not weather_data.empty:\n        # Populate the row with weather data (using iloc[0] to get the first row of the result)\n        fires.at[i, \'wind_speed\'] = weather_data[\'wspd\'].mean()\n        fires.at[i, \'wind_gust\'] = weather_data[\'wpgt

In [11]:
fires.isna().sum()

superficie     0
fecha          0
lat            0
lng            0
idprovincia    0
idmunicipio    0
municipio      5
dtype: int64

In [12]:
fires.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15085 entries, 221698 to 281994
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   superficie   15085 non-null  float64       
 1   fecha        15085 non-null  datetime64[ns]
 2   lat          15085 non-null  float64       
 3   lng          15085 non-null  float64       
 4   idprovincia  15085 non-null  int64         
 5   idmunicipio  15085 non-null  int64         
 6   municipio    15080 non-null  object        
dtypes: datetime64[ns](1), float64(3), int64(2), object(1)
memory usage: 942.8+ KB


In [13]:
#We save the data with the weather
data_path="data/fires-weather.xlsx"
try:
    fires.to_excel(data_path,index=False)
except Exception as error:
    print(f"ERror while exporting the data to the excel file: {error}")

In [14]:
#This si done to avoid repeating teh fetching of weather that is an intensive task
data_path="data/fires-weather.xlsx"
try:
    fires=pd.read_excel(data_path)
except Exception as error:
    print(f"Error while importing the excel file: {error}")
fires.head()

Unnamed: 0,superficie,fecha,lat,lng,idprovincia,idmunicipio,municipio
0,5.0,2005-01-06,42.011805,-8.026783,32,41,LOBEIRA
1,1.0,2005-01-06,42.525157,-8.123135,32,35,"IRIXO, O"
2,1.0,2005-01-06,42.33853,-6.787814,32,17,CARBALLEDA DE VALDEORRAS
3,1.2,2005-01-07,42.113543,-8.180216,32,56,PADRENDA
4,1.5,2005-01-07,42.304143,-8.241844,32,46,MELÓN


### Group by weeks

In [15]:
#TODO 

### Split train and test data

In [16]:
#We split 80% of the data for training the model
train_split=80


## Analyze data

In [17]:
#TODO

## Train models

### Time series

In [18]:
#TODO

### Con variables exogenas

## Performances

In [19]:
#TODO

## Test Model


In [20]:
#TODO