## Data cleaning

### Set up

We first import the different libraries that we will be using for this project

In [1]:
import pandas as pd
from datetime import datetime 
from meteostat import Point, Daily, Stations

We import our dataset

In [2]:
data_path="data/fires-all.csv"
try:
    fires=pd.read_csv(data_path)
except Exception as error:
    print(f"Error while importing the excel file: {error}")
fires.head()

Unnamed: 0,id,superficie,fecha,lat,lng,latlng_explicit,idcomunidad,idprovincia,idmunicipio,municipio,...,causa_supuesta,causa_desc,muertos,heridos,time_ctrl,time_ext,personal,medios,gastos,perdidas
0,1968290001,14.0,1968-01-01,,,0,4,29,0,INDETERMINADO,...,1,40,0,0,0,360,0,0,0,0
1,1968430003,3.0,1968-01-03,,,0,2,43,0,INDETERMINADO,...,1,0,0,0,0,60,0,0,0,0
2,1968290006,2.0,1968-01-06,,,0,4,29,0,INDETERMINADO,...,1,0,0,0,0,120,0,0,0,0
3,1968430016,600.0,1968-01-07,,,0,2,43,0,INDETERMINADO,...,1,20,0,0,0,1440,35,1,0,0
4,1968120007,8.2,1968-01-07,,,0,9,12,0,INDETERMINADO,...,1,20,0,0,0,120,0,0,0,0


We analyze the data and observe the type of data on each column and how many nulls values we have

In [3]:
fires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284589 entries, 0 to 284588
Data columns (total 21 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               284589 non-null  int64  
 1   superficie       284589 non-null  float64
 2   fecha            284589 non-null  object 
 3   lat              230154 non-null  float64
 4   lng              230154 non-null  float64
 5   latlng_explicit  284589 non-null  int64  
 6   idcomunidad      284589 non-null  int64  
 7   idprovincia      284589 non-null  int64  
 8   idmunicipio      284589 non-null  int64  
 9   municipio        284576 non-null  object 
 10  causa            284589 non-null  int64  
 11  causa_supuesta   284589 non-null  int64  
 12  causa_desc       284589 non-null  int64  
 13  muertos          284589 non-null  int64  
 14  heridos          284589 non-null  int64  
 15  time_ctrl        284589 non-null  int64  
 16  time_ext         284589 non-null  int6

### Remove unnecesary columns

In [4]:
#drop
try:
    fires.drop(["id","causa","causa_supuesta","causa_desc",
                "muertos","heridos","time_ctrl","time_ext",
                "personal","medios","gastos","perdidas",
                "latlng_explicit"],axis=1, inplace=True)
except Exception as error:
    print(f"ERROR while droping the columns {error}")
fires.head()

Unnamed: 0,superficie,fecha,lat,lng,idcomunidad,idprovincia,idmunicipio,municipio
0,14.0,1968-01-01,,,4,29,0,INDETERMINADO
1,3.0,1968-01-03,,,2,43,0,INDETERMINADO
2,2.0,1968-01-06,,,4,29,0,INDETERMINADO
3,600.0,1968-01-07,,,2,43,0,INDETERMINADO
4,8.2,1968-01-07,,,9,12,0,INDETERMINADO


### Remove unnecesary rows

In [5]:
#We only want the fires of the comunidad 3 (Galicia)
fires=fires[fires["idcomunidad"]==3]
#We drop null values on lat and lng, beacsue we cannot find where the fire happend,
#  and it is older data from  1968
fires=fires.dropna(subset=["lat","lng"])
#We remove the column idcomunidad as it is no longer need it
fires.drop(["idcomunidad"],axis=1,inplace=True)
#print how many null values are for each column
fires.isna().sum(),
#TODO: from the coordinates lat and lng obtain the blanks municipio

(superficie     0
 fecha          0
 lat            0
 lng            0
 idprovincia    0
 idmunicipio    0
 municipio      5
 dtype: int64,)

In [6]:
fires.head()

Unnamed: 0,superficie,fecha,lat,lng,idprovincia,idmunicipio,municipio
39703,20.0,1980-09-18,42.428281,-6.914337,32,9,"BARCO DE VALDEORRAS, O"
54407,2.0,1983-01-16,42.542185,-8.449205,36,12,COTOBADE
54410,6.0,1983-01-16,42.102572,-8.41592,36,34,"NEVES, AS"
54415,3.0,1983-01-17,43.629834,-7.367642,27,19,FOZ
54417,40.0,1983-01-18,43.018968,-7.408954,27,11,CASTROVERDE


### Add weather data

In [7]:
# Documentation (https://dev.meteostat.net/python/daily.html#example)
# We initialize the columns for the data
fires['wind_speed'] = None
fires['wind_gust'] = None
fires['wind_direction'] = None
fires['temperature'] = None
fires['pressure'] = None
fires['precipitation'] = None
#We convert the date fecha into datetime, to avoid warnings from pandas
fires['fecha'] = pd.to_datetime(fires['fecha'], errors='coerce')

In [None]:
from tqdm import tqdm
#It iterates each row, but it also adds a progress bar to know in what percentage we are
for i, row in tqdm(fires.head(20).iterrows(), total=fires.shape[0], desc="Fetching weather data", unit="row"):
#for i, row in fires.iterrows():
    # Set up the location (lat, lng) and date based on each row's data
    location = Point(row['lat'], row['lng'])

    date = row['fecha']
    #stations = Stations().nearby(row['lat'], row['lng'])
    #stations = stations.fetch(1)

    
    # Get daily weather data for the specific date and location
    weather_data = Daily(location, date, date)
    weather_data = weather_data.fetch()
    
    # Check if weather data is available for the date and location
    if not weather_data.empty:
        # Populate the row with weather data (using iloc[0] to get the first row of the result)
        fires.at[i, 'wind_speed'] = weather_data['wspd'].mean()
        fires.at[i, 'wind_gust'] = weather_data['wpgt'].mean() if 'wpgt' in weather_data else None
        fires.at[i, 'wind_direction'] = weather_data['wdir'].mean()
        fires.at[i, 'temperature'] = weather_data['tavg'].mean()
        fires.at[i, 'pressure'] = weather_data['pres'].mean()
        fires.at[i, 'precipitation'] = weather_data['prcp'].mean()

# Display the DataFrame with added weather columns
fires.head()


Fetching weather data:   0%|          | 2/81082 [00:00<3:10:48,  7.08row/s]

             name country region    wmo  icao  latitude  longitude  elevation  \
id                                                                              
08053  Ponferrada      ES     CL  08053  <NA>   42.5333    -6.5833      550.0   

            timezone hourly_start hourly_end daily_start  daily_end  \
id                                                                    
08053  Europe/Madrid          NaT        NaT  1951-01-01 2024-10-29   

      monthly_start monthly_end      distance  
id                                             
08053    1951-01-01  2022-01-01  29552.270115  
             name country region    wmo  icao  latitude  longitude  elevation  \
id                                                                              
08053  Ponferrada      ES     CL  08053  <NA>   42.5333    -6.5833      550.0   

            timezone hourly_start hourly_end daily_start  daily_end  \
id                                                                    
08053  Europ

Fetching weather data:   0%|          | 3/81082 [00:00<4:17:45,  5.24row/s]

                  name country region    wmo  icao  latitude  longitude  \
id                                                                        
08045  Vigo / Peinador      ES     GA  08045  LEVX   42.2167    -8.6333   

       elevation       timezone hourly_start hourly_end daily_start  \
id                                                                    
08045      264.0  Europe/Madrid   1973-01-01 2024-11-06  1956-05-01   

       daily_end monthly_start monthly_end      distance  
id                                                        
08045 2024-10-30    1956-01-01  2022-01-01  21956.680738  
                  name country region    wmo  icao  latitude  longitude  \
id                                                                        
08045  Vigo / Peinador      ES     GA  08045  LEVX   42.2167    -8.6333   

       elevation       timezone hourly_start hourly_end daily_start  \
id                                                                    
08045      264.

Fetching weather data:   0%|          | 6/81082 [00:00<3:10:47,  7.08row/s]

               name country region    wmo  icao  latitude  longitude  \
id                                                                     
08008  Lugo / Rozas      ES     GA  08008  <NA>   43.1167      -7.45   

       elevation       timezone hourly_start hourly_end daily_start  \
id                                                                    
08008      445.0  Europe/Madrid          NaT        NaT  1951-01-01   

       daily_end monthly_start monthly_end      distance  
id                                                        
08008 2024-10-30    1951-01-01  2022-01-01  11367.338974  
               name country region    wmo  icao  latitude  longitude  \
id                                                                     
08008  Lugo / Rozas      ES     GA  08008  <NA>   43.1167      -7.45   

       elevation       timezone hourly_start hourly_end daily_start  \
id                                                                    
08008      445.0  Europe/Madrid  

Fetching weather data:   0%|          | 7/81082 [00:01<3:20:46,  6.73row/s]

                  name country region    wmo  icao  latitude  longitude  \
id                                                                        
08045  Vigo / Peinador      ES     GA  08045  LEVX   42.2167    -8.6333   

       elevation       timezone hourly_start hourly_end daily_start  \
id                                                                    
08045      264.0  Europe/Madrid   1973-01-01 2024-11-06  1956-05-01   

       daily_end monthly_start monthly_end      distance  
id                                                        
08045 2024-10-30    1956-01-01  2022-01-01  13120.156438  
                  name country region    wmo  icao  latitude  longitude  \
id                                                                        
08045  Vigo / Peinador      ES     GA  08045  LEVX   42.2167    -8.6333   

       elevation       timezone hourly_start hourly_end daily_start  \
id                                                                    
08045      264.

Fetching weather data:   0%|          | 9/81082 [00:01<3:12:02,  7.04row/s]

             name country region    wmo  icao  latitude  longitude  elevation  \
id                                                                              
08044  Pontevedra      ES     GA  08044  <NA>   42.4333    -8.6167      108.0   

            timezone hourly_start hourly_end daily_start  daily_end  \
id                                                                    
08044  Europe/Madrid          NaT        NaT  1975-01-12 2024-10-29   

      monthly_start monthly_end      distance  
id                                             
08044    1982-01-01  2022-01-01  22053.396188  
             name country region    wmo  icao  latitude  longitude  elevation  \
id                                                                              
08044  Pontevedra      ES     GA  08044  <NA>   42.4333    -8.6167      108.0   

            timezone hourly_start hourly_end daily_start  daily_end  \
id                                                                    
08044  Europ

Fetching weather data:   0%|          | 12/81082 [00:01<3:05:21,  7.29row/s]

             name country region    wmo  icao  latitude  longitude  elevation  \
id                                                                              
08053  Ponferrada      ES     CL  08053  <NA>   42.5333    -6.5833      550.0   

            timezone hourly_start hourly_end daily_start  daily_end  \
id                                                                    
08053  Europe/Madrid          NaT        NaT  1951-01-01 2024-10-29   

      monthly_start monthly_end     distance  
id                                            
08053    1951-01-01  2022-01-01  54730.90714  
             name country region    wmo  icao  latitude  longitude  elevation  \
id                                                                              
08053  Ponferrada      ES     CL  08053  <NA>   42.5333    -6.5833      550.0   

            timezone hourly_start hourly_end daily_start  daily_end  \
id                                                                    
08053  Europe/M

Fetching weather data:   0%|          | 13/81082 [00:01<3:08:28,  7.17row/s]

                  name country region    wmo  icao  latitude  longitude  \
id                                                                        
08045  Vigo / Peinador      ES     GA  08045  LEVX   42.2167    -8.6333   

       elevation       timezone hourly_start hourly_end daily_start  \
id                                                                    
08045      264.0  Europe/Madrid   1973-01-01 2024-11-06  1956-05-01   

       daily_end monthly_start monthly_end      distance  
id                                                        
08045 2024-10-30    1956-01-01  2022-01-01  28656.768155  
                  name country region    wmo  icao  latitude  longitude  \
id                                                                        
08045  Vigo / Peinador      ES     GA  08045  LEVX   42.2167    -8.6333   

       elevation       timezone hourly_start hourly_end daily_start  \
id                                                                    
08045      264.

Fetching weather data:   0%|          | 16/81082 [00:02<2:45:08,  8.18row/s]

                       name country region    wmo  icao  latitude  longitude  \
id                                                                             
08042  Santiago / Labacolla      ES     GA  08042  LEST      42.9    -8.4333   

       elevation       timezone hourly_start hourly_end daily_start  \
id                                                                    
08042      370.0  Europe/Madrid   1973-01-01 2024-11-06  1943-11-01   

       daily_end monthly_start monthly_end      distance  
id                                                        
08042 2024-10-30    1943-01-01  2022-01-01  39792.930537  
                       name country region    wmo  icao  latitude  longitude  \
id                                                                             
08042  Santiago / Labacolla      ES     GA  08042  LEST      42.9    -8.4333   

       elevation       timezone hourly_start hourly_end daily_start  \
id                                                      

Fetching weather data:   0%|          | 18/81082 [00:02<2:47:39,  8.06row/s]

                  name country region    wmo  icao  latitude  longitude  \
id                                                                        
08045  Vigo / Peinador      ES     GA  08045  LEVX   42.2167    -8.6333   

       elevation       timezone hourly_start hourly_end daily_start  \
id                                                                    
08045      264.0  Europe/Madrid   1973-01-01 2024-11-06  1956-05-01   

       daily_end monthly_start monthly_end      distance  
id                                                        
08045 2024-10-30    1956-01-01  2022-01-01  18376.410943  
                  name country region    wmo  icao  latitude  longitude  \
id                                                                        
08045  Vigo / Peinador      ES     GA  08045  LEVX   42.2167    -8.6333   

       elevation       timezone hourly_start hourly_end daily_start  \
id                                                                    
08045      264.

Fetching weather data:   0%|          | 20/81082 [00:02<3:05:32,  7.28row/s]

             name country region    wmo  icao  latitude  longitude  elevation  \
id                                                                              
08053  Ponferrada      ES     CL  08053  <NA>   42.5333    -6.5833      550.0   

            timezone hourly_start hourly_end daily_start  daily_end  \
id                                                                    
08053  Europe/Madrid          NaT        NaT  1951-01-01 2024-10-29   

      monthly_start monthly_end     distance  
id                                            
08053    1951-01-01  2022-01-01  40304.35179  
             name country region    wmo  icao  latitude  longitude  elevation  \
id                                                                              
08053  Ponferrada      ES     CL  08053  <NA>   42.5333    -6.5833      550.0   

            timezone hourly_start hourly_end daily_start  daily_end  \
id                                                                    
08053  Europe/M




Unnamed: 0,superficie,fecha,lat,lng,idprovincia,idmunicipio,municipio,wind_speed,wind_gust,wind_direction,temperature,pressure,precipitation
39703,20.0,1980-09-18,42.428281,-6.914337,32,9,"BARCO DE VALDEORRAS, O",,,,19.6,,0.0
54407,2.0,1983-01-16,42.542185,-8.449205,36,12,COTOBADE,,,,12.1,,
54410,6.0,1983-01-16,42.102572,-8.41592,36,34,"NEVES, AS",,,,9.1,,0.0
54415,3.0,1983-01-17,43.629834,-7.367642,27,19,FOZ,,,,,,
54417,40.0,1983-01-18,43.018968,-7.408954,27,11,CASTROVERDE,,,,4.2,,0.0


: 

In [None]:
fires.isna().sum()

In [None]:
fires.info()

In [None]:
#We save the data with the weather
data_path="data/fires-weather.xlsx"
try:
    fires.to_excel(data_path,index=False)
except Exception as error:
    print(f"ERror while exporting the data to the excel file: {error}")

In [None]:
#This si done to avoid repeating teh fetching of weather that is an intensive task
data_path="data/fires-weather.xlsx"
try:
    fires=pd.read_excel(data_path)
except Exception as error:
    print(f"Error while importing the excel file: {error}")
fires.head()

### Group by weeks

In [None]:
#TODO 

### Split train and test data

In [None]:
#We split 80% of the data for training the model
train_split=80


## Analyze data

In [None]:
#TODO

## Train models

In [None]:
#TODO

## Performances

In [None]:
#TODO

## Test Model


In [None]:
#TODO