## Data cleaning

### Set up

We first import the different libraries that we will be using for this project

In [23]:
import pandas as pd
from datetime import datetime 
from meteostat import Point, Daily

We import our dataset

In [24]:
data_path="data/fires-all.csv"
try:
    fires=pd.read_csv(data_path)
except Exception as error:
    print(f"Error while importing the excel file: {error}")
fires.head()

Unnamed: 0,id,superficie,fecha,lat,lng,latlng_explicit,idcomunidad,idprovincia,idmunicipio,municipio,...,causa_supuesta,causa_desc,muertos,heridos,time_ctrl,time_ext,personal,medios,gastos,perdidas
0,1968290001,14.0,1968-01-01,,,0,4,29,0,INDETERMINADO,...,1,40,0,0,0,360,0,0,0,0
1,1968430003,3.0,1968-01-03,,,0,2,43,0,INDETERMINADO,...,1,0,0,0,0,60,0,0,0,0
2,1968290006,2.0,1968-01-06,,,0,4,29,0,INDETERMINADO,...,1,0,0,0,0,120,0,0,0,0
3,1968430016,600.0,1968-01-07,,,0,2,43,0,INDETERMINADO,...,1,20,0,0,0,1440,35,1,0,0
4,1968120007,8.2,1968-01-07,,,0,9,12,0,INDETERMINADO,...,1,20,0,0,0,120,0,0,0,0


We analyze the data and observe the type of data on each column and how many nulls values we have

In [25]:
fires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284589 entries, 0 to 284588
Data columns (total 21 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               284589 non-null  int64  
 1   superficie       284589 non-null  float64
 2   fecha            284589 non-null  object 
 3   lat              230154 non-null  float64
 4   lng              230154 non-null  float64
 5   latlng_explicit  284589 non-null  int64  
 6   idcomunidad      284589 non-null  int64  
 7   idprovincia      284589 non-null  int64  
 8   idmunicipio      284589 non-null  int64  
 9   municipio        284576 non-null  object 
 10  causa            284589 non-null  int64  
 11  causa_supuesta   284589 non-null  int64  
 12  causa_desc       284589 non-null  int64  
 13  muertos          284589 non-null  int64  
 14  heridos          284589 non-null  int64  
 15  time_ctrl        284589 non-null  int64  
 16  time_ext         284589 non-null  int6

### Remove unnecesary columns

In [26]:
#drop
try:
    fires.drop(["id","causa","causa_supuesta","causa_desc",
                "muertos","heridos","time_ctrl","time_ext",
                "personal","medios","gastos","perdidas",
                "latlng_explicit"],axis=1, inplace=True)
except Exception as error:
    print(f"ERROR while droping the columns {error}")
fires.head()

Unnamed: 0,superficie,fecha,lat,lng,idcomunidad,idprovincia,idmunicipio,municipio
0,14.0,1968-01-01,,,4,29,0,INDETERMINADO
1,3.0,1968-01-03,,,2,43,0,INDETERMINADO
2,2.0,1968-01-06,,,4,29,0,INDETERMINADO
3,600.0,1968-01-07,,,2,43,0,INDETERMINADO
4,8.2,1968-01-07,,,9,12,0,INDETERMINADO


### Remove unnecesary rows

In [27]:
#We only want the fires of the comunidad 3 (Galicia)
fires=fires[fires["idcomunidad"]==3]
#We drop null values on lat and lng, beacsue we cannot find where the fire happend,
#  and it is older data from  1968
fires=fires.dropna(subset=["lat","lng"])
#We remove the column idcomunidad as it is no longer need it
fires.drop(["idcomunidad"],axis=1,inplace=True)
#print how many null values are for each column
fires.isna().sum(),
#TODO: from the coordinates lat and lng obtain the blanks municipio

(superficie     0
 fecha          0
 lat            0
 lng            0
 idprovincia    0
 idmunicipio    0
 municipio      5
 dtype: int64,)

In [28]:
fires.head()

Unnamed: 0,superficie,fecha,lat,lng,idprovincia,idmunicipio,municipio
39703,20.0,1980-09-18,42.428281,-6.914337,32,9,"BARCO DE VALDEORRAS, O"
54407,2.0,1983-01-16,42.542185,-8.449205,36,12,COTOBADE
54410,6.0,1983-01-16,42.102572,-8.41592,36,34,"NEVES, AS"
54415,3.0,1983-01-17,43.629834,-7.367642,27,19,FOZ
54417,40.0,1983-01-18,43.018968,-7.408954,27,11,CASTROVERDE


### Add weather data

In [29]:
# Documentation (https://dev.meteostat.net/python/daily.html#example)
# We initialize the columns for the data
fires['wind_speed'] = None
fires['wind_gust'] = None
fires['wind_direction'] = None
fires['temperature'] = None
fires['pressure'] = None
fires['precipitation'] = None
#We convert the date fecha into datetime, to avoid warnings from pandas
fires['fecha'] = pd.to_datetime(fires['fecha'], errors='coerce')

In [None]:
from tqdm import tqdm
#It iterates each row, but it also adds a progress bar to know in what percentage we are
for i, row in tqdm(fires.iterrows(), total=fires.shape[0], desc="Fetching weather data", unit="row"):
#for i, row in fires.iterrows():
    # Set up the location (lat, lng) and date based on each row's data
    location = Point(row['lat'], row['lng'])
    print(location)
    break
    date = row['fecha'] 
    
    # Get daily weather data for the specific date and location
    weather_data = Daily(location, date, date)
    weather_data = weather_data.fetch()
    
    # Check if weather data is available for the date and location
    if not weather_data.empty:
        # Populate the row with weather data (using iloc[0] to get the first row of the result)
        fires.at[i, 'wind_speed'] = weather_data['wspd'].iloc[0]
        fires.at[i, 'wind_gust'] = weather_data['wpgt'].iloc[0] if 'wpgt' in weather_data else None
        fires.at[i, 'wind_direction'] = weather_data['wdir'].iloc[0]
        fires.at[i, 'temperature'] = weather_data['tavg'].iloc[0]
        fires.at[i, 'pressure'] = weather_data['pres'].iloc[0]
        fires.at[i, 'precipitation'] = weather_data['prcp'].iloc[0]

# Display the DataFrame with added weather columns
fires.head()


SyntaxError: invalid syntax (4088417403.py, line 8)

In [None]:
fires.isna().sum()

superficie            0
fecha                 0
lat                   0
lng                   0
idprovincia           0
idmunicipio           0
municipio             5
wind_speed        70338
wind_gust         81050
wind_direction    76461
temperature       42334
pressure          77094
precipitation     35046
dtype: int64

In [None]:
fires.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81082 entries, 39703 to 281994
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   superficie      81082 non-null  float64       
 1   fecha           81082 non-null  datetime64[ns]
 2   lat             81082 non-null  float64       
 3   lng             81082 non-null  float64       
 4   idprovincia     81082 non-null  int64         
 5   idmunicipio     81082 non-null  int64         
 6   municipio       81077 non-null  object        
 7   wind_speed      10744 non-null  object        
 8   wind_gust       32 non-null     object        
 9   wind_direction  4621 non-null   object        
 10  temperature     38748 non-null  object        
 11  pressure        3988 non-null   object        
 12  precipitation   46036 non-null  object        
dtypes: datetime64[ns](1), float64(3), int64(2), object(7)
memory usage: 10.7+ MB


In [None]:
#We save the data with the weather
data_path="data/fires-weather.xlsx"
try:
    fires.to_excel(data_path,index=False)
except Exception as error:
    print(f"ERror while exporting the data to the excel file: {error}")

In [None]:
#This si done to avoid repeating teh fetching of weather that is an intensive task
data_path="data/fires-weather.xlsx"
try:
    fires=pd.read_excel(data_path)
except Exception as error:
    print(f"Error while importing the excel file: {error}")
fires.head()

Unnamed: 0,superficie,fecha,lat,lng,idprovincia,idmunicipio,municipio,wind_speed,wind_gust,wind_direction,temperature,pressure,precipitation
0,20.0,1980-09-18,42.428281,-6.914337,32,9,"BARCO DE VALDEORRAS, O",,,,19.6,,0.0
1,2.0,1983-01-16,42.542185,-8.449205,36,12,COTOBADE,,,,12.1,,
2,6.0,1983-01-16,42.102572,-8.41592,36,34,"NEVES, AS",,,,9.1,,0.0
3,3.0,1983-01-17,43.629834,-7.367642,27,19,FOZ,,,,,,
4,40.0,1983-01-18,43.018968,-7.408954,27,11,CASTROVERDE,,,,4.2,,0.0


### Group by weeks

In [None]:
#TODO 

### Split train and test data

In [None]:
#We split 80% of the data for training the model
train_split=80


## Analyze data

In [None]:
#TODO

## Train models

In [None]:
#TODO

## Performances

In [None]:
#TODO

## Test Model


In [None]:
#TODO