In [5]:
import dask.dataframe as dd
import pandas as pd

In [6]:
# Raw GBIF export consumed by the loading cell (parsed there as tab-separated text).
FILE_PATH = 'assets/gbif.csv'
# Parquet file this notebook writes to and reads back from.
OUTPUT_FILE_PATH = 'generated_files/gbif.parquet'

In [7]:
# Maps the `stateProvince` spellings handled by this notebook to two-letter state
# codes. The mapping is applied with `Series.replace`, so any value that is not a
# key below passes through unchanged.
STATE_NAME_ENCODING = {
    # Santa Catarina
    "Santa Catarina": "SC",
    "Brazil - Santa Catarina": "SC",
    # São Paulo
    "São Paulo": "SP",
    "Brazil - São Paulo": "SP",
    "Sp": "SP",
    # Minas Gerais
    "Minas Gerais": "MG",
    "Brazil - Minas Gerais": "MG",
    # Paraná (with and without the accent)
    "Paraná": "PR",
    "Parana": "PR",
    # States that appear under a single spelling
    "Rio Grande do Sul": "RS",
    "Espírito Santo": "ES",
    "Rio de Janeiro": "RJ",
    "Bahia": "BA",
    "Mato Grosso do Sul": "MS",
}

In [8]:
## Convert the raw tab-separated GBIF export into a Parquet file.
# The columns listed in `dtype` are pinned explicitly instead of being inferred
# (presumably to avoid dtype-inference mismatches on sparse columns — confirm).
occurence_species_data_unfiltered = (
    dd.read_csv(
        FILE_PATH,
        sep='\t',
        dtype={
            'dateIdentified': 'object',
            'day': 'float64',
            'establishmentMeans': 'object',
            'identifiedBy': 'object',
            'mediaType': 'object',
            'month': 'float64',
            'recordNumber': 'object',
            'rightsHolder': 'object',
            'verbatimScientificNameAuthorship': 'object',
            'year': 'float64',
        },
    )
    .reset_index()  # kept as in the original pipeline: the old index becomes a regular column
    .compute()      # materialise the dask frame as an in-memory pandas DataFrame
)

# `DataFrame.to_parquet(path)` returns None, so the write is its own statement.
# Chaining it onto the assignment above would leave
# `occurence_species_data_unfiltered` bound to None instead of the data.
occurence_species_data_unfiltered.to_parquet(OUTPUT_FILE_PATH)

## Describing data before filtering

In [9]:
# Reload only the columns used by the rest of the notebook and give the frame a
# fresh positional index. The order of `columns` is kept as-is because it
# determines the column order of the resulting frame.
occurence_species_data = (
    pd.read_parquet(
        OUTPUT_FILE_PATH,
        columns=[
            'countryCode', 'locality',
            'decimalLatitude', 'decimalLongitude',
            'eventDate', 'individualCount',
            'basisOfRecord', 'collectionCode',
            'stateProvince',
        ],
    )
    .reset_index(drop=True)
)

In [10]:
# Cleaning pipeline, applied in the same order as the original statements.
occurence_species_data = (
    occurence_species_data
    # Keep only records located in Brazil.
    .loc[lambda frame: frame['countryCode'] == 'BR']
    # Drop records that carry no date information.
    .loc[lambda frame: frame['eventDate'].notna()]
    .reset_index(drop=True)
    # A missing individual count defaults to 1.
    .assign(individualCount=lambda frame: frame['individualCount'].fillna(1))
    # Drop records missing either coordinate (the index is intentionally not reset here).
    .loc[lambda frame: frame['decimalLatitude'].notna() & frame['decimalLongitude'].notna()]
    .assign(
        # Parse the dates as timezone-aware (UTC) datetimes.
        eventDate=lambda frame: pd.to_datetime(frame['eventDate'], format="mixed", utc=True),
        # Replace known state-name spellings with their codes; other values pass through.
        stateProvince=lambda frame: frame['stateProvince'].replace(STATE_NAME_ENCODING),
    )
)

In [11]:
# Discard records with no state information, then renumber the rows from 0.
occurence_species_data = (
    occurence_species_data
    .dropna(subset=['stateProvince'])
    .reset_index(drop=True)
)

In [12]:
# GBIF column name -> Portuguese label used in the exported files.
COLUMNS_RENAME = {
    "countryCode": "Pais",                        # country
    "locality": "Localizacao",                    # location
    "decimalLatitude": "Latitude",
    "decimalLongitude": "Longitude",
    "eventDate": "Data",                          # date
    "individualCount": "Contagem de individuos",  # individual count
    "collectionCode": "Plataforma",               # platform
    "stateProvince": "Estado",                    # state
    "basisOfRecord": "Fonte do registro",         # record source
}

In [13]:
# Switch the column headers to the labels defined in COLUMNS_RENAME.
occurence_species_data = occurence_species_data.rename(columns=COLUMNS_RENAME)

In [14]:
# Persist the treated data in both formats.
# NOTE(review): OUTPUT_FILE_PATH is the same path the raw Parquet was written to
# earlier in the notebook, so this call overwrites that file with the treated,
# renamed columns.
occurence_species_data.to_parquet(
    OUTPUT_FILE_PATH,
    index=False,
)
# Semicolon-separated CSV copy of the same frame.
occurence_species_data.to_csv(
    "generated_files/gbif_treated.csv",
    sep=';',
    index=False,
)

In [15]:
# Display the treated frame as the cell output (the rendered table elides the middle rows).
occurence_species_data

Unnamed: 0,Pais,Localizacao,Latitude,Longitude,Data,Contagem de individuos,Fonte do registro,Plataforma,Estado
0,BR,PN do Itatiaia--área geral (Partes Baixa e Alta),-22.406586,-44.624233,2010-08-25 00:00:00+00:00,1.0,HUMAN_OBSERVATION,EBIRD,RJ
1,BR,PE Campos do Jordão (Horto Florestal),-22.689444,-45.481945,1999-06-03 00:00:00+00:00,1.0,HUMAN_OBSERVATION,EBIRD,SP
2,BR,PE Campos do Jordão (Horto Florestal),-22.689444,-45.481945,1999-06-02 00:00:00+00:00,1.0,HUMAN_OBSERVATION,EBIRD,SP
3,BR,Hotel Veraneio Hampel grounds,-29.465906,-50.677185,2013-11-28 00:00:00+00:00,2.0,HUMAN_OBSERVATION,EBIRD,RS
4,BR,Vicinity of Sao Joaquim,-28.142239,-50.097656,2013-11-27 00:00:00+00:00,2.0,HUMAN_OBSERVATION,EBIRD,SC
...,...,...,...,...,...,...,...,...,...
2842,BR,Campos do Jordão--área geral,-22.737240,-45.589830,2012-09-09 00:00:00+00:00,1.0,HUMAN_OBSERVATION,EBIRD,SP
2843,BR,FLONA São Francisco de Paula,-29.423851,-50.386770,2014-02-28 00:00:00+00:00,2.0,HUMAN_OBSERVATION,EBIRD,RS
2844,BR,near Barracao; Espigao Alto,-27.600000,-51.500000,1971-11-26 00:00:00+00:00,1.0,MACHINE_OBSERVATION,ML,RS
2845,BR,São Francisco de Paula; Centro de Pesquisas e ...,-29.477000,-50.170000,2013-11-09 00:00:00+00:00,1.0,MACHINE_OBSERVATION,ML,RS
