# Generic Feature Engineering

Esse notebook tem como objetivo estender os dados através de transformações que dizem respeito ao domínio de aplicação.

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Load datasets

In [2]:
# Read the two preprocessed CSV tables (Kaggle first, then INPE) into DataFrames.
kaggle_data, inpe_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../dados/Preprocessed/preprocessed_kaggle.csv",
        "../../dados/Preprocessed/preprocessed_inpe.csv",
    )
)

In [3]:
# Rename the INPE columns positionally: the i-th name below replaces the
# i-th column of the loaded frame, so the list length and order must match it.
inpe_data.columns = [
    'state', 'city', 'biome',
    'days_without_rain', 'precipitacao', 'risk_of_fire',
    'latitude', 'longitude', 'frp',
    'month', 'year', 'day', 'hour',
]

In [4]:
# Rename the Kaggle columns positionally (the i-th name replaces the i-th column).
kaggle_data.columns = [
    'year',
    'state',
    'month',
    'incidents',
    'day',
]

## Merging new info

### Region of Brazil

In [5]:
# Macro-region of each Brazilian federative unit (IBGE regional division).
# Written as an explicit state -> region mapping so every pair can be checked
# at a glance; with two parallel positional lists it is easy to misalign or
# mislabel an entry. Tocantins belongs to "norte" and Minas Gerais to "sudeste".
state_to_region = {
    'amapa': "norte",
    'tocantins': "norte",
    'para': "norte",
    'piaui': "nordeste",
    'ceara': "nordeste",
    'maranhao': "nordeste",
    'mato grosso': "centro-oeste",
    'rio grande do norte': "nordeste",
    'sergipe': "nordeste",
    'alagoas': "nordeste",
    'paraiba': "nordeste",
    'bahia': "nordeste",
    'minas gerais': "sudeste",
    'amazonas': "norte",
    'roraima': "norte",
    'rio grande do sul': "sul",
    'pernambuco': "nordeste",
    'rondonia': "norte",
    'goias': "centro-oeste",
    'rio de janeiro': "sudeste",
    'sao paulo': "sudeste",
    'espirito santo': "sudeste",
    'mato grosso do sul': "centro-oeste",
    'santa catarina': "sul",
    'parana': "sul",
    'acre': "norte",
    'distrito federal': "centro-oeste",
}

# Same column-oriented layout as before: one "state" list and one "region" list.
region_state = {
    "state": list(state_to_region),
    "region": list(state_to_region.values()),
}
region_lookup = pd.DataFrame(region_state)

# Inner merge: rows whose state is not a key of the mapping are dropped.
# NOTE: re-running this cell on already-merged frames would add duplicated
# region columns (region_x / region_y); run it once per fresh load.
inpe_data = inpe_data.merge(region_lookup, how="inner", on="state")
kaggle_data = kaggle_data.merge(region_lookup, how="inner", on="state")


## Aggregating inpe_data and kaggle_data

In [6]:
# Count INPE records per (state, month).
# A single groupby replaces re-filtering the whole frame for every pair.
estados = inpe_data['state'].unique()
meses = inpe_data['month'].unique()

# Every (state, month) pair must appear in the result, even with zero records,
# ordered state-major / month-minor in order of first appearance.
state_month_grid = pd.MultiIndex.from_product(
    [estados, meses], names=['state', 'month']
)
incident_counts = (
    inpe_data.groupby(['state', 'month'])
    .size()
    .reindex(state_month_grid, fill_value=0)  # pairs with no rows get a count of 0
    .rename('incidents')
)

# Region taken from the first row of each state.
region_by_state = inpe_data.drop_duplicates('state').set_index('state')['region']

agregated_inpe_data = incident_counts.reset_index()  # columns: state, month, incidents
# NOTE(review): the year is hard-coded although inpe_data has a 'year' column;
# confirm the INPE extract only covers 2018.
agregated_inpe_data['year'] = 2018
agregated_inpe_data['region'] = agregated_inpe_data['state'].map(region_by_state)

In [7]:
# Total Kaggle incidents per (state, month, year).
# A single groupby replaces filtering the frame three times and summing every
# column (string columns included) for each of the state x 12 x 20 combinations.
estados = kaggle_data['state'].unique()

# Every combination of state, month 1..12 and year 1998..2017 must appear in
# the result, ordered state -> month -> year.
kaggle_grid = pd.MultiIndex.from_product(
    [estados, range(1, 13), range(1998, 2018)],
    names=['state', 'month', 'year'],
)
incident_totals = (
    kaggle_data.groupby(['state', 'month', 'year'])['incidents']
    .sum()
    .reindex(kaggle_grid, fill_value=0)  # combinations with no rows total 0
)

# Region taken from the first row of each state.
kaggle_region_by_state = (
    kaggle_data.drop_duplicates('state').set_index('state')['region']
)

kaggle_agregated_data = incident_totals.reset_index()
kaggle_agregated_data['region'] = kaggle_agregated_data['state'].map(kaggle_region_by_state)
# Keep the column order used for the aggregated INPE frame.
kaggle_agregated_data = kaggle_agregated_data[['state', 'month', 'incidents', 'year', 'region']]

## Can we add 'biome' to the aggregated datasets?

The main problem here is that kaggle_data does not have a column about biome. So, my idea to solve this problem is to look at the INPE data and use it to fill in the Kaggle dataset.

### Let's look at the biome information available for each state

In [8]:
# For each state (in order of first appearance), list the distinct biomes
# found among its INPE rows.
for uf in inpe_data['state'].unique():
    rows_in_state = inpe_data['state'] == uf
    print(uf, inpe_data.loc[rows_in_state, 'biome'].unique())

amapa ['Amazonia']
tocantins ['Cerrado' 'Amazonia']
para ['Amazonia']
piaui ['Caatinga' 'Cerrado']
ceara ['Caatinga']
maranhao ['Cerrado' 'Amazonia' 'Caatinga']
mato grosso ['Cerrado' 'Amazonia' 'Pantanal']
rio grande do norte ['Caatinga' 'Mata Atlantica']
sergipe ['Mata Atlantica' 'Caatinga']
alagoas ['Mata Atlantica' 'Caatinga']
paraiba ['Mata Atlantica' 'Caatinga']
bahia ['Cerrado' 'Mata Atlantica' 'Caatinga']
minas gerais ['Cerrado' 'Mata Atlantica' 'Caatinga']
amazonas ['Amazonia']
roraima ['Amazonia']
rio grande do sul ['Pampa' 'Mata Atlantica']
pernambuco ['Caatinga' 'Mata Atlantica']
rondonia ['Amazonia' 'Cerrado']
goias ['Cerrado' 'Mata Atlantica']
rio de janeiro ['Mata Atlantica']
sao paulo ['Cerrado' 'Mata Atlantica']
espirito santo ['Mata Atlantica']
mato grosso do sul ['Cerrado' 'Mata Atlantica' 'Pantanal']
santa catarina ['Mata Atlantica']
parana ['Mata Atlantica' 'Cerrado']
acre ['Amazonia']
distrito federal ['Cerrado']


In [9]:
# Distinct biome labels present in the INPE data, in order of first appearance.
inpe_data.loc[:, 'biome'].unique()

array(['Amazonia', 'Cerrado', 'Caatinga', 'Pantanal', 'Mata Atlantica',
       'Pampa'], dtype=object)

So what we can do here is a one-hot encoding of 'biome' per state: 1 when the state contains the biome and 0 when it doesn't.

In [10]:
# Build one indicator column per biome: 1 if any INPE row of the state has
# that biome, else 0 (stored as floats, since the vectors come from np.zeros).
inpe_states = inpe_data['state'].unique()

biome_flags = {"state": []}
for biome_name in ('Amazonia', 'Cerrado', 'Caatinga', 'Pantanal', 'Mata Atlantica', 'Pampa'):
    biome_flags[biome_name] = np.zeros(len(inpe_states))

for position, uf in enumerate(inpe_states):
    biome_flags['state'].append(uf)
    biomes_in_state = inpe_data.loc[inpe_data['state'] == uf, 'biome'].unique()
    for biome_name in biomes_in_state:
        # A biome outside the six listed above raises KeyError here.
        biome_flags[biome_name][position] = 1

state_and_biomes = pd.DataFrame(biome_flags)

# Attach the indicator columns to both aggregated frames (inner join on state).
kaggle_agregated_data = kaggle_agregated_data.merge(state_and_biomes, how="inner", on="state")
agregated_inpe_data = agregated_inpe_data.merge(state_and_biomes, how="inner", on="state")

## Can we add 'latitude' and 'longitude' to the aggregated datasets?

The main problem here is that kaggle_data does not have columns for latitude and longitude. So, my idea to solve this problem is to take the mean latitude and longitude of the INPE records of each state.

In [11]:
# Mean latitude / longitude of the INPE rows of each state, one record per
# state in order of first appearance.
mean_position_rows = []
for uf in inpe_data['state'].unique():
    rows_of_state = inpe_data.loc[inpe_data['state'] == uf]
    mean_position_rows.append({
        'state': uf,
        'latitude_mean': rows_of_state['latitude'].mean(),
        'longitude_mean': rows_of_state['longitude'].mean(),
    })

aux_df = pd.DataFrame(
    mean_position_rows, columns=['state', 'latitude_mean', 'longitude_mean']
)

# Attach the per-state means to both aggregated frames (inner join on state).
kaggle_agregated_data = kaggle_agregated_data.merge(aux_df, how="inner", on="state")
agregated_inpe_data = agregated_inpe_data.merge(aux_df, how="inner", on="state")

## Save Datasets

In [12]:
# Stack the Kaggle rows on top of the INPE rows.
# pd.concat is used because DataFrame.append was removed in pandas 2.0;
# like append, it keeps each frame's original index labels.
agregated_data = pd.concat([kaggle_agregated_data, agregated_inpe_data])

# Forward slashes resolve on every OS (Windows included) and match the paths
# used when loading; backslash-separated paths only work on Windows.
# The output directory must already exist.
output_dir = "../../dados/Feature Engineered"

agregated_data.to_csv(f"{output_dir}/agregated_data.csv", index=False)
inpe_data.to_csv(f"{output_dir}/inpe_engineered.csv", index=False)
kaggle_data.to_csv(f"{output_dir}/kaggle_engineered.csv", index=False)
agregated_inpe_data.to_csv(f"{output_dir}/inpe_agregated_engineered.csv", index=False)
kaggle_agregated_data.to_csv(f"{output_dir}/kaggle_agregated_engineered.csv", index=False)