# Feature Engineering

Esse notebook tem como objetivo estender os dados através de transformações que dizem respeito ao domínio de aplicação.

## Imports

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Load datasets

In [11]:
# Read the two preprocessed CSV files. The paths are relative, so they are
# resolved against the directory the notebook kernel is running in.
kaggle_data = pd.read_csv(
    filepath_or_buffer="../../dados/Preprocessed/preprocessed_kaggle.csv",
)
inpe_data = pd.read_csv(
    filepath_or_buffer="../../dados/Preprocessed/preprocessed_inpe.csv",
)

In [12]:
# Rename every INPE column by position. The assignment is positional, so the
# list must have exactly one entry per existing column, in the file's order.
inpe_column_names = [
    'state',
    'city',
    'biome',
    'days_without_rain',
    'precipitacao',
    'risk_of_fire',
    'latitude',
    'longitude',
    'frp',
    'month',
    'year',
    'day',
    'hour',
]
inpe_data.columns = inpe_column_names

In [13]:
# Rename every Kaggle column by position. The list must match the number and
# order of the columns already present in the frame.
kaggle_column_names = [
    'year',
    'state',
    'month',
    'incidents',
    'day',
]
kaggle_data.columns = kaggle_column_names

## Merging new info

### Region of Brazil

In [14]:
# Macro-region (IBGE regional division) of each Brazilian federative unit,
# keyed by lower-case, accent-free state name (presumably the spelling used by
# the preprocessed datasets -- verify against the preprocessing notebook).
# Keeping each state next to its region avoids the positional mismatches that
# two parallel lists invite: Tocantins belongs to "norte" and Minas Gerais to
# "sudeste" (both were previously labelled "centro-oeste").
state_to_region = {
    'amapa': 'norte',
    'tocantins': 'norte',
    'para': 'norte',
    'piaui': 'nordeste',
    'ceara': 'nordeste',
    'maranhao': 'nordeste',
    'mato grosso': 'centro-oeste',
    'rio grande do norte': 'nordeste',
    'sergipe': 'nordeste',
    'alagoas': 'nordeste',
    'paraiba': 'nordeste',
    'bahia': 'nordeste',
    'minas gerais': 'sudeste',
    'amazonas': 'norte',
    'roraima': 'norte',
    'rio grande do sul': 'sul',
    'pernambuco': 'nordeste',
    'rondonia': 'norte',
    'goias': 'centro-oeste',
    'rio de janeiro': 'sudeste',
    'sao paulo': 'sudeste',
    'espirito santo': 'sudeste',
    'mato grosso do sul': 'centro-oeste',
    'santa catarina': 'sul',
    'parana': 'sul',
    'acre': 'norte',
    'distrito federal': 'centro-oeste',
}

# Same column-oriented layout as before ({"state": [...], "region": [...]}),
# now derived from the mapping so both lists always stay aligned.
region_state = {
    "state": list(state_to_region),
    "region": list(state_to_region.values()),
}
region_table = pd.DataFrame(region_state)

# Attach a 'region' column to both datasets. The inner join drops any row
# whose 'state' value is not one of the keys above.
inpe_data = inpe_data.merge(region_table, how="inner", on="state")
kaggle_data = kaggle_data.merge(region_table, how="inner", on="state")


## Aggregating inpe_data and kaggle_data

In [15]:
# Build agregated_inpe_data: one row per (state, month) holding the number of
# INPE records for that pair, plus the state's region and a constant year.
# Every combination of the states and months present in the data gets a row,
# including combinations with no records (incidents == 0).
estados = inpe_data['state'].unique()
meses = inpe_data['month'].unique()

# Count the records of every (state, month) pair in a single pass instead of
# re-filtering the whole frame once per pair.
records_per_pair = inpe_data.groupby(['state', 'month']).size()

# Expand to the full state x month grid, in order of first appearance, so that
# pairs without records are kept with a count of zero.
full_grid = pd.MultiIndex.from_product([estados, meses], names=['state', 'month'])
agregated_inpe_data = (
    records_per_pair
    .reindex(full_grid, fill_value=0)
    .rename('incidents')
    .reset_index()
)

# NOTE(review): the year is hard-coded to 2018 and inpe_data['year'] is
# ignored -- presumably the INPE extract covers only 2018; confirm.
agregated_inpe_data['year'] = 2018

# Region of each state, taken from the first row in which the state appears.
region_by_state = inpe_data.drop_duplicates('state').set_index('state')['region']
agregated_inpe_data['region'] = agregated_inpe_data['state'].map(region_by_state)

In [20]:
# Build kaggle_agregated_data: one row per (state, month, year) with the sum
# of 'incidents' for that combination and the state's region. The grid covers
# every state present in kaggle_data, months 1-12 and years 1998-2017;
# combinations without any record get a total of 0.
estados = kaggle_data['state'].unique()
meses = range(1, 13)
anos = range(1998, 2018)

# Sum only the 'incidents' column, grouped in a single pass. The previous
# version summed every column (string columns included) of a freshly filtered
# frame for each of the state x month x year combinations.
incident_totals = kaggle_data.groupby(['state', 'month', 'year'])['incidents'].sum()

# Expand to the full grid; the row order matches nested loops over state,
# then month, then year (year varying fastest).
full_grid = pd.MultiIndex.from_product(
    [estados, meses, anos], names=['state', 'month', 'year']
)
kaggle_agregated_data = incident_totals.reindex(full_grid, fill_value=0).reset_index()

# Region of each state, taken from the first row in which the state appears.
region_by_state = kaggle_data.drop_duplicates('state').set_index('state')['region']
kaggle_agregated_data['region'] = kaggle_agregated_data['state'].map(region_by_state)

# Keep the column order the saved CSV has always had.
kaggle_agregated_data = kaggle_agregated_data[
    ['state', 'month', 'incidents', 'year', 'region']
]

## Save Datasets

In [23]:
# Write the engineered datasets as CSV without the index column.
# Forward slashes are used because they are accepted as path separators on
# Windows as well as on Linux/macOS, and they match the paths used when
# loading the data; the previous backslash-separated paths only resolved to
# the intended folder on Windows. The target directory must already exist.
output_dir = "../../dados/Feature Engineered"

inpe_data.to_csv(output_dir + "/inpe_engineered.csv", index=False)
kaggle_data.to_csv(output_dir + "/kaggle_engineered.csv", index=False)
agregated_inpe_data.to_csv(output_dir + "/inpe_agregated_engineered.csv", index=False)
kaggle_agregated_data.to_csv(output_dir + "/kaggle_agregated_engineered.csv", index=False)