# Utilizar modelo linear para prever dados de queimadas no estado do Amazonas

## Imports

In [27]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

## Load datasets

In [2]:
kaggle_data = pd.read_csv("../../dados/amazon.csv", sep=',', engine="python")
inpe_data = pd.read_csv("../../dados/Focos_2018-01-01_2018-12-31.csv", sep=',', encoding='utf-8', engine="python")

## Prepare Data

In [3]:
def convert_timestamp_to_month(value):
    timestamp = str(value)
    return int(timestamp.split('/')[1])
    
def convert_timestamp_to_year(value):
    timestamp = str(value)
    return int(timestamp.split('/')[0])

def process_number(value):
    if str(value).split(".")[1] == "0":
        return int(value)
    else:
        return int(float(value) * 1000)
    
inpe_data['month'] = inpe_data['datahora'].map(convert_timestamp_to_month)
inpe_data['year'] = inpe_data['datahora'].map(convert_timestamp_to_year)
kaggle_data['number'] = kaggle_data['number'].map(process_number)

In [4]:
kaggle_data.columns

Index(['year', 'state', 'month', 'number', 'date'], dtype='object')

In [5]:
month_text_to_number = {
    "Janeiro": 1,
    "Fevereiro": 2,
    "Março": 3,
    "Abril": 4,
    "Maio": 5,
    "Junho": 6,
    "Julho": 7,
    "Agosto": 8,
    "Setembro": 9,
    "Outubro": 10,
    "Novembro": 11,
    "Dezembro": 12
}

def convert_month_text_to_number(value): 
    return month_text_to_number[str(value)]

kaggle_data['month'] = kaggle_data['month'].map(convert_month_text_to_number)

In [6]:
features = ['year', 'month']
features2 = features.copy()

for feature in features2:
    kaggle_data[feature + "^2"] = np.array(kaggle_data[feature].values) ** 2
    features.append(feature + "^2")

target_feature = ['number']

In [7]:
amazonas_treino = kaggle_data.loc[kaggle_data['state'] == "Amazonas"]

In [8]:
amazonas_test = inpe_data.loc[inpe_data['estado'] == "AMAZONAS"]

In [9]:
amazonas_test = amazonas_test.groupby("month")['estado'].count()

In [10]:
X_train, y_train = amazonas_treino[features], amazonas_treino[target_feature]

In [29]:
amazonas_test.values

array([  46,   93,   54,   14,   19,  123, 1346, 2589, 4928, 1725,  472,
         37], dtype=int64)

## Linear Modelling

In [52]:
model = RandomForestRegressor()
model.fit(X_train, y_train)

  


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [56]:
X_2018 = []
for i in range(12):
    X_2018.append([2018, i+1, 2018**2, (i+1) ** 2])
   
y_predicted = model.predict(X_2018)
print(mean_absolute_error(y_predicted, amazonas_test.values), r2_score(y_predicted, amazonas_test.values))

310.49166666666673 0.8479325319882955


In [54]:
kaggle_data.loc[kaggle_data['year'] == 2017].loc[kaggle_data['state'] == "Amazonas"]

Unnamed: 0,year,state,month,number,date,year^2,month^2
737,2017,Amazonas,1,65,2017-01-01,4068289,1
757,2017,Amazonas,2,52,2017-01-01,4068289,4
777,2017,Amazonas,3,45,2017-01-01,4068289,9
797,2017,Amazonas,4,20,2017-01-01,4068289,16
817,2017,Amazonas,5,40,2017-01-01,4068289,25
837,2017,Amazonas,6,119,2017-01-01,4068289,36
857,2017,Amazonas,7,1975,2017-01-01,4068289,49
877,2017,Amazonas,8,6316,2017-01-01,4068289,64
897,2017,Amazonas,9,4033,2017-01-01,4068289,81
917,2017,Amazonas,10,1581,2017-01-01,4068289,100


In [47]:
kaggle_data.loc[kaggle_data['year'] == 2017].loc[kaggle_data['state'] == "Amazonas"]['number'].iloc[7] + 1

6317

In [48]:
model.coef_

AttributeError: 'RandomForestRegressor' object has no attribute 'coef_'

In [None]:
model.intercept_

In [49]:
features

['year', 'month', 'year^2', 'month^2']