In [18]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE, ADASYN
import os
from datetime import datetime
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.pipeline import Pipeline
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
RANDOM_STATE = 15

Importar os dados

In [3]:
focos = pd.read_csv("dados/Focos_limpo.csv")
focos = focos.drop(columns=["frp"])
focos = focos.dropna()

In [4]:
X_train, X_test, y_train, y_test = train_test_split(focos.drop(columns=["gravidade"]), focos["gravidade"], shuffle=True, random_state=RANDOM_STATE, test_size=.25)
Counter(y_test).items()

dict_items([(0, 440317), (1, 538)])

### Balanceamento de dados

In [5]:
X_train_balanced, y_train_balanced = ADASYN(random_state=RANDOM_STATE).fit_resample(X_train, y_train)
print(f"Antes : {Counter(y_train).items()}")
print(f"Depois : {Counter(y_train_balanced).items()}")

Antes : dict_items([(0, 1321001), (1, 1564)])
Depois : dict_items([(0, 1321001), (1, 1321079)])


## Treinamento: Multilayer Perceptron

In [6]:
mlp_pipeline = Pipeline([('scaler',  StandardScaler()),
            ('MLPClassifier', MLPClassifier(random_state = RANDOM_STATE))])

In [7]:
mlp_params = {
    "MLPClassifier__hidden_layer_sizes": [(1,),(5,), (10,)],
    "MLPClassifier__activation": ["logistic", "tanh", 'identity'],
    "MLPClassifier__alpha": [0.0005]
}
mlp_grid_search = GridSearchCV(mlp_pipeline, mlp_params, scoring="f1", cv=5, verbose=3)
mlp_grid_search = mlp_grid_search.fit(X_train_balanced, y_train_balanced)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END MLPClassifier__activation=logistic, MLPClassifier__alpha=0.0005, MLPClassifier__hidden_layer_sizes=(1,);, score=0.726 total time=  57.8s
[CV 2/5] END MLPClassifier__activation=logistic, MLPClassifier__alpha=0.0005, MLPClassifier__hidden_layer_sizes=(1,);, score=0.709 total time= 1.0min
[CV 3/5] END MLPClassifier__activation=logistic, MLPClassifier__alpha=0.0005, MLPClassifier__hidden_layer_sizes=(1,);, score=0.711 total time=  54.9s
[CV 4/5] END MLPClassifier__activation=logistic, MLPClassifier__alpha=0.0005, MLPClassifier__hidden_layer_sizes=(1,);, score=0.735 total time=  52.8s
[CV 5/5] END MLPClassifier__activation=logistic, MLPClassifier__alpha=0.0005, MLPClassifier__hidden_layer_sizes=(1,);, score=0.706 total time=  54.4s
[CV 1/5] END MLPClassifier__activation=logistic, MLPClassifier__alpha=0.0005, MLPClassifier__hidden_layer_sizes=(5,);, score=0.801 total time= 1.0min
[CV 2/5] END MLPClassifier__activation=l

In [10]:
mlp_grid_search.best_params_

{'MLPClassifier__activation': 'logistic',
 'MLPClassifier__alpha': 0.0005,
 'MLPClassifier__hidden_layer_sizes': (10,)}

In [11]:
mlp_best_model = mlp_grid_search.best_estimator_
mlp_trained = mlp_best_model.fit(X_train_balanced, y_train_balanced)
mlp_y_pred = mlp_trained.predict(X_test)
print(f"accuracy_score: {accuracy_score(y_test, mlp_y_pred)}")
print(f"f1_score: {f1_score(y_test, mlp_y_pred)}")
print(f"precision_score: {precision_score(y_test, mlp_y_pred)}")
print(f"recall_score: {recall_score(y_test, mlp_y_pred)}")

accuracy_score: 0.790953941772238
f1_score: 0.00779475253813938
precision_score: 0.003920082300070388
recall_score: 0.6728624535315985


In [19]:
confusion_matrix(y_test, mlp_y_pred)

array([[348334,  91983],
       [   176,    362]])

## Treinamento: Decision Tree

In [12]:
decision_tree_pipeline = Pipeline([('scaler',  StandardScaler()),
            ('DecisionTreeClassifier', DecisionTreeClassifier(random_state = RANDOM_STATE))])
decision_tree_params = {
    "DecisionTreeClassifier__criterion": ['gini', 'entropy', 'log_loss']
}
decision_tree_grid_search = GridSearchCV(decision_tree_pipeline, decision_tree_params, scoring="f1", cv=5, verbose=3)
decision_tree_grid_search = decision_tree_grid_search.fit(X_train_balanced, y_train_balanced)
decision_tree_grid_search.best_params_

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END DecisionTreeClassifier__criterion=gini;, score=0.969 total time=  13.6s
[CV 2/5] END DecisionTreeClassifier__criterion=gini;, score=0.975 total time=  13.6s
[CV 3/5] END DecisionTreeClassifier__criterion=gini;, score=0.969 total time=  13.4s
[CV 4/5] END DecisionTreeClassifier__criterion=gini;, score=0.981 total time=  14.3s
[CV 5/5] END DecisionTreeClassifier__criterion=gini;, score=0.980 total time=  14.0s
[CV 1/5] END DecisionTreeClassifier__criterion=entropy;, score=0.972 total time=  15.7s
[CV 2/5] END DecisionTreeClassifier__criterion=entropy;, score=0.974 total time=  15.4s
[CV 3/5] END DecisionTreeClassifier__criterion=entropy;, score=0.969 total time=  16.2s
[CV 4/5] END DecisionTreeClassifier__criterion=entropy;, score=0.986 total time=  16.8s
[CV 5/5] END DecisionTreeClassifier__criterion=entropy;, score=0.982 total time=  16.9s
[CV 1/5] END DecisionTreeClassifier__criterion=log_loss;, score=0.972 total

{'DecisionTreeClassifier__criterion': 'entropy'}

In [13]:
decision_tree_best_model = decision_tree_grid_search.best_estimator_
decision_tree_trained = decision_tree_best_model.fit(X_train_balanced, y_train_balanced)
decision_tree_y_pred = decision_tree_trained.predict(X_test)
print("DECISION TREE")
print(f"accuracy_score: {accuracy_score(y_test, decision_tree_y_pred)}")
print(f"f1_score: {f1_score(y_test, decision_tree_y_pred)}")
print(f"precision_score: {precision_score(y_test, decision_tree_y_pred)}")
print(f"recall_score: {recall_score(y_test, decision_tree_y_pred)}")

DECISION TREE
accuracy_score: 0.9992469179208583
f1_score: 0.732258064516129
precision_score: 0.6467236467236467
recall_score: 0.8438661710037175


In [20]:
confusion_matrix(y_test, decision_tree_y_pred)

array([[440069,    248],
       [    84,    454]])

## Treinamento: SGDClassifier

In [15]:
sgd_pipeline = Pipeline([('scaler',  StandardScaler()),
            ('SGDClassifier', SGDClassifier(random_state = RANDOM_STATE))])
sgd_params = {
    "SGDClassifier__penalty": ['l2', 'elasticnet'],
    "SGDClassifier__max_iter": [1000, 3000, 5000]
}
sgd_grid_search = GridSearchCV(sgd_pipeline, sgd_params, scoring="f1", cv=5, verbose=3)
sgd_grid_search = sgd_grid_search.fit(X_train_balanced, y_train_balanced)
sgd_grid_search.best_params_

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END SGDClassifier__max_iter=1000, SGDClassifier__penalty=l2;, score=0.731 total time=   3.3s
[CV 2/5] END SGDClassifier__max_iter=1000, SGDClassifier__penalty=l2;, score=0.709 total time=   3.2s
[CV 3/5] END SGDClassifier__max_iter=1000, SGDClassifier__penalty=l2;, score=0.733 total time=   3.2s
[CV 4/5] END SGDClassifier__max_iter=1000, SGDClassifier__penalty=l2;, score=0.729 total time=   3.2s
[CV 5/5] END SGDClassifier__max_iter=1000, SGDClassifier__penalty=l2;, score=0.728 total time=   3.2s
[CV 1/5] END SGDClassifier__max_iter=1000, SGDClassifier__penalty=elasticnet;, score=0.731 total time=   3.6s
[CV 2/5] END SGDClassifier__max_iter=1000, SGDClassifier__penalty=elasticnet;, score=0.709 total time=   3.6s
[CV 3/5] END SGDClassifier__max_iter=1000, SGDClassifier__penalty=elasticnet;, score=0.734 total time=   3.6s
[CV 4/5] END SGDClassifier__max_iter=1000, SGDClassifier__penalty=elasticnet;, score=0.728 total tim

{'SGDClassifier__max_iter': 1000, 'SGDClassifier__penalty': 'l2'}

In [16]:
sgd_best_model = sgd_grid_search.best_estimator_
sgd_trained = sgd_best_model.fit(X_train_balanced, y_train_balanced)
sgd_y_pred = sgd_trained.predict(X_test)
print("SGD Classifier")
print(f"accuracy_score: {accuracy_score(y_test, sgd_y_pred)}")
print(f"f1_score: {f1_score(y_test, sgd_y_pred)}")
print(f"precision_score: {precision_score(y_test, sgd_y_pred)}")
print(f"recall_score: {recall_score(y_test, sgd_y_pred)}")

SGD Classifier
accuracy_score: 0.7021968674507492
f1_score: 0.0049265564128605855
precision_score: 0.0024733637747336376
recall_score: 0.604089219330855


In [21]:
confusion_matrix(y_test, sgd_y_pred)

array([[309242, 131075],
       [   213,    325]])