# Projeto #1 - Classificação

<p style='text-align: justify;'><font size="3">Análise de acidentes nas rodovias federais brasileiras entre 2007 e 2021.</font></p>

### Bibliotecas básicas e outros imports

In [None]:
import warnings

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Keep the name importable on old versions without breaking this cell on new ones.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

%run modules/text.py

%matplotlib inline
plt.rcParams['font.family'] = 'Arial, Helvetica, sans-serif'

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 200)
np.set_printoptions(suppress=True, precision=3)

### Carregamento dos dados

In [None]:
# Load the accident records; the path is relative to the notebook's working directory.
csv_path = 'dataset/classifier_acidentes_rodovias_brasileiras_2007_a_2021.csv'
df = pd.read_csv(csv_path, encoding='utf-8', low_memory=False)

# Quick sanity check: dataset dimensions and the first record.
n_rows, n_cols = df.shape
print(f'Linhas: {n_rows} | Coluna:  {n_cols}')
df.head(1)

### Agrupamento estado físico

In [None]:
# Derive the target label `classe` from the raw `estado_fisico` column.
# NOTE(review): `mapEstadoFisico` is not defined in this notebook — presumably provided by
# modules/text.py (loaded via %run); confirm which classes it produces.
df['classe'] = df['estado_fisico'].map(mapEstadoFisico)

# Share of each class in the full dataset, in percent with one decimal place.
class_share = df['classe'].value_counts(normalize=True).mul(100).round(1)
print('Proporção de dados na base:')
print(class_share)

### Balanceamento da base

In [None]:
# Undersample the non-fatal classes so each contributes a fixed number of rows.
# NOTE(review): 34253 is presumably the number of "com mortos" rows (the minority class) —
# confirm against the class counts before changing the dataset.
N_POR_CLASSE = 34253

# All rows of the "com mortos" class are kept.
com_mortos = df.query('classe=="com mortos"')

# GroupBy.sample draws N_POR_CLASSE rows without replacement from every remaining class and
# returns them with all original columns, `classe` included. Unlike groupby.apply(sample), it
# does not depend on how pandas treats the grouping column inside apply().
# The fixed random_state makes the drawn subset identical on every run.
sem_mortos = (
    df.query('classe!="com mortos"')
      .groupby('classe')
      .sample(n=N_POR_CLASSE, replace=False, random_state=42)
)

# Stack both parts and discard the original row labels.
sample = pd.concat([com_mortos, sem_mortos]).reset_index(drop=True)
print('Proporção de dados na base:')
print((sample['classe'].value_counts(normalize=True)*100).round(1))

### Seleção de features

In [None]:
# Predictor columns used by the models.
# NOTE(review): names (including the spelling of 'condicao_metereologica') presumably match
# the CSV header exactly — check the dataset before renaming anything.
atributos = [
    'dia_semana', 'fase_dia',
    'tipo_pista', 'sentido_via', 'tracado_via',
    'condicao_metereologica',
    'tipo_acidente', 'tipo_veiculo',
    'uf', 'br',
    'sexo', 'faixa_etaria',
]
alvo = 'classe'

# `colunas` lists the predictors followed by the target, in that order.
colunas = [*atributos, alvo]
dados = sample[colunas]
dados.head(1)

### Separação dos dados

In [None]:
# Target vector is the class label; the feature matrix is every other selected column.
y = dados['classe']
X = dados.drop(columns=['classe'])

### Label encoder

In [None]:
# Encode the class labels as integers. `le.classes_` keeps the original label of each code
# and is used later to name the classes in the report and the confusion matrix.
le = LabelEncoder().fit(y)
y = le.transform(y)

### Vetorização dos dados categóricos

In [None]:
# Integer-encode every categorical feature.
# One fitted LabelEncoder is kept per column in `encoders`, so any code can be decoded with
# encoders[col].inverse_transform(...). Refitting a single shared encoder would keep only the
# mapping of the last column.
# Values are cast to str first so mixed-type columns and NaN get a code as well.
# NOTE(review): label codes impose an arbitrary order on nominal categories, which
# distance/linear models (SVC, KNN, Ridge, MLP) will treat as meaningful — consider one-hot
# encoding.
# Re-running this cell on its own re-encodes the already-encoded integers; re-run from the
# "Separação dos dados" cell instead.
encoders = {}
for col in X.columns:
    encoders[col] = LabelEncoder()
    X[col] = encoders[col].fit_transform(X[col].astype(str))

### Divisão em treino e teste

In [None]:
# Hold out 30% of the rows for testing, preserving the class proportions (stratify=y).
# random_state pins the shuffle so the same rows land in train/test on every run; without it
# every metric computed below changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    shuffle=True, 
                                                    stratify=y,
                                                    random_state=42)
print('Shapes: ')
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

### Testando famílias de algoritmos

In [None]:
# Candidate model families; each is trained once on the training split and scored on the
# test split.
# random_state is fixed on every estimator that draws random numbers (the two tree ensembles
# and the MLP), so each model is deterministic for a given training split.
estimators = [
    RandomForestClassifier(n_estimators=700, max_depth=3, random_state=42),
    SVC(C=1.0, kernel='rbf'),
    ExtraTreesClassifier(n_estimators=300, max_depth=3, random_state=42),
    KNeighborsClassifier(n_neighbors=10),
    RidgeClassifier(alpha=0.01),
    MLPClassifier(hidden_layer_sizes=[512, 256, 128], activation='relu', solver='adam', alpha=1e-5, max_iter=35, batch_size=100, shuffle=True, verbose=False, random_state=42)
]

# Report plain test-set accuracy for each estimator.
for clf in estimators:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f'Resultado para o classificador {clf}')
    print(accuracy_score(y_test, y_pred))
    print('----------------------------------------------')

### Modelagem dos dados

Modelo

In [None]:
# Final model: a three-hidden-layer MLP (512-256-128 units).
# random_state fixes the weight initialisation and batch shuffling, so cross-validation
# scores and the reports below are repeatable for the same data.
clf = MLPClassifier(hidden_layer_sizes=[512, 256, 128], 
                    activation='relu', 
                    solver='adam', alpha=1e-5, max_iter=35, batch_size=100, shuffle=True, verbose=False,
                    random_state=42)

Validação cruzada

In [None]:
# 5-fold cross-validated accuracy over the full balanced dataset, run on two worker processes.
scores = cross_val_score(
    clf, X, y,
    scoring='accuracy',
    cv=5,
    n_jobs=2,
    verbose=True,
)

# Report the mean accuracy and a +/- 2 standard deviation band across the folds.
mean_acc = scores.mean()
spread = scores.std() * 2
print('Accuracy: %0.2f (+/- %0.2f)' % (mean_acc, spread))

Relatório de classificação

In [None]:
# Fit the final model on the training split and predict the held-out test rows.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1, labelled with the original class names.
class_names = le.classes_.tolist()
print(classification_report(y_test, y_pred, target_names=class_names))

Curva ROC - AUC

In [None]:
# Scores for the class encoded as 1: column 1 of predict_proba.
# NOTE(review): assumes a binary target; which label is encoded as 1 follows le.classes_ — confirm.
y_prob = clf.predict_proba(X_test)[:, 1]

# AUC is computed from the continuous scores, the same ones used for the curve below.
# Feeding hard 0/1 predictions to roc_auc_score collapses the curve to a single operating
# point, so the number would not match the plotted curve.
auc = round(roc_auc_score(y_test, y_prob), 2)
print(f'AUC: {auc}', '\n')

fpr, tpr, _ = roc_curve(y_test,  y_prob)
plt.rcParams['figure.figsize'] = [6, 4]
plt.plot(fpr, tpr, label='MLP - auc='+str(auc), color='black')
# Diagonal = performance of a random classifier.
plt.plot(np.linspace(0, 1, 100), np.linspace(0, 1, 100), linestyle='--', label='Baseline')
plt.legend(loc=4)
plt.title('AUC', y=1.03, size=14)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive rate')
plt.grid(False)
# Save before show(): show() may clear the current figure.
plt.savefig('auc_acidentes_fatais.png')
plt.show()

Matriz de confusão

In [None]:
np.set_printoptions(precision=2)
plt.rcParams['figure.figsize'] = [10, 5]

# plot_confusion_matrix was removed in scikit-learn 1.2. Building the matrix with
# confusion_matrix and drawing it with ConfusionMatrixDisplay gives the same figure and
# works on both old and current releases.
# normalize='true' divides each row by its total, so cells show per-class recall fractions.
cm = confusion_matrix(y_test, clf.predict(X_test), normalize='true')
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=le.classes_.tolist())
disp.plot(cmap=plt.cm.afmhot_r)
disp.ax_.set_title('Normalized confusion matrix', y=1.05)
plt.grid(False)
# Save before show(): show() may clear the current figure.
plt.savefig('matriz_confusao_acidentes_fatais.png')
plt.show()