**Importando pacotes e bibliotecas que serão úteis para o CNN classifier**

In [None]:
# Optional install step, left disabled.
# NOTE(review): pydot is presumably wanted for the keras.utils.plot_model import below — confirm before enabling.
#!pip install pydot

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Conv1D, Flatten, Dense
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
import seaborn as sns
import plotly.graph_objects as go
from keras.optimizers import RMSprop
from sklearn.metrics import accuracy_score
from keras.utils import plot_model
from IPython.display import Image
from sklearn.feature_extraction.text import TfidfVectorizer

**Carregando a base de dados a ser utilizada**

In [None]:

# Read the dataset CSV into `df`; later cells use its 'descricao' and 'categoria' columns.
df = pd.read_csv(filepath_or_buffer='data/df_sem_duplicatas.csv')

**Set-Up do LabelEncoder model**

In [None]:
# Encode the values of df['categoria'] as integer class ids.
#
# The untouched labels are copied to 'categoria_original' the first time this cell runs and the
# encoder is always fitted on that copy. This keeps the cell idempotent: re-running it does not
# refit the encoder on already-encoded integers, so label_encoder.inverse_transform keeps
# returning the original category values.
label_encoder = LabelEncoder()
if 'categoria_original' not in df.columns:
    df['categoria_original'] = df['categoria']
df['categoria'] = label_encoder.fit_transform(df['categoria_original'])

**Separando em treino e teste, vetorizando os textos e definindo os hiperparâmetros iniciais**

In [None]:
# --- Train/test split ---------------------------------------------------------
# Texts and encoded labels; 20% of the rows are held out, with a fixed seed for the split.
X = df['descricao'].values
y = df['categoria'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Integer token sequences (input of the Embedding models) -------------------
# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Padding length is the longest *token sequence* in the training set. Measuring the longest
# description in characters (len of the raw string) over the whole frame would only give a loose
# upper bound on the token count and would inflate every padded input and the Embedding models.
# Test sequences longer than this are cut to max_length by pad_sequences.
max_length = max(len(sequence) for sequence in X_train_sequences)

X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post')

num_classes = len(label_encoder.classes_)

# --- TF-IDF features (input of the Dense and TF-IDF Conv1D models) --------------
# The vectorizer is fitted on the training texts and only applied to the test texts.
# Feature count is capped at the tokenizer vocabulary size + 1; matrices are made dense.
vectorizer = TfidfVectorizer(max_features=len(tokenizer.word_index) + 1)
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = vectorizer.transform(X_test).toarray()

# --- Shared search / training configuration -------------------------------------
initial_output_dim = 1        # base Embedding output_dim for the Embedding + Conv1D search loops
initial_cov1d_filters = 2     # base Conv1D filter count for the same loops
initial_dense_units = 1       # not referenced by the cells shown in this notebook
increment = True              # loop condition of the Embedding + Conv1D search loops
i = 1                         # counter incremented by those loops
epochs = 50                   # maximum epochs per model.fit call
monitor_metric = 'val_loss'   # metric watched by the early-stopping callback

# One callback instance shared by every model.fit call below: monitors val_loss (lower is better),
# patience of 5 epochs, and restores the best weights when it stops training.
early_stopping = EarlyStopping(monitor=monitor_metric, mode='min', patience=5, restore_best_weights=True)

**Representação gráfica da divisão treino / teste**

In [None]:
# Bar chart with the number of examples on each side of the train/test split.
num_train = len(X_train)
num_test = len(X_test)

plt.figure(figsize=(6, 6))
# num_train counts the training partition (X_train), so the first bar is labelled 'Treino';
# labelling it 'Validação' would misreport which subset the bar represents.
plt.bar(['Treino', 'Teste'], [num_train, num_test], color=['blue', 'green'])
plt.xlabel('Conjunto de Dados')
plt.ylabel('Quantidade de Exemplos')
plt.title('Divisão entre Treino e Teste')
plt.show()

**Modelo Inicial de treinamento**

In [None]:
# Width search for the hidden Dense layer: train a fresh two-layer network on the TF-IDF
# features with 1, 2, 3, ... hidden units, and stop at the first width for which
# early_stopping.stopped_epoch is positive after fitting.
densidade = 0
stopped_early = False
while not stopped_early:
    densidade += 1
    hidden_layer = Dense(densidade, activation='relu', input_shape=(X_train_tfidf.shape[1],))
    output_layer = Dense(num_classes, activation='softmax')
    model = tf.keras.Sequential([hidden_layer, output_layer])
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    history = model.fit(
        X_train_tfidf,
        y_train,
        epochs=epochs,
        validation_data=(X_test_tfidf, y_test),
        verbose=0,
        callbacks=[early_stopping],
    )
    stopped_early = early_stopping.stopped_epoch > 0

In [None]:
# Training curves of the last fit: accuracy on the left panel, loss on the right panel,
# each with the training and validation series.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

ax_acc.plot(acc, label='Treino')
ax_acc.plot(val_acc, label='Validação')
ax_acc.set_xlabel('Épocas')
ax_acc.set_ylabel('Precisão')
ax_acc.legend()
ax_acc.set_title('Precisão - Treino e Validação')

ax_loss.plot(loss, label='Treino')
ax_loss.plot(val_loss, label='Validação')
ax_loss.set_xlabel('Épocas')
ax_loss.set_ylabel('Perda')
ax_loss.legend()
ax_loss.set_title('Perda - Treino e Validação')

fig.tight_layout()
plt.show()

In [None]:
# Classification report on the TF-IDF test features.
y_pred = model.predict(X_test_tfidf)
# Highest-scoring output index per row is the predicted encoded class.
y_pred_classes = y_pred.argmax(axis=1)
# Map encoded ids back to the encoder's original label values for a readable report.
y_test_original, y_pred_original = (
    label_encoder.inverse_transform(encoded) for encoded in (y_test, y_pred_classes)
)
print(
    "Classification Report:",
    classification_report(y_test_original, y_pred_original),
    sep="\n",
)

In [None]:
# Model summary: hidden-layer width reached by the search loop, number of target classes,
# and the Keras layer-by-layer summary.
# Both values are integer counts, so they are printed as integers rather than with four decimals.
print(f"Densidade: {densidade}")
print(f"Numero de classes: {num_classes}")
model.summary()

In [None]:
# Confusion-matrix heat map (rows: true label, columns: predicted label) with per-cell counts.
#
# The axis labels are the encoder's full class list and are passed explicitly to
# confusion_matrix, so the matrix rows/columns and the tick labels always line up, even when
# the model never predicts one of the classes (labels taken only from the predictions would be
# shorter than the matrix in that case, misplacing ticks and leaving cells without a count).
labels = label_encoder.classes_
conf_matrix = confusion_matrix(y_test_original, y_pred_original, labels=labels)

plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Matriz de Confusão')
plt.colorbar()
tick_marks = np.arange(len(labels))
plt.xticks(tick_marks, labels)
plt.yticks(tick_marks, labels)

# Write every count in its cell. Named row/col (not i/j) so the notebook-level counter `i`
# used by the model-search loops is not overwritten by this plot.
for (row, col), count in np.ndenumerate(conf_matrix):
    plt.text(col, row, str(count), horizontalalignment='center', verticalalignment='center')
plt.ylabel('Rótulo Verdadeiro')
plt.xlabel('Rótulo Predito')
plt.show()

**Otimização do Modelo Inicial**

In [None]:
# Same hidden-width search as the initial model, but compiled with the legacy RMSprop optimizer
# (learning rate 0.001): widen the hidden Dense layer by one unit per attempt until
# early_stopping.stopped_epoch is positive after fitting.
densidade = 1
while True:
    model = tf.keras.Sequential()
    model.add(Dense(densidade, activation='relu', input_shape=(X_train_tfidf.shape[1],)))
    model.add(Dense(num_classes, activation='softmax'))

    optimizer = tf.keras.optimizers.legacy.RMSprop(learning_rate=0.001)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    history = model.fit(
        X_train_tfidf,
        y_train,
        epochs=epochs,
        validation_data=(X_test_tfidf, y_test),
        verbose=0,
        callbacks=[early_stopping],
    )

    if early_stopping.stopped_epoch <= 0:
        # Early stopping did not fire: try again with one more hidden unit.
        densidade += 1
        continue
    break

In [None]:
# Accuracy and loss curves (training vs. validation) for the last fit.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# One entry per panel: (training series, validation series, y-axis label, title).
curve_panels = [
    (acc, val_acc, 'Precisão', 'Precisão - Treino e Validação'),
    (loss, val_loss, 'Perda', 'Perda - Treino e Validação'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for panel_ax, (train_series, val_series, y_label, panel_title) in zip(axes, curve_panels):
    panel_ax.plot(train_series, label='Treino')
    panel_ax.plot(val_series, label='Validação')
    panel_ax.set(xlabel='Épocas', ylabel=y_label, title=panel_title)
    panel_ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Classification report on the TF-IDF test features for the current model.
y_pred = model.predict(X_test_tfidf)
y_pred_classes = y_pred.argmax(axis=1)  # index of the highest score in each row
# Decode both the true and the predicted ids back to the encoder's label values.
y_test_original, y_pred_original = map(
    label_encoder.inverse_transform, (y_test, y_pred_classes)
)
report_text = classification_report(y_test_original, y_pred_original)
print("Classification Report:", report_text, sep="\n")

In [None]:
# Model summary: hidden-layer width reached by the search loop, number of target classes,
# and the Keras layer-by-layer summary.
# Both values are integer counts, so they are printed as integers rather than with four decimals.
print(f"Densidade: {densidade}")
print(f"Numero de classes: {num_classes}")
model.summary()

In [None]:
# Confusion-matrix heat map (rows: true label, columns: predicted label) with per-cell counts.
#
# The encoder's full class list is used both as the `labels` argument of confusion_matrix and
# as the tick labels, which keeps the matrix and the ticks aligned even if some class never
# appears among the predictions (labels derived from the predictions alone would then be
# shorter than the matrix).
labels = label_encoder.classes_
conf_matrix = confusion_matrix(y_test_original, y_pred_original, labels=labels)

plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Matriz de Confusão')
plt.colorbar()
tick_marks = np.arange(len(labels))
plt.xticks(tick_marks, labels)
plt.yticks(tick_marks, labels)

# Annotate each cell with its count; row/col names avoid overwriting the notebook-level `i`.
for (row, col), count in np.ndenumerate(conf_matrix):
    plt.text(col, row, str(count), horizontalalignment='center', verticalalignment='center')
plt.ylabel('Rótulo Verdadeiro')
plt.xlabel('Rótulo Predito')
plt.show()

**Modelo com Conv1D**

In [None]:
# Filter-count search for a Conv1D model over the TF-IDF features: train a fresh
# Conv1D -> Flatten -> softmax network with 1, 2, 3, ... filters until
# early_stopping.stopped_epoch is positive after fitting.
densidade = 1
cov1d_filters = 1

# Conv1D takes (samples, steps, channels): add a trailing channel axis of size 1.
# The arrays do not change between attempts, so they are built once before the loop.
X_train_tfidf_reshaped = np.expand_dims(X_train_tfidf, axis=-1)
X_test_tfidf_reshaped = np.expand_dims(X_test_tfidf, axis=-1)

while True:
    conv_layer = Conv1D(
        filters=cov1d_filters,
        kernel_size=3,
        activation='relu',
        input_shape=(X_train_tfidf.shape[1], 1),
    )
    model = tf.keras.Sequential([
        conv_layer,
        Flatten(),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    history = model.fit(
        X_train_tfidf_reshaped,
        y_train,
        epochs=epochs,
        validation_data=(X_test_tfidf_reshaped, y_test),
        verbose=0,
        callbacks=[early_stopping],
    )
    if early_stopping.stopped_epoch > 0:
        break
    cov1d_filters += 1

In [None]:
# Accuracy (left) and loss (right) per epoch for the last fit, training vs. validation.
metrics_by_name = history.history
acc = metrics_by_name['accuracy']
val_acc = metrics_by_name['val_accuracy']
loss = metrics_by_name['loss']
val_loss = metrics_by_name['val_loss']

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

# Left panel: accuracy
axes[0].plot(acc, label='Treino')
axes[0].plot(val_acc, label='Validação')
axes[0].set(xlabel='Épocas', ylabel='Precisão', title='Precisão - Treino e Validação')
axes[0].legend()

# Right panel: loss
axes[1].plot(loss, label='Treino')
axes[1].plot(val_loss, label='Validação')
axes[1].set(xlabel='Épocas', ylabel='Perda', title='Perda - Treino e Validação')
axes[1].legend()

fig.tight_layout()
plt.show()

In [None]:
# Predictions and confusion matrix for the Conv1D model.
#
# Predict on X_test_tfidf_reshaped, the (samples, features, 1) array the model was validated on
# during training, instead of the 2-D TF-IDF matrix, so inference uses the same input layout
# as training and does not depend on any implicit rank adjustment.
y_pred = model.predict(X_test_tfidf_reshaped)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_original = label_encoder.inverse_transform(y_test)
y_pred_original = label_encoder.inverse_transform(y_pred_classes)

# The encoder's full class list drives both the matrix layout and the tick labels, keeping them
# aligned even if some class never appears among the predictions.
labels = label_encoder.classes_
conf_matrix = confusion_matrix(y_test_original, y_pred_original, labels=labels)

plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Matriz de Confusão')
plt.colorbar()
tick_marks = np.arange(len(labels))
plt.xticks(tick_marks, labels)
plt.yticks(tick_marks, labels)

# Annotate each cell with its count; row/col names avoid overwriting the notebook-level `i`.
for (row, col), count in np.ndenumerate(conf_matrix):
    plt.text(col, row, str(count), horizontalalignment='center', verticalalignment='center')
plt.ylabel('Rótulo Verdadeiro')
plt.xlabel('Rótulo Predito')
plt.show()

In [None]:
# Summary of the Conv1D model: filter count reached by the search loop, number of classes,
# classification report and Keras layer summary.
#
# The Conv1D network has no hidden Dense layer, so the tuned hyperparameter reported here is
# cov1d_filters (not `densidade`, which the Conv1D search never uses in the model).
# Counts are printed as integers.
print(f"cov1d_filters: {cov1d_filters}")
print(f"Numero de classes: {num_classes}")
print("Classification Report:")
print(classification_report(y_test_original, y_pred_original))
model.summary()

**Otimizando modelo com Conv1D - Adicionando Embedding**

In [None]:
# Search over Embedding size and Conv1D filter count on the padded token sequences:
# attempt k uses output_dim = initial_output_dim + k and filters = initial_cov1d_filters + k,
# and the search ends at the first attempt for which early_stopping.stopped_epoch is positive.
#
# The counter is reset here because `i` is a notebook-level name that other cells may have
# changed, and both hyperparameters are derived from it so that each attempt really trains a
# larger model (with a constant "+ 1" every attempt would rebuild the same network and the
# loop could only repeat identical trainings).
i = 1
while increment:
    output_dim = initial_output_dim + i
    cov1d_filters = initial_cov1d_filters + i
    model = tf.keras.Sequential([
        Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=output_dim, input_length=max_length),
        Conv1D(filters=cov1d_filters, kernel_size=1, activation='relu'),
        Flatten(),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    history = model.fit(X_train_padded, y_train, epochs=epochs, validation_data=(X_test_padded, y_test), verbose=0, callbacks=[early_stopping])
    if early_stopping.stopped_epoch > 0:
        break
    i += 1

In [None]:
# Training curves for the last fit: accuracy and loss, training vs. validation.
acc, val_acc, loss, val_loss = (
    history.history[metric_name]
    for metric_name in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)

fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

accuracy_ax.plot(acc, label='Treino')
accuracy_ax.plot(val_acc, label='Validação')
accuracy_ax.set_title('Precisão - Treino e Validação')
accuracy_ax.set_xlabel('Épocas')
accuracy_ax.set_ylabel('Precisão')
accuracy_ax.legend()

loss_ax.plot(loss, label='Treino')
loss_ax.plot(val_loss, label='Validação')
loss_ax.set_title('Perda - Treino e Validação')
loss_ax.set_xlabel('Épocas')
loss_ax.set_ylabel('Perda')
loss_ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Same Embedding + Conv1D search, compiled with the legacy RMSprop optimizer (learning rate 0.001):
# attempt k uses output_dim = initial_output_dim + k and filters = initial_cov1d_filters + k,
# and the search ends at the first attempt for which early_stopping.stopped_epoch is positive.
#
# The counter is reset here because `i` is a notebook-level name that other cells may have
# changed, and both hyperparameters are derived from it so that each attempt really trains a
# larger model (with a constant "+ 1" every attempt would rebuild the same network).
i = 1
while increment:
    output_dim = initial_output_dim + i
    cov1d_filters = initial_cov1d_filters + i
    model = tf.keras.Sequential([
        Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=output_dim, input_length=max_length),
        Conv1D(filters=cov1d_filters, kernel_size=1, activation='relu'),
        Flatten(),
        Dense(num_classes, activation='softmax')
    ])
    optimizer = tf.keras.optimizers.legacy.RMSprop(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    history = model.fit(X_train_padded, y_train, epochs=epochs, validation_data=(X_test_padded, y_test), verbose=0, callbacks=[early_stopping])
    # Per-epoch metrics of the latest attempt as a DataFrame.
    history_df = pd.DataFrame(history.history)
    if early_stopping.stopped_epoch > 0:
        break
    i += 1

In [None]:
# Per-epoch accuracy and loss of the last fit, training vs. validation, side by side.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panel_series = ((acc, val_acc), (loss, val_loss))
panel_names = ('Precisão', 'Perda')

for curve_ax, (train_curve, val_curve), metric_label in zip(axes, panel_series, panel_names):
    curve_ax.plot(train_curve, label='Treino')
    curve_ax.plot(val_curve, label='Validação')
    curve_ax.set_xlabel('Épocas')
    curve_ax.set_ylabel(metric_label)
    curve_ax.legend()
    # Title is the y-axis label followed by the fixed suffix.
    curve_ax.set_title(metric_label + ' - Treino e Validação')

fig.tight_layout()
plt.show()

In [None]:
# Run the current model on the padded test sequences and decode the labels.
y_pred = model.predict(X_test_padded)
# Predicted encoded class = index of the highest score in each row.
y_pred_classes = y_pred.argmax(axis=1)
# Convert true and predicted ids back to the encoder's original label values.
y_test_original, y_pred_original = map(
    label_encoder.inverse_transform, (y_test, y_pred_classes)
)

In [None]:
# Confusion-matrix heat map (rows: true label, columns: predicted label) with per-cell counts.
#
# The encoder's full class list is used both as the `labels` argument of confusion_matrix and
# as the tick labels, which keeps the matrix and the ticks aligned even if some class never
# appears among the predictions (labels derived from the predictions alone would then be
# shorter than the matrix).
labels = label_encoder.classes_
conf_matrix = confusion_matrix(y_test_original, y_pred_original, labels=labels)

plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Matriz de Confusão')
plt.colorbar()
tick_marks = np.arange(len(labels))
plt.xticks(tick_marks, labels)
plt.yticks(tick_marks, labels)

# Annotate each cell with its count; row/col names avoid overwriting the notebook-level `i`.
for (row, col), count in np.ndenumerate(conf_matrix):
    plt.text(col, row, str(count), horizontalalignment='center', verticalalignment='center')
plt.ylabel('Rótulo Verdadeiro')
plt.xlabel('Rótulo Predito')
plt.show()

In [None]:
# Report for the Embedding + Conv1D model: hyperparameters of the last attempt followed by the
# classification report.
# output_dim and cov1d_filters are integer counts, so they are printed as integers rather than
# with four decimals.
print(f"Output dim: {output_dim}")
print(f"cov1d_filters: {cov1d_filters}")
print("Classification Report:")
print(classification_report(y_test_original, y_pred_original))

In [None]:
# Model summary: print the Keras layer-by-layer summary of the current `model`
model.summary()