Importando Bibliotecas necessárias

In [1]:
!pip install -q -U "tensorflow==2.8.*"
!pip install transformers==4.37.2



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

import numpy as np
from sklearn.svm import SVC
from gensim.models import Word2Vec
from sklearn.metrics import classification_report

import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.callbacks import EarlyStopping



Pré processamento de dados

In [3]:
# Carregando dataset
dados = pd.read_csv('amazon_alexa.tsv', sep='\t')

# Eliminando valores em branco nas colunas que serão utilizadas
dados.dropna(subset=['verified_reviews', 'feedback'], inplace=True)

#
dados['feedback'] = dados['feedback'].apply(lambda x: 'positive' if x == 1 else 'negative')
X = dados['verified_reviews']
y = dados['feedback']

# Rotulação
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Criando dataset de treino e validação
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Modelo 01 - SVM com Bag of Words(BOW)

In [4]:
# Criando Pipeline SVM com BOW
pipeline_bow = make_pipeline(CountVectorizer(), SVC(kernel='linear'))

# Treinando modelo 02
pipeline_bow.fit(X_train, y_train)

# Extraindo métricas do modelo 01
y_pred_bow = pipeline_bow.predict(X_test)
print("Métricas do modelo 01:\n", classification_report(y_test, y_pred_bow))

Métricas do modelo 01:
               precision    recall  f1-score   support

           0       0.70      0.43      0.54        92
           1       0.94      0.98      0.96       853

    accuracy                           0.93       945
   macro avg       0.82      0.71      0.75       945
weighted avg       0.92      0.93      0.92       945



Modelo 02 - SVM com Embeddings

In [5]:
# Separando os dados de treino em vetor
sentences = [review.split() for review in X_train]

# Criando o modelo a partir do vetor
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Função para obter os embeddings do dataset
def get_avg_word2vec(review, model, num_features):
    words = review.split()
    feature_vec = np.zeros((num_features,), dtype="float32")
    n_words = 0
    for word in words:
        if word in model.wv:
            n_words += 1
            feature_vec = np.add(feature_vec, model.wv[word])
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

# Transformando o dataset em vetores word embeddings
X_train_word2vec = np.array([get_avg_word2vec(review, word2vec_model, 100) for review in X_train])
X_test_word2vec = np.array([get_avg_word2vec(review, word2vec_model, 100) for review in X_test])

# Treinando modelo 02
svm_word2vec = SVC(kernel='linear')
svm_word2vec.fit(X_train_word2vec, y_train)

# Extraindo métricas do modelo 02
y_pred_word2vec = svm_word2vec.predict(X_test_word2vec)
print("Métricas do modelo 02:\n", classification_report(y_test, y_pred_word2vec))

Métricas do modelo 02:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        92
           1       0.90      1.00      0.95       853

    accuracy                           0.90       945
   macro avg       0.45      0.50      0.47       945
weighted avg       0.81      0.90      0.86       945



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Modelo 03 - BERT

In [6]:
# Carregando BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Carregando Modelo
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenrizando dados
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)

# Convertendo o dataset para tensor
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).shuffle(1000).batch(16)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(16)

# Compilando modelo
optimizer = Adam(learning_rate=3e-5)
loss = SparseCategoricalCrossentropy(from_logits=True)
metric = SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Treinando o modelo
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
model.fit(train_dataset, validation_data=test_dataset, epochs=10, callbacks=[early_stopping])

# Extraindo métricas do modelo 03
print("Métricas do modelo 03:\n", model.evaluate(test_dataset))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Métricas do modelo 03:
 [0.24900774657726288, 0.9513227343559265]
