In [176]:
# Bibliotecas de manipulação de dados
import pandas as pd

# Processamento de texto e NLP
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import wordnet as wn, sentiwordnet as swn
from googletrans import Translator
nltk.download('vader_lexicon')

# Métricas e avaliação
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, f1_score,)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/medaracaityte/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## 2.3

In [177]:
# Carregar o dataset
test_df = pd.read_csv('amazon_reviews_test.csv')

### Funções

#### Funções de Pré-Processamento

In [178]:
# Remoção de duplicados
def remove_duplicate_words(text):
    """
    Remove palavras duplicadas de um texto, preservando a ordem da primeira ocorrência.
    Exemplo: 'bom bom dia dia' -> 'bom dia'
    """
    words = text.split()
    seen = set()
    unique_words = []
    for word in words:
        if word not in seen:
            seen.add(word)
            unique_words.append(word)
    return ' '.join(unique_words)

In [179]:
#Traduzir as reviews não-inglesas para inglês
translator = Translator()
# Função para traduzir texto para inglês
def translate_to_english(text, src_lang):
    if src_lang == 'en' or src_lang == 'error':
        return text  # Não traduzir se já for inglês ou erro na deteção
    try:
        translation = translator.translate(text, src=src_lang, dest='en')
        return translation.text
    except Exception as e:
        print(f"Erro ao traduzir: {e}")
        return text  # Se falhar, devolve o texto original

In [180]:
# Definir a função de Tokenização
def word_based_tokenization(text):
    # Usar os espaços para dividir em tokens
    tokens_space = text.split()

    # Separar sinais de pontuação (ex.: ".", ",") usando regex
    text_with_punct = re.sub(r'([.,!?"])', r' \1 ', text)  # Adiciona espaço ao redor da pontuação
    text_with_punct = re.sub(r'\s+', ' ', text_with_punct)  # Remove múltiplos espaços

    # Tokenizar novamente após manipulação de pontuação e converter para minúsculas
    tokens_lower = [token.lower() for token in text_with_punct.split()]

    return ' '.join(tokens_lower)  # Retorna os tokens como uma string unificada

In [181]:
# Apagar quebras de linha e fazer casefolding
def remove_newlines_and_lower(text):
    text = re.sub(r'[\n\r]', '', text)
    return text.lower()

In [182]:
# Função para remover tags HTML específicas
def remove_specific_html_tags(text):
    # Substituir <br /> por quebra de linha
    text = text.replace('<br />', '\n')
    # Remover <a> e </a>
    text = re.sub(r'<a[^>]*>', '', text)  # Remove <a href="">
    text = text.replace('</a>', '')
    # Remover <span class="tiny"> e </span>
    text = re.sub(r'<span[^>]*>', '', text)  # Remove <span class="tiny">
    text = text.replace('</span>', '')
    # Remover <p> e </p>
    text = text.replace('<p>', '').replace('</p>', '')

    return text

In [183]:
# Encontrar e remover caracteres estranhos
padrao = re.compile(r"[^a-zA-Z0-9\s.,;:!?\"'()\[\]{}@#%&/\-_=+<>€$£]")

def remove_caracteres_estranhos(text):
    return padrao.sub('', str(text))

In [184]:
# Remover espaços duplos
def remove_multiple_spaces(text):
    return re.sub(r'\s+', ' ', text).strip()

In [185]:
# Substituição dos emojis por texto
def replace_emojis(text):
    """
    Substitui emojis por representações textuais.
    """
    emoji_dict = {
        ':)': 'good',
        ':-)': 'good',
        ':-D': 'good',
        ':D': 'good',
        ':(': 'bad',
        ':-(': 'bad',
        ':O': 'surprised',
    }

    # Substitui cada emoji pelo texto correspondente
    for emoji, emotion in emoji_dict.items():
        text = text.replace(emoji, emotion)

    return text

In [186]:
# Remoção das datas
def remove_dates(text):
    # Remove formatos comuns de datas: 14/04/2025, 04-14-2025, 2023-04-14, etc.
    text = re.sub(r'\b\d{1,4}[-/]\d{1,2}[-/]\d{1,4}\b', '', text)  # Datas tipo 14/04/2025 ou 2023-04-14
    text = re.sub(r'\b\d{4}\b', '', text)  # Anos isolados (ex.: 2023)
    return text.strip()

In [187]:
# Remoção dos números
def remove_numbers(text):
    # Remove números inteiros e decimais isolados
    text = re.sub(r'\b\d+\.\d+\b', '', text)  # Números decimais (ex.: 45.67)
    text = re.sub(r'\b\d+\b', '', text)       # Números inteiros (ex.: 123)
    return text.strip()

In [188]:
# Definir a função de negação
def preprocess_negation(text):
    words = word_tokenize(text.lower())
    result = []
    negation_active = False
    for word in words:
        if word in ["not", "no", "never", "don't", "didn't", "doesn't", "won't", "wouldn't", "can't", "couldn't", "isn't", "ain't", "hasn't", "haven't", "hadn't", "wasn't", "weren't", "neither", "nor"]:
            negation_active = True
            result.append(word)
            continue
        if word in [".", ",", "!", "?", ":", ";", "but","however","although","even though","despite","in spite of"]:
            negation_active = False
            result.append(word)
            continue
        if negation_active:
            result.append("NOT_" + word)
        else:
            result.append(word)
    return ' '.join(result)

#### Funções NRC

In [189]:
# Função para NRC Lexicon (com negação e tokenização opcionais)
emolex = pd.read_csv('NCR-lexicon.csv', delimiter=";", encoding="utf-8")[['English', 'Positive', 'Negative']]

def lexicon_sentiment(text, use_negation=False, use_preprocessing=False):
    if use_negation:
        text = preprocess_negation(text)
    if use_preprocessing:
        #text = remove_duplicate_words(text)
        #text = translate_to_english(text, src_lang='en')
        text = word_based_tokenization(text)
        text = remove_newlines_and_lower(text)
        text = remove_specific_html_tags(text)
        text = remove_multiple_spaces(text)
        text = remove_dates(text)
        text = remove_numbers(text)
        text = remove_caracteres_estranhos(text)
        text = replace_emojis(text)
    words = text.split()
    pos_count = neg_count = 0
    for word in words:
        if word.startswith("NOT_"):
            original_word = word[4:]
            entry = emolex[emolex['English'] == original_word]
            if not entry.empty:
                pos_count += entry['Negative'].values[0]
                neg_count += entry['Positive'].values[0]
        else:
            entry = emolex[emolex['English'] == word]
            if not entry.empty:
                pos_count += entry['Positive'].values[0]
                neg_count += entry['Negative'].values[0]
    return 'positive' if pos_count > neg_count else 'negative'


#### Funções SentiWordNet

In [190]:
# Funções auxiliares para SentiWordNet
def convert_tag(tag):
    tag_dict = {'N': wn.NOUN, 'V': wn.VERB, 'J': wn.ADJ, 'R': wn.ADV}
    try:
        return tag_dict[tag[0]]
    except:
        return wn.NOUN

def sentiwordnet_analysis(text, use_negation=False, use_preprocessing=False):
    if use_negation:
        text = preprocess_negation(text)
    if use_preprocessing:
        #text = remove_duplicate_words(text)
        #text = translate_to_english(text, src_lang='en')
        #text = word_based_tokenization(text)
        text = remove_newlines_and_lower(text)
        text = remove_specific_html_tags(text)
        text = remove_multiple_spaces(text)
        text = remove_dates(text)
        text = remove_numbers(text)
        text = remove_caracteres_estranhos(text)
        text = replace_emojis(text)
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    pos_score = 0
    neg_score = 0
    for word, tag in tagged:
        wn_tag = convert_tag(tag)
        if not wn_tag:
            continue
        if word.startswith("NOT_"):
            original_word = word[4:]
            synsets = list(swn.senti_synsets(original_word, wn_tag))
            if synsets:
                synset = synsets[0]
                pos_score += synset.neg_score()
                neg_score += synset.pos_score()
        else:
            synsets = list(swn.senti_synsets(word, wn_tag))
            if synsets:
                synset = synsets[0]
                pos_score += synset.pos_score()
                neg_score += synset.neg_score()
    return 'positive' if pos_score > neg_score else 'negative'

#### Funções VADER

In [191]:
# Inicializar o VADER
vader_analyzer = SentimentIntensityAnalyzer()

# Função para VADER
def vader_sentiment(text, use_negation=False, use_preprocessing=False):
    if use_negation:
        #text = remove_duplicate_words(text)
        #text = translate_to_english(text, src_lang='en')
        text = word_based_tokenization(text)
        text = remove_newlines_and_lower(text)
        text = remove_specific_html_tags(text)
        text = remove_multiple_spaces(text)
        text = remove_dates(text)
        text = remove_numbers(text)
        text = remove_caracteres_estranhos(text)
        text = replace_emojis(text)
    if use_preprocessing:
        text = word_based_tokenization(text)
    scores = vader_analyzer.polarity_scores(text)
    return 'positive' if scores['compound'] > 0 else 'negative'

### NRC

In [192]:
# Aplicação das funções feitas no código inicial
# NRC Lexicon
test_df['NRC_no_negation_or_preprocessing'] = test_df['review'].apply(lexicon_sentiment, use_negation=False, use_preprocessing=False)
test_df['NRC_no_negation_with_preprocessing'] = test_df['review'].apply(lexicon_sentiment, use_negation=False, use_preprocessing=True)
test_df['NRC_with_negation_no_preprocessing'] = test_df['review'].apply(lexicon_sentiment, use_negation=True, use_preprocessing=False)
test_df['NRC_with_negation_and_preprocessing'] = test_df['review'].apply(lexicon_sentiment, use_negation=True, use_preprocessing=True)

print("\nNRC - Sem Negação e Pré-processamento:")
print("Accuracy:", accuracy_score(test_df['sentiment'], test_df['NRC_no_negation_or_preprocessing']))
print(classification_report(test_df['sentiment'], test_df['NRC_no_negation_or_preprocessing']))
print("\nNRC - Sem Negação Com Pré-processamento:")
print("Accuracy:", accuracy_score(test_df['sentiment'], test_df['NRC_no_negation_with_preprocessing']))
print(classification_report(test_df['sentiment'], test_df['NRC_no_negation_with_preprocessing']))
print("\nNRC - Com Negação Sem Pré-processamento:")
print("Accuracy:", accuracy_score(test_df['sentiment'], test_df['NRC_with_negation_no_preprocessing']))
print(classification_report(test_df['sentiment'], test_df['NRC_with_negation_no_preprocessing']))
print("\nNRC - Com Negação e Pré-processamento:")
print("Accuracy:", accuracy_score(test_df['sentiment'], test_df['NRC_with_negation_and_preprocessing']))
print(classification_report(test_df['sentiment'], test_df['NRC_with_negation_and_preprocessing']))


NRC - Sem Negação e Pré-processamento:
Accuracy: 0.6429458005792305
              precision    recall  f1-score   support

    negative       0.42      0.45      0.44       741
    positive       0.75      0.73      0.74      1676

    accuracy                           0.64      2417
   macro avg       0.59      0.59      0.59      2417
weighted avg       0.65      0.64      0.65      2417


NRC - Sem Negação Com Pré-processamento:
Accuracy: 0.6677699627637568
              precision    recall  f1-score   support

    negative       0.45      0.40      0.43       741
    positive       0.75      0.78      0.77      1676

    accuracy                           0.67      2417
   macro avg       0.60      0.59      0.60      2417
weighted avg       0.66      0.67      0.66      2417


NRC - Com Negação Sem Pré-processamento:
Accuracy: 0.6839056681836988
              precision    recall  f1-score   support

    negative       0.48      0.50      0.49       741
    positive       0.78   

### SentiWordNet

In [193]:
# Aplicação das funções feitas no código inicial
# SentiWordNet Lexicon
test_df['SWN_no_negation_or_preprocessing'] = test_df['review'].apply(sentiwordnet_analysis, use_negation=False, use_preprocessing=False)
test_df['SWN_no_negation_with_preprocessing'] = test_df['review'].apply(sentiwordnet_analysis, use_negation=False, use_preprocessing=True)
test_df['SWN_with_negation_no_preprocessing'] = test_df['review'].apply(sentiwordnet_analysis, use_negation=True, use_preprocessing=False)
test_df['SWN_with_negation_and_preprocessing'] = test_df['review'].apply(sentiwordnet_analysis, use_negation=True, use_preprocessing=True)

print("\nSentiWordNet - Sem Negação e Pré-processamento:")
print("Accuracy:", accuracy_score(test_df['sentiment'], test_df['SWN_no_negation_or_preprocessing']))
print(classification_report(test_df['sentiment'], test_df['SWN_no_negation_or_preprocessing']))
print("\nSentiWordNet - Sem Negação Com Pré-processamento:")
print("Accuracy:", accuracy_score(test_df['sentiment'], test_df['SWN_no_negation_with_preprocessing']))
print(classification_report(test_df['sentiment'], test_df['SWN_no_negation_with_preprocessing']))
print("\nSentiWordNet - Com Negação Sem Pré-processamento:")
print("Accuracy:", accuracy_score(test_df['sentiment'], test_df['SWN_with_negation_no_preprocessing']))
print(classification_report(test_df['sentiment'], test_df['SWN_with_negation_no_preprocessing']))
print("\nSentiWordNet - Com Negação e Pré-processamento:")
print("Accuracy:", accuracy_score(test_df['sentiment'], test_df['SWN_with_negation_and_preprocessing']))
print(classification_report(test_df['sentiment'], test_df['SWN_with_negation_and_preprocessing']))


SentiWordNet - Sem Negação e Pré-processamento:
Accuracy: 0.7190732312784444
              precision    recall  f1-score   support

    negative       0.55      0.46      0.50       741
    positive       0.78      0.84      0.80      1676

    accuracy                           0.72      2417
   macro avg       0.66      0.65      0.65      2417
weighted avg       0.71      0.72      0.71      2417


SentiWordNet - Sem Negação Com Pré-processamento:
Accuracy: 0.7223831195697146
              precision    recall  f1-score   support

    negative       0.55      0.50      0.53       741
    positive       0.79      0.82      0.80      1676

    accuracy                           0.72      2417
   macro avg       0.67      0.66      0.67      2417
weighted avg       0.72      0.72      0.72      2417


SentiWordNet - Com Negação Sem Pré-processamento:
Accuracy: 0.7261067438973935
              precision    recall  f1-score   support

    negative       0.55      0.55      0.55       741

### VADER

In [194]:
# Aplicação das funções feitas no código inicial
# VADER
test_df['VADER_no_negation_or_preprocessing'] = test_df['review'].apply(vader_sentiment, use_negation=False, use_preprocessing=False)
test_df['VADER_no_negation_with_preprocessing'] = test_df['review'].apply(vader_sentiment, use_negation=False, use_preprocessing=True)
test_df['VADER_with_negation_no_preprocessing'] = test_df['review'].apply(vader_sentiment, use_negation=True, use_preprocessing=False)
test_df['VADER_with_negation_and_preprocessing'] = test_df['review'].apply(vader_sentiment, use_negation=True, use_preprocessing=True)
print("\nVADER - Sem Negação e Pré-processamento:")
print("Accuracy:", accuracy_score(test_df['sentiment'], test_df['VADER_no_negation_or_preprocessing']))
print(classification_report(test_df['sentiment'], test_df['VADER_no_negation_or_preprocessing']))
print("\nVADER - Sem Negação Com Pré-processamento:")
print("Accuracy:", accuracy_score(test_df['sentiment'], test_df['VADER_no_negation_with_preprocessing']))
print(classification_report(test_df['sentiment'], test_df['VADER_no_negation_with_preprocessing']))
print("\nVADER - Com Negação Sem Pré-processamento:")
print("Accuracy:", accuracy_score(test_df['sentiment'], test_df['VADER_with_negation_no_preprocessing']))
print(classification_report(test_df['sentiment'], test_df['VADER_with_negation_no_preprocessing']))
print("\nVADER - Com Negação e Pré-processamento:")
print("Accuracy:", accuracy_score(test_df['sentiment'], test_df['VADER_with_negation_and_preprocessing']))
print(classification_report(test_df['sentiment'], test_df['VADER_with_negation_and_preprocessing']))


VADER - Sem Negação e Pré-processamento:
Accuracy: 0.7894083574679355
              precision    recall  f1-score   support

    negative       0.78      0.44      0.56       741
    positive       0.79      0.94      0.86      1676

    accuracy                           0.79      2417
   macro avg       0.78      0.69      0.71      2417
weighted avg       0.79      0.79      0.77      2417


VADER - Sem Negação Com Pré-processamento:
Accuracy: 0.7931319817956144
              precision    recall  f1-score   support

    negative       0.79      0.44      0.57       741
    positive       0.79      0.95      0.86      1676

    accuracy                           0.79      2417
   macro avg       0.79      0.69      0.72      2417
weighted avg       0.79      0.79      0.77      2417


VADER - Com Negação Sem Pré-processamento:
Accuracy: 0.7935457178320232
              precision    recall  f1-score   support

    negative       0.79      0.44      0.57       741
    positive       0