In [None]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from transformers import AutoTokenizer, AutoModel
import nltk
from nltk.corpus import stopwords
import umap
from umap import UMAP
import string
from collections import Counter
import matplotlib.pyplot as plt
import re
import spacy

In [None]:
# Dataset location. `data` picks the corpus and must be set to one of:
# hatebr, toldbr, olidbr.
src_path = 'dataset_path/'
data = '' #hatebr, toldbr, olidbr

# Path of the training CSV for the chosen corpus.
train_file = f'{src_path}/{data}_train_balanced.csv'

In [None]:
# Class to analyse: 0 = neutral, 1 = offensive, 2 = hate speech
classe = 0

# Read the training CSV and keep only the rows labelled with the chosen class.
train_data = pd.read_csv(train_file)
train_data = train_data.loc[train_data['label'] == classe]

# Raw texts of the selected class as a plain Python list.
total_text = train_data["text"].to_list()

In [None]:
import spacy
import re
import string
from nltk.corpus import stopwords
import pandas as pd

# Load the small Portuguese spaCy pipeline once; the cleaning function below uses this
# module-level `nlp` object for tokenisation, POS tags and lemmas.
nlp = spacy.load("pt_core_news_sm")

# Function words dropped in addition to the spaCy/NLTK stop-word lists. Besides
# possessive pronouns this also holds demonstratives, a few prepositions and the
# unaccented negation "nao".
_EXTRA_DROPPED_WORDS = frozenset([
    'meu', 'minha', 'meus', 'minhas', 'teu', 'tua', 'teus', 'tuas', 'seu', 'sua', 'seus', 'suas',
    'nosso', 'nossa', 'nossos', 'nossas', 'vosso', 'vossa', 'vossos', 'vossas', 'desse', 'desses',
    'dessa', 'dessas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'pra', 'de', 'em', 'nao',
])

# Emoji placeholders written as :name: (matched at the start of the token text).
_EMOJI_PATTERN = re.compile(r':[\w_]+:')

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-built union of the spaCy and NLTK Portuguese stop words (see _get_stop_words).
_STOP_WORDS_CACHE = None


def _get_stop_words():
    """Return the combined stop-word set, building it on the first call only."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        # Imported explicitly: reaching it through `spacy.lang.pt...` attribute access
        # only works when that submodule happens to have been imported already.
        from spacy.lang.pt.stop_words import STOP_WORDS
        combined = set(STOP_WORDS).union(stopwords.words('portuguese'))
        combined.add('httpurl')
        _STOP_WORDS_CACHE = frozenset(combined)
    return _STOP_WORDS_CACHE


def remove_verbs_propernouns_possessivepronouns(texto):
    """Normalise one text into a space-separated string of lemmas.

    The text is lower-cased, hyperlinks (``http...``) are removed, and the rest is
    run through the module-level spaCy pipeline ``nlp``. A token is kept when it
    looks like an emoji placeholder (``:name:``), or when it is not a stop word,
    not tagged VERB or PROPN, and not in the extra dropped-word list. Kept tokens
    are replaced by their lemma with ASCII punctuation stripped (emoji placeholders
    are kept verbatim), and results of two characters or fewer are discarded.

    Parameters
    ----------
    texto : str
        Raw text. Any non-string value (``None``, or e.g. the float NaN pandas
        uses for a missing cell) is treated as "no text".

    Returns
    -------
    str or None
        The cleaned, lower-cased text, or ``None`` when nothing is left, so the
        caller can drop such rows.
    """
    # Guard against missing values: calling .lower() on them would raise.
    if not isinstance(texto, str):
        return None

    texto = texto.lower()

    # Remove hyperlinks
    texto = re.sub(r'http\S+', '', texto)

    # Nothing but whitespace left: no token could survive, so skip the spaCy call.
    if not texto.strip():
        return None

    stop_words = _get_stop_words()

    def is_emoji(token):
        return _EMOJI_PATTERN.match(token.text) is not None

    def keep(token):
        # Emoji placeholders are always kept, regardless of the other filters.
        if is_emoji(token):
            return True
        word = token.text.lower()
        return (
            word not in stop_words
            and token.pos_ != "VERB"
            and token.pos_ != "PROPN"
            and word not in _EXTRA_DROPPED_WORDS
        )

    def clean_token(token):
        # Emoji placeholders keep their colons; everything else becomes its lemma
        # with punctuation characters removed.
        if is_emoji(token):
            return token.text
        return token.lemma_.translate(_PUNCTUATION_TABLE)

    tokens_cleaned = [clean_token(token) for token in nlp(texto) if keep(token)]

    # Drop leftovers of one or two characters (this also removes empty strings).
    tokens_cleaned = [token for token in tokens_cleaned if len(token) > 2]

    cleaned_text = ' '.join(tokens_cleaned).strip().lower()

    return cleaned_text if cleaned_text else None

# Clean every selected text; the cleaning function returns None for texts that end up empty.
clean_text = list(map(remove_verbs_propernouns_possessivepronouns, total_text))

# Raw and cleaned texts side by side, one row per original text.
total_text_df = pd.DataFrame({'text': total_text, 'clean_text': clean_text})

total_text_df

In [None]:
# Keep only usable cleaned texts: drop missing values, blank strings,
# and strings of two characters or fewer.
filtered_texts = (
    total_text_df['clean_text']
    .dropna()
    .loc[lambda texts: texts.str.strip() != '']
    .loc[lambda texts: texts.str.len() > 2]
)

In [None]:
# Sanity check: collect any word shorter than three characters that is still
# present in the filtered texts.
short_words = [
    token
    for cleaned in filtered_texts.dropna()
    for token in str(cleaned).split()
    if len(token) < 3
]
short_words

In [None]:
# Flatten the clean_text column into one list with every word occurrence.
all_words = [
    token
    for cleaned in total_text_df['clean_text'].dropna()
    for token in str(cleaned).split()
]
all_words

### Frequência

In [None]:
# Keep only the texts that survived cleaning (None marks a text that ended up empty)
filtered_clean_text = [text for text in clean_text if text is not None]

# Count how often each word occurs
word_freq = Counter(" ".join(filtered_clean_text).split())

# Select the N most frequent words
N = 20
most_common_words = word_freq.most_common(N)

# Split the (word, count) pairs into two parallel sequences for plotting.
# With no words at all, zip(*[]) yields nothing and the unpacking would raise,
# so fall back to empty sequences and let the cell render an empty chart.
if most_common_words:
    words, frequencies = zip(*most_common_words)
else:
    words, frequencies = (), ()

# Horizontal bar chart of the most frequent words
plt.figure(figsize=(10,8))
plt.barh(words, frequencies, color='skyblue')
plt.xlabel('Frequência')
plt.ylabel('Palavras')
plt.title(f'As {N} Palavras Mais Frequentes')
plt.gca().invert_yaxis()  # Flip the y axis so the most frequent word is at the top
plt.show()


In [None]:
from collections import Counter
import pandas as pd

# Tally every word occurrence across the cleaned texts.
all_words = " ".join(total_text_df['clean_text'].dropna()).split()
word_counts = Counter(all_words)

# Turn the tally into a table, order it by count (highest first) and keep the top 20.
word_df = (
    pd.DataFrame(word_counts.items(), columns=['Palavra', 'Quantidade'])
    .sort_values(by='Quantidade', ascending=False)
    .head(20)
    .reset_index(drop=True)
)

# Show the 20 most frequent words.
word_df

### Tópicos

In [None]:
# Load the pretrained "melll-uff/bertweetbr" checkpoint; `model` is used further down
# when the BERTopic model is built.
model = AutoModel.from_pretrained("melll-uff/bertweetbr")

In [None]:
# Fixed seed so the dimensionality reduction is repeatable between runs.
# NOTE(review): only random_state is set here, so every other UMAP parameter is the
# umap-learn default rather than whatever BERTopic would configure on its own —
# confirm that n_neighbors / n_components / min_dist / metric are as intended.
umap_model = UMAP(random_state=42)

In [None]:
# BERTopic's Hugging Face backend is documented to take a `transformers`
# feature-extraction pipeline. A bare AutoModel is not one of the embedding-model
# types it documents, so wrap the loaded model together with its tokenizer first.
from transformers import pipeline

embedding_pipeline = pipeline(
    "feature-extraction",
    model=model,
    # Load the tokenizer that belongs to the same checkpoint as `model`.
    tokenizer=AutoTokenizer.from_pretrained(model.config.name_or_path),
)

topic_model = BERTopic(language='portuguese', embedding_model=embedding_pipeline,  umap_model=umap_model, calculate_probabilities=True)

# Pass a plain list of strings: `filtered_texts` is a pandas Series whose index has
# gaps after filtering, while fit_transform is documented to take a list of documents.
topics_total, probabilities = topic_model.fit_transform(filtered_texts.tolist())

In [None]:
# Display the representations of all topics found by the fitted model.
topic_model.get_topics()

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim — probably a leftover;
# confirm before removing it.
topic_model.get_topics()

In [None]:
# Topic overview table; its "Topic" column holds the topic ids in display order.
freq = topic_model.get_topic_info()

# For each row of the overview, the words of that topic (the score that comes with
# each word is dropped).
topics = [
    [word for word, _ in topic_model.get_topic(topic_nr)]
    for topic_nr in freq["Topic"]
]

# One column per topic, labelled by its position in the overview (starting at 1),
# not by the topic id itself.
topic = pd.DataFrame()
for position, words in enumerate(topics, start=1):
    topic['Topic ' + str(position)] = words
topic