In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from transformers import AutoTokenizer, AutoModel
import torch
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Where the CSV splits live and which dataset to analyse.
src_path = 'dataset_path/'
data = ''  # dataset name: hatebr, toldbr or olidbr

# Each split is addressed as "<src_path>/<data>_<split>.csv".
train_file = f'{src_path}/{data}_train_balanced.csv'
test_file = f'{src_path}/{data}_test.csv'

In [None]:
# Read both CSV splits from disk.
train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)

# Keep only the training rows that belong to a single class.
classe = 0  # 0 -- neutral, 1 -- offensive, 2 -- hate speech
train_data = train_data.loc[train_data['label'] == classe]

# Raw texts of each split as plain Python lists.
train_texts = train_data["text"].tolist()
test_texts = test_data["text"].tolist()

In [None]:
# Load the tokenizer and the encoder from the same pretrained checkpoint.
checkpoint = 'melll-uff/bertweetbr'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Target text length used by the length filter further down;
# adjust it according to the number of tokens wanted.
num_pal = 9

In [None]:
def get_embeddings(texts):
    """Embed each text with the module-level ``tokenizer`` and ``model``.

    Every text is encoded on its own (truncated to at most 512 tokens), run
    through the model with gradient tracking disabled, and reduced to one
    vector by averaging ``last_hidden_state`` over dimension 1 (presumably the
    token axis -- confirm against the model's output layout).

    Args:
        texts: Iterable of strings.

    Returns:
        A list with one NumPy array per input text, in input order.
    """
    def _embed(text):
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        # Average over dimension 1, drop the singleton batch axis, hand back NumPy.
        return hidden_states.mean(dim=1).squeeze().numpy()

    return [_embed(text) for text in texts]

In [None]:
# Embed every training text, then rescale the embeddings with an L2 norm
# (one row per text after `normalize`).
train_embeddings = get_embeddings(train_data['text'])
train_embeddings_norm = normalize(train_embeddings, norm='l2')

In [None]:
# Embed every test text, then rescale the embeddings with an L2 norm
# (same treatment as the training embeddings).
test_embeddings = get_embeddings(test_data['text'])
test_embeddings_norm = normalize(test_embeddings, norm='l2')

In [None]:
# Fit K-Means for k = 1..10 and record the inertia (WCSS) of each fit.
elbow_ks = range(1, 11)
wcss = []
for k in elbow_ks:
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=42)
    wcss.append(kmeans.fit(train_embeddings_norm).inertia_)

# Elbow plot: WCSS against the number of clusters.
plt.plot(elbow_ks, wcss)
plt.title('Regra do Cotovelo')
plt.xlabel('Número de clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
def calculate_wcss(train_embeddings_norm, max_clusters=10):
    """Return the K-Means inertia (WCSS) for k = 1..max_clusters.

    Args:
        train_embeddings_norm: Samples to cluster, one row per sample.
        max_clusters: Largest number of clusters to try (default 10, the range
            that used to be hard-coded). It is capped at the number of
            samples, because K-Means cannot fit more clusters than samples.

    Returns:
        A list whose element ``k - 1`` is the inertia of the fit with ``k``
        clusters.
    """
    largest_k = min(max_clusters, len(train_embeddings_norm))

    wcss = []
    for n in range(1, largest_k + 1):
        # n_init is pinned so the curve does not depend on the installed
        # scikit-learn's default and matches the elbow-plot cell (n_init=10).
        kmeans = KMeans(n_clusters=n, n_init=10, random_state=42)
        kmeans.fit(train_embeddings_norm)
        wcss.append(kmeans.inertia_)
    return wcss

def find_elbow(wcss):
    """Locate the elbow of a WCSS curve.

    Point ``i`` of the curve is ``(i + 1, wcss[i])``. The elbow is the point
    with the largest perpendicular distance to the straight line joining the
    first and the last point; on ties the smallest cluster count wins.

    Args:
        wcss: Non-empty sequence of inertia values for 1, 2, ... clusters.

    Returns:
        The 1-based number of clusters at the elbow.

    Raises:
        ValueError: If ``wcss`` is empty.
    """
    if len(wcss) == 0:
        raise ValueError("wcss must contain at least one value")

    x1, y1 = 1, wcss[0]
    x2, y2 = len(wcss), wcss[-1]

    # The chord length is the same for every point, so compute it once.
    denominator = np.sqrt((y2 - y1)**2 + (x2 - x1)**2)
    if denominator == 0:
        # Only possible with a single point: there is no line to measure
        # against, and that point is the only candidate (avoids 0 / 0).
        return 1

    distances = []
    for x0, y0 in enumerate(wcss, start=1):
        numerator = abs((y2 - y1)*x0 - (x2 - x1)*y0 + x2*y1 - y2*x1)
        distances.append(numerator / denominator)

    return distances.index(max(distances)) + 1

# Recompute the inertia curve with calculate_wcss and pick its elbow with find_elbow.
wcss = calculate_wcss(train_embeddings_norm)
elbow_point = find_elbow(wcss)

# Report the chosen number of clusters (the message itself is in Portuguese).
print(f'O ponto de cotovelo é {elbow_point} clusters.')

In [None]:
# Silhouette score for k = 2..10 (the minimum number of clusters must be 2).
silhouette_ks = range(2, 11)
sil_scores = []
for k in silhouette_ks:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(train_embeddings_norm)
    sil_scores.append(silhouette_score(train_embeddings_norm, kmeans.labels_))

# Silhouette score against the number of clusters.
plt.plot(silhouette_ks, sil_scores)
plt.title('Silhouette Score')
plt.xlabel('Número de clusters')
plt.ylabel('Silhouette Score')
plt.show()

In [None]:
# Cluster the training embeddings with the number of clusters chosen by the elbow heuristic.
n_clusters = elbow_point

kmeans_train = KMeans(n_clusters=n_clusters, random_state=0).fit(train_embeddings_norm)
centroids_train = kmeans_train.cluster_centers_
# Reuse the assignments produced by the fit above: fit_predict would train the
# whole model a second time only to return these same labels.
train_labels = kmeans_train.labels_

### Similaridade

In [None]:
def find_nearest_and_farthest_clusters(train_embeddings, train_labels, test_embeddings, n_clusters):
    """Find the training clusters closest to and farthest from the test set.

    For every cluster id in ``range(n_clusters)`` the mean cosine distance
    (``1 - cosine similarity``) over all (cluster member, test embedding)
    pairs is computed. The cluster with the smallest mean is the nearest, the
    one with the largest mean is the farthest.

    Args:
        train_embeddings: 2-D array-like, one row per training text.
        train_labels: Cluster id of each training row.
        test_embeddings: 2-D array-like, one row per test text.
        n_clusters: Number of cluster ids to consider.

    Returns:
        ``(nearest_cluster_indices, farthest_cluster_indices)``: positions in
        ``train_embeddings`` of the members of the nearest and of the farthest
        cluster.

    Raises:
        ValueError: If none of the cluster ids has any member.
    """
    # Accept plain lists as well: boolean masks need real arrays.
    train_embeddings = np.asarray(train_embeddings)
    train_labels = np.asarray(train_labels)

    cluster_distances = []
    for i in range(n_clusters):
        cluster_embeddings = train_embeddings[train_labels == i]
        if len(cluster_embeddings) == 0:
            # An empty cluster has no mean distance; a NaN here would make the
            # ordering below unreliable, so leave it out.
            continue

        # One matrix call yields every member-vs-test similarity at once,
        # instead of one cosine_similarity call per pair.
        similarities = cosine_similarity(cluster_embeddings, test_embeddings)
        cluster_distances.append((i, np.mean(1 - similarities)))

    if not cluster_distances:
        raise ValueError("no cluster in range(n_clusters) has any member")

    # Order the clusters by their mean distance to the test set.
    cluster_distances.sort(key=lambda x: x[1])

    nearest_cluster, _ = cluster_distances[0]
    farthest_cluster, _ = cluster_distances[-1]

    nearest_cluster_indices = np.where(train_labels == nearest_cluster)[0]
    farthest_cluster_indices = np.where(train_labels == farthest_cluster)[0]

    return nearest_cluster_indices, farthest_cluster_indices

# Row positions (within the training embeddings) of the members of the clusters
# nearest to and farthest from the test embeddings.
nearest_indices, farthest_indices = find_nearest_and_farthest_clusters(train_embeddings_norm, train_labels, test_embeddings_norm, n_clusters)

In [None]:
# nearest_indices / farthest_indices are already computed right after the
# definition of find_nearest_and_farthest_clusters; calling it again here with
# identical arguments would only repeat the same expensive distance pass.
nearest_texts = train_data.iloc[nearest_indices]['text']
farthest_texts = train_data.iloc[farthest_indices]['text']

print("Textos mais próximos:")
print(nearest_texts.head())
print("\nTextos mais distantes:")
print(farthest_texts.head())

In [None]:
def calculate_average_distance_to_test(cluster_indices, test_embeddings, train_embeddings):
    """Mean cosine distance between selected training rows and the test set.

    Args:
        cluster_indices: Sequence of row positions into ``train_embeddings``.
        test_embeddings: 2-D array-like, one row per test text.
        train_embeddings: 2-D array-like, one row per training text.

    Returns:
        The mean of ``1 - cosine similarity`` over every (selected training
        row, test row) pair, or NaN when there are no pairs.
    """
    selected = np.asarray(train_embeddings)[cluster_indices]
    if len(selected) == 0 or len(test_embeddings) == 0:
        # No pairs to average over.
        return np.nan

    # A single matrix call covers all pairs, instead of one
    # cosine_similarity call per (train, test) pair.
    return np.mean(1 - cosine_similarity(selected, test_embeddings))

def get_texts_sorted_by_distance(cluster_indices, train_data, test_embeddings, train_embeddings, ascending=True, n=2):
    """Return the texts of a cluster ordered by their mean distance to the test set.

    Each member is scored individually with
    ``calculate_average_distance_to_test``; members are then ordered by that
    score (smallest first when ``ascending`` is true, largest first otherwise).

    Args:
        cluster_indices: Row positions of the cluster members in ``train_data``.
        train_data: DataFrame with a ``'text'`` column, addressed positionally.
        test_embeddings: Test embeddings forwarded to the distance helper.
        train_embeddings: Training embeddings forwarded to the distance helper.
        ascending: Sort direction of the distances.
        n: Maximum number of texts to return.

    Returns:
        A Series with the ``'text'`` of the first ``n`` members in sorted order.
    """
    scored = [
        (calculate_average_distance_to_test([member], test_embeddings, train_embeddings), member)
        for member in cluster_indices
    ]
    # Sort on the distance only, so members with equal distance keep their input order.
    scored.sort(key=lambda pair: pair[0], reverse=not ascending)
    ordered_members = [member for _, member in scored]
    return train_data.iloc[ordered_members][:n]['text']

In [None]:
nearest_texts_sorted = get_texts_sorted_by_distance(nearest_indices, train_data, test_embeddings_norm, train_embeddings_norm, ascending=True, n=2)
farthest_texts_sorted = get_texts_sorted_by_distance(farthest_indices, train_data, test_embeddings_norm, train_embeddings_norm, ascending=False, n=2)

# Visualizando os textos ordenados
print("Textos mais próximos, ordenados por proximidade:")
print(nearest_texts_sorted.head())
print("\nTextos mais distantes, ordenados por proximidade:")
print(farthest_texts_sorted.head())

### Tamanho

In [None]:
def non_stopword_count(text, stopwords):
    """Count the whitespace-separated words of ``text`` whose lowercase form is not in ``stopwords``."""
    kept_words = [word for word in text.split() if word.lower() not in stopwords]
    return len(kept_words)

def word_count(text):
    """Return the number of whitespace-separated words in ``text``."""
    return len(text.split())

# NOTE: a later cell defines another function with this same name (one that
# takes a tokenizer); when the notebook runs top to bottom that later
# definition replaces this one.
def get_texts_sorted_by_distance_filtered(cluster_indices, train_data, test_embeddings, train_embeddings, ascending=True, n=2):
    """Like ``get_texts_sorted_by_distance``, restricted to texts of a fixed word count.

    Members are scored with ``calculate_average_distance_to_test``, ordered by
    that score, and only those whose text has exactly ``num_pal``
    (module-level) whitespace-separated words are kept.

    Args:
        cluster_indices: Row positions of the cluster members in ``train_data``.
        train_data: DataFrame with a ``'text'`` column, addressed positionally.
        test_embeddings: Test embeddings forwarded to the distance helper.
        train_embeddings: Training embeddings forwarded to the distance helper.
        ascending: Sort direction of the distances.
        n: Maximum number of texts to return.

    Returns:
        A Series with the ``'text'`` of the first ``n`` matching members.
    """
    scored = [
        (calculate_average_distance_to_test([member], test_embeddings, train_embeddings), member)
        for member in cluster_indices
    ]
    scored.sort(key=lambda pair: pair[0], reverse=not ascending)

    # Keep the sorted order, dropping texts whose word count differs from num_pal.
    matching = [
        member for _, member in scored
        if word_count(train_data.iloc[member]['text']) == num_pal
    ]

    return train_data.iloc[matching][:n]['text']

In [None]:
def token_count(text, tokenizer):
    """Return how many tokens ``tokenizer.tokenize`` produces for ``text``."""
    return len(tokenizer.tokenize(text))

def get_texts_sorted_by_distance_filtered(cluster_indices, train_data, test_embeddings, train_embeddings, tokenizer=None, ascending=True, n=2, num_tokens=50):
    """Return texts of a cluster with an exact length, ordered by distance to the test set.

    Only members whose text has exactly ``num_tokens`` units are considered.
    The survivors are scored with ``calculate_average_distance_to_test`` and
    ordered by that score.

    Args:
        cluster_indices: Row positions of the cluster members in ``train_data``.
        train_data: DataFrame with a ``'text'`` column, addressed positionally.
        test_embeddings: Test embeddings forwarded to the distance helper.
        train_embeddings: Training embeddings forwarded to the distance helper.
        tokenizer: Object with a ``tokenize(text)`` method used to measure text
            length. Optional, so that calls written without a tokenizer keep
            working; when omitted, length is the number of
            whitespace-separated words.
        ascending: Sort direction of the distances.
        n: Maximum number of texts to return.
        num_tokens: Exact length a text must have to be kept.

    Returns:
        A Series with the ``'text'`` of the first ``n`` matching members.
    """
    def _length(text):
        if tokenizer is None:
            return len(text.split())
        return token_count(text, tokenizer)

    # Filter first: measuring length is cheap, while every distance compares a
    # text against the whole test set. The relative order of the kept members
    # is the same as when sorting everything and filtering afterwards.
    matching = [idx for idx in cluster_indices
                if _length(train_data.iloc[idx]['text']) == num_tokens]

    distances = [calculate_average_distance_to_test([idx], test_embeddings, train_embeddings)
                 for idx in matching]
    sorted_indices = [x for _, x in sorted(zip(distances, matching), key=lambda pair: pair[0], reverse=not ascending)]

    return train_data.iloc[sorted_indices][:n]['text']

In [None]:
# The definition of get_texts_sorted_by_distance_filtered that is active after
# running the notebook top to bottom takes a tokenizer, so pass it explicitly
# together with the target length configured in num_pal.
nearest_texts_sorted_filtered = get_texts_sorted_by_distance_filtered(
    nearest_indices, train_data, test_embeddings_norm, train_embeddings_norm,
    tokenizer=tokenizer, ascending=True, n=4, num_tokens=num_pal,
)

print(nearest_texts_sorted_filtered.head())

In [None]:
# The definition of get_texts_sorted_by_distance_filtered that is active after
# running the notebook top to bottom takes a tokenizer, so pass it explicitly
# together with the target length configured in num_pal.
farthest_texts_sorted_filtered = get_texts_sorted_by_distance_filtered(
    farthest_indices, train_data, test_embeddings_norm, train_embeddings_norm,
    tokenizer=tokenizer, ascending=False, n=4, num_tokens=num_pal,
)

print(farthest_texts_sorted_filtered.head())