# Numeric Representation of Text


## Imports


In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import plotly.graph_objects as go
import pandas as pd
from sklearn.manifold import TSNE
import json
import umap.umap_ as umap
import hdbscan
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score
import matplotlib.patches as mpatches

## 1. Test with Real Data (181 data points)

### 1.1. Loading Data


### 1.2. Loading Fast Check Data


In [None]:
def _read_page_texts(path):
    """Parse the UTF-8 JSON file at `path`.

    Returns the parsed document together with the "text" value of every
    entry in its "pages" list.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        document = json.load(handle)
    return document, [page["text"] for page in document["pages"]]


# `json_data` is rebound by each call, so after this cell it holds the last
# document that was read.
json_data, fact_checking_text = _read_page_texts("../image_model/json_data_fact_checking.json")
json_data, text_1st_process = _read_page_texts("../image_model/json_data_fast_check_1_fixed.json")
json_data, text_2nd_process = _read_page_texts("../image_model/json_data_fast_check_2_fixed.json")

### 1.3. Modeling with BERT


In [None]:
# Pretrained Spanish BERT checkpoint used for both the tokenizer and the encoder.
model_name = "dccuchile/bert-base-spanish-wwm-uncased"
# Truncation is an encoding-time option and is set where the tokenizer is
# called (see get_embeddings); it is not a loading option, so it is not passed here.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
# Inference only: switch the module to evaluation mode.
model.eval()

def get_embeddings(texts, batch_size=32):
    """Encode texts with the module-level BERT model.

    Each text is tokenized (truncated to at most 512 tokens) and passed
    through `model` without gradient tracking. The hidden state at sequence
    position 0 of the last layer is returned for every text.

    Args:
        texts: A single string or a sequence of strings.
        batch_size: Number of texts per forward pass. Processing in batches
            keeps peak memory bounded instead of pushing the whole corpus
            through the model at once. Padding is applied per batch.

    Returns:
        A NumPy array with one row per input text. For empty input the
        array has zero rows and `model.config.hidden_size` columns.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # A bare string is one text, not a sequence of one-character texts.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    chunks = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        encodings = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**encodings)
        # .cpu() is a no-op on CPU and keeps .numpy() valid if the model is moved to a GPU.
        chunks.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    return np.concatenate(chunks, axis=0)

In [None]:
# Embed the texts read from json_data_fast_check_1_fixed.json (one row per text).
embeddings_1st = get_embeddings(text_1st_process)

In [None]:
# Embed the texts read from json_data_fast_check_2_fixed.json (one row per text).
# NOTE(review): the name is spelled `embeddings_2st` (not `2nd`) everywhere in
# this notebook; rename all uses together if it is ever corrected.
embeddings_2st = get_embeddings(text_2nd_process)

In [None]:
# Embed the texts read from json_data_fact_checking.json (one row per text).
embeddings_fact_checking = get_embeddings(fact_checking_text)

In [None]:
import os

# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing the three embedding files.
os.makedirs('../embeddings', exist_ok=True)

torch.save(embeddings_1st, '../embeddings/1st_text_embeddings.pt')
torch.save(embeddings_2st, '../embeddings/2nd_text_embeddings.pt')
torch.save(embeddings_fact_checking, '../embeddings/fact_checking_text_embeddings.pt')

In [None]:
# The files hold NumPy arrays pickled through torch.save, not tensors. Recent
# PyTorch releases default torch.load to weights_only=True, which refuses to
# unpickle such objects, so full unpickling is requested explicitly.
# SECURITY NOTE: weights_only=False runs the pickle machinery; only load files
# that this notebook wrote itself.
embeddings_1st = torch.load('../embeddings/1st_text_embeddings.pt', weights_only=False)
embeddings_2st = torch.load('../embeddings/2nd_text_embeddings.pt', weights_only=False)
embeddings_fact_checking = torch.load('../embeddings/fact_checking_text_embeddings.pt', weights_only=False)

# Stack the three sets in a fixed order (1st, 2nd, fact-checking); the row
# index in `embeddings` therefore identifies which set a text came from.
embeddings = np.concatenate([embeddings_1st, embeddings_2st, embeddings_fact_checking])

print(embeddings_1st.shape)
print(embeddings_2st.shape)
print(embeddings_fact_checking.shape)
print(embeddings.shape)

### 1.3.1 Modeling with BERT - t-SNE 2D Visualization


In [None]:
# Project the embeddings to 2D with t-SNE, then cluster the 2D points.
tsne = TSNE(n_components=2, perplexity=1, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)

clusterer = hdbscan.HDBSCAN(min_cluster_size=6, min_samples=3, cluster_selection_epsilon=10)
cluster_labels = clusterer.fit_predict(embeddings_2d)

plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=cluster_labels, cmap='viridis', s=10)
plt.title('Results using t-SNE and HDBSCAN')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.colorbar(label='Cluster Label')
plt.show()

distances = pairwise_distances(embeddings_2d, metric='euclidean')
# Average over the strict upper triangle only: every unordered pair is counted
# once and the zero self-distances on the diagonal do not drag the mean down.
average_distance = np.mean(distances[np.triu_indices_from(distances, k=1)])
# Zero entries (the diagonal, and any coincident points) are excluded.
min_distance = np.min(distances[distances > 0])
max_distance = np.max(distances)

print(f"Distancia promedio: {average_distance}")
print(f"Distancia mínima: {min_distance}")
print(f"Distancia máxima: {max_distance}")

# HDBSCAN marks noise points with the label -1. Passing them to
# silhouette_score would treat the noise as one more cluster, so they are
# dropped first. The score is only defined for 2 <= n_clusters < n_samples.
clustered = np.asarray(cluster_labels) != -1
n_clusters = len(set(np.asarray(cluster_labels)[clustered].tolist()))
if 2 <= n_clusters < int(clustered.sum()):
    score = silhouette_score(embeddings_2d[clustered], np.asarray(cluster_labels)[clustered])
    print(f"Silhouette Score: {score}")
else:
    score = float('nan')
    print("Silhouette Score: n/a (fewer than 2 clusters found)")

In [None]:
# Colour of each data source; used for both the points and the legend.
colors = {'fact_checking_data': 'red', 'fast_check_1st': 'blue', 'fast_check_2nd': 'green'}

# Cluster label to zoom in on.
cluster_to_print = -1

# Rows of `embeddings_2d` follow the order in which `embeddings` was
# concatenated: embeddings_1st, then embeddings_2st, then
# embeddings_fact_checking. The row index therefore identifies the source.
source_names = np.array(['fast_check_1st', 'fast_check_2nd', 'fact_checking_data'])
source_bounds = np.cumsum([len(embeddings_1st), len(embeddings_2st)])

selected_idx = np.flatnonzero(np.asarray(cluster_labels) == cluster_to_print)
selected_points = embeddings_2d[selected_idx]
facts_selected = selected_points.tolist()
facts_labels = selected_idx.tolist()

# side='right' puts an index equal to a boundary into the following source.
point_sources = source_names[np.searchsorted(source_bounds, selected_idx, side='right')]
point_colors = [colors[source] for source in point_sources]

# Colour every point by its source so that the legend below matches the plot.
plt.scatter(selected_points[:, 0], selected_points[:, 1], c=point_colors, s=10)
# Label each point with its row index in `embeddings`.
for (x, y), etiqueta in zip(facts_selected, facts_labels):
    plt.annotate(etiqueta, (x, y), textcoords="offset points", xytext=(0, 3), ha='center')

legend_labels = [mpatches.Patch(color=color, label=label) for label, color in colors.items()]
plt.legend(handles=legend_labels, title="Sites", fontsize=8)
plt.title(f'Focusing on Cluster {cluster_to_print}')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

### 1.3.2 Modeling with BERT - UMAP 2D Visualization


In [None]:
# Project the embeddings to 2D with UMAP, then cluster the 2D points.
umap_model = umap.UMAP(n_components=2, random_state=10, n_neighbors=3, min_dist=0.0, n_jobs=1, init='random')
embeddings_2d = umap_model.fit_transform(embeddings)

clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=3, cluster_selection_epsilon=0.5)
cluster_labels = clusterer.fit_predict(embeddings_2d)

# Visualize the results.
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=cluster_labels, s=10)
plt.title('Results using UMAP and HDBSCAN')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.colorbar(label='Cluster Label')
plt.show()

distances = pairwise_distances(embeddings_2d, metric='euclidean')
# Average distance between points in the 2D space. Only the strict upper
# triangle is used: every unordered pair is counted once and the zero
# self-distances on the diagonal do not drag the mean down.
average_distance = np.mean(distances[np.triu_indices_from(distances, k=1)])
# Minimum distance between points; zero entries (the diagonal, and any
# coincident points) are excluded.
min_distance = np.min(distances[distances > 0])
# Maximum distance between points.
max_distance = np.max(distances)
print(f"Distancia promedio: {average_distance}")
print(f"Distancia mínima: {min_distance}")
print(f"Distancia máxima: {max_distance}")

# Evaluate clustering quality. HDBSCAN marks noise points with the label -1.
# Passing them to silhouette_score would treat the noise as one more cluster,
# so they are dropped first. The score is only defined for
# 2 <= n_clusters < n_samples.
clustered = np.asarray(cluster_labels) != -1
n_clusters = len(set(np.asarray(cluster_labels)[clustered].tolist()))
if 2 <= n_clusters < int(clustered.sum()):
    score = silhouette_score(embeddings_2d[clustered], np.asarray(cluster_labels)[clustered])
    print(f"Silhouette Score: {score}")
else:
    score = float('nan')
    print("Silhouette Score: n/a (fewer than 2 clusters found)")

In [None]:

# Colour of each data source; used for both the points and the legend.
colors = {'fact_checking_data': 'red', 'fast_check_1st': 'blue', 'fast_check_2nd': 'green'}

# Cluster label to zoom in on.
cluster_to_print = 1

# Rows of `embeddings_2d` follow the order in which `embeddings` was
# concatenated: embeddings_1st, then embeddings_2st, then
# embeddings_fact_checking. The row index therefore identifies the source.
source_names = np.array(['fast_check_1st', 'fast_check_2nd', 'fact_checking_data'])
source_bounds = np.cumsum([len(embeddings_1st), len(embeddings_2st)])

selected_idx = np.flatnonzero(np.asarray(cluster_labels) == cluster_to_print)
selected_points = embeddings_2d[selected_idx]
facts_selected = selected_points.tolist()
facts_labels = selected_idx.tolist()

# side='right' puts an index equal to a boundary into the following source.
point_sources = source_names[np.searchsorted(source_bounds, selected_idx, side='right')]
point_colors = [colors[source] for source in point_sources]

# Colour every point by its source so that the legend below matches the plot.
plt.scatter(selected_points[:, 0], selected_points[:, 1], c=point_colors, s=10)

# Label each point with its row index in `embeddings`.
for (x, y), etiqueta in zip(facts_selected, facts_labels):
    plt.annotate(etiqueta, (x, y), textcoords="offset points", xytext=(0, 3), ha='center')

legend_labels = [mpatches.Patch(color=color, label=label) for label, color in colors.items()]
plt.legend(handles=legend_labels, title="Sites", fontsize=8)

plt.title(f'Focusing on Cluster {cluster_to_print}')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()