# Multimodal Model


In [2]:
import torch
from transformers import CLIPProcessor, CLIPModel
import requests
from io import BytesIO
import numpy as np
import hdbscan
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score
import json
from umap import UMAP
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import torch
import matplotlib.ticker as ticker
import pandas as pd 
import embedding_data

In [29]:
def _read_json(path):
    """Return the parsed content of the UTF-8 encoded JSON file at *path*."""
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


# One parsed document per scraped source; each one exposes its pages under "pages".
json_data_fact_checking_fixed = _read_json("../../data/json_data_fact_checking_fixed.json")
json_data_fast_check_1_fixed = _read_json("../../data/json_data_fast_check_1_fixed.json")
json_data_fast_check_2_fixed = _read_json("../../data/json_data_fast_check_2_fixed.json")
json_data_biobiochile_fixed = _read_json("../../data/json_data_biobiochile_fixed.json")

# row_data gathers the pages of every source, in the loading order above.
row_data = {"pages": []}
for dataset in (
    json_data_fact_checking_fixed,
    json_data_fast_check_1_fixed,
    json_data_fast_check_2_fixed,
    json_data_biobiochile_fixed,
):
    row_data["pages"] += dataset["pages"]

In [5]:
# The model weights and the matching pre-processing pipeline are loaded
# from the same pretrained checkpoint.
clip_checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

In [6]:
def extract_embeddings(pages, batch_size=8, request_timeout=30):
    """Build one joint CLIP embedding per image found in *pages*.

    Every page is a mapping with an ``"images"`` sequence (each item holds the
    image URL under ``"image"``) and a ``"text"`` string. Pages without images
    are skipped. Each image is paired with the first 77 characters of its
    page text and both are encoded with the module-level ``model`` and
    ``processor``.

    Args:
        pages: Iterable of page mappings as described above.
        batch_size: Number of images sent through the model at once.
        request_timeout: Seconds to wait for each image download.

    Returns:
        A 2-D tensor with one row per image; each row is the image embedding
        followed by the text embedding of the page it belongs to. When no
        page has images the tensor has zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        requests.HTTPError: If an image URL answers with an error status.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    image_embeds = []
    text_embeds = []

    for page in pages:
        if not page["images"]:
            continue

        images = []
        for image_info in page["images"]:
            # A timeout keeps one unresponsive host from hanging the whole run,
            # and an HTTP error is reported as such instead of surfacing later
            # as an image-decoding failure.
            response = requests.get(image_info["image"], timeout=request_timeout)
            response.raise_for_status()
            images.append(Image.open(BytesIO(response.content)).convert("RGB"))

        page_text = page["text"][:77]
        for start in range(0, len(images), batch_size):
            batch_images = images[start:start + batch_size]
            # The same page text accompanies every image of the page.
            batch_texts = [page_text] * len(batch_images)

            with torch.no_grad():
                # Slicing characters does not bound the token count, so the
                # tokenizer is also asked to truncate to the model limit.
                inputs = processor(
                    images=batch_images,
                    text=batch_texts,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                )
                outputs = model(**inputs)
            image_embeds.append(outputs.image_embeds)
            text_embeds.append(outputs.text_embeds)

    if not image_embeds:
        # Nothing to embed: keep the column count of a regular result so the
        # output can still be concatenated with other datasets.
        return torch.empty((0, 2 * model.config.projection_dim))

    image_embeds = torch.cat(image_embeds, dim=0)
    text_embeds = torch.cat(text_embeds, dim=0)
    return torch.cat((image_embeds, text_embeds), dim=1)

In [7]:
# Download the images of the fast_check_1 pages and encode every image/text pair with CLIP.
embeddings_fast_check_1 = extract_embeddings(json_data_fast_check_1_fixed["pages"])

In [8]:
# Download the images of the fast_check_2 pages and encode every image/text pair with CLIP.
embeddings_fast_check_2 = extract_embeddings(json_data_fast_check_2_fixed["pages"])

In [9]:
# Download the images of the fact_checking pages and encode every image/text pair with CLIP.
embeddings_fact_checking = extract_embeddings(json_data_fact_checking_fixed["pages"])

In [10]:
# Download the images of the biobiochile pages and encode every image/text pair with CLIP.
embeddings_biobiochile = extract_embeddings(json_data_biobiochile_fixed["pages"])

In [11]:
# Write each dataset's embedding tensor to its own file so the extraction
# step does not have to be repeated.
for dataset_embeddings, output_path in (
    (embeddings_fast_check_1, 'multimodal_embeddings_fast_check_1.pt'),
    (embeddings_fast_check_2, 'multimodal_embeddings_fast_check_2.pt'),
    (embeddings_fact_checking, 'multimodal_embeddings_fact_checking.pt'),
    (embeddings_biobiochile, 'multimodal_embeddings_biobiochile.pt'),
):
    torch.save(dataset_embeddings, output_path)

In [None]:
# Read the per-dataset embedding tensors back from disk.
(
    embeddings_fast_check_1,
    embeddings_fast_check_2,
    embeddings_fact_checking,
    embeddings_biobiochile,
) = [
    torch.load(saved_path)
    for saved_path in (
        "multimodal_embeddings_fast_check_1.pt",
        "multimodal_embeddings_fast_check_2.pt",
        "multimodal_embeddings_fact_checking.pt",
        "multimodal_embeddings_biobiochile.pt",
    )
]

In [13]:
# Stack the datasets row-wise; this order is the one the page bookkeeping
# (all_pages) relies on further down.
per_dataset_embeddings = [
    embeddings_fast_check_1,
    embeddings_fast_check_2,
    embeddings_fact_checking,
    embeddings_biobiochile,
]
multimodal_embeddings = torch.cat(per_dataset_embeddings, dim=0)
torch.save(multimodal_embeddings, 'multimodal_embeddings.pt')

In [None]:
# Reload the combined embedding tensor written by the previous cell instead of recomputing it.
multimodal_embeddings = torch.load('multimodal_embeddings.pt')

In [57]:
# Pages are listed in the same dataset order used to concatenate the embeddings.
all_pages = (
    json_data_fast_check_1_fixed["pages"]
    + json_data_fast_check_2_fixed["pages"]
    + json_data_fact_checking_fixed["pages"]
    + json_data_biobiochile_fixed["pages"]
)

# relationships[p] is the number of images of page p.
relationships = [len(page["images"]) for page in all_pages]

# One entry per image, remembering the page it was taken from.
image_relationships = [
    {"image": image, "page": page}
    for page in all_pages
    for image in page["images"]
]

# Total number of images across all pages.
amount = sum(relationships)

# cluster_relationships[k] is the index of the page that owns the k-th image.
cluster_relationships = [
    page_position
    for page_position, image_count in enumerate(relationships)
    for _ in range(image_count)
]

# UMAP and HDBSCAN Representation


In [None]:
# Grid search used to choose the UMAP random_state and the HDBSCAN hyperparameters (kept commented out; the best result is noted on its last line)
# cluster_selection_epsilon_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.5, 2, 2.5, 3]
# min_cluster_size_values = [2, 3, 4, 5, 6, 7, 8, 9, 10]
# min_samples_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# rondom_states = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
# silhouette_scores = []
# for random_state in rondom_states:
#     umap_model= UMAP(n_components=2, random_state=random_state)
#     embeddings_2d = umap_model.fit_transform(multimodal_embeddings.detach().numpy())
#     for epsilon in cluster_selection_epsilon_values:
#         for min_cluster_size in min_cluster_size_values:
#             for min_samples in min_samples_values:
#                 clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples, cluster_selection_epsilon=epsilon)
#                 cluster_labels = clusterer.fit_predict(embeddings_2d)
#                 score = silhouette_score(embeddings_2d, cluster_labels)
#                 silhouette_scores.append({"score": score, "epsilon": epsilon, "min_cluster_size": min_cluster_size, "min_samples": min_samples, "random_state": random_state})
# silhouette_scores = pd.DataFrame(silhouette_scores)
# silhouette_scores = silhouette_scores.sort_values(by='score', ascending=False)
# print(silhouette_scores.head(1)) # Best silhouette score: 0.849641, epsilon: 1.5  , min_cluster_size: 9 , min_samples: 5, random_state: 20

In [None]:
# Project the embeddings onto two dimensions with a fixed seed.
umap_model = UMAP(n_components=2, random_state=20)
embeddings_2d = umap_model.fit_transform(multimodal_embeddings.detach().numpy())

# Cluster the 2-D projection with the hyperparameters noted in the grid-search cell.
hdbscan_params = {
    "min_cluster_size": 9,
    "min_samples": 5,
    "cluster_selection_epsilon": 1.5,
}
clusterer = hdbscan.HDBSCAN(**hdbscan_params)
cluster_labels = clusterer.fit_predict(embeddings_2d)

# Scatter plot of the projection, coloured by HDBSCAN label.
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], s=10, c=cluster_labels, cmap='viridis')
plt.title('Results using Embeddings from CLIP')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
colorbar = plt.colorbar(label='Cluster Label')
# One integer tick per label value between the smallest and the largest one.
lowest_label = int(min(cluster_labels))
highest_label = int(max(cluster_labels))
colorbar.set_ticks(range(lowest_label, highest_label + 1))
colorbar.ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: int(x)))
plt.show()

# Summary of the pairwise Euclidean distances between projected points;
# the minimum ignores zero distances.
distances = pairwise_distances(embeddings_2d, metric='euclidean')
average_distance = distances.mean()
min_distance = distances[distances > 0].min()
max_distance = distances.max()

print(f"Distancia promedio: {average_distance}")
print(f"Distancia mínima: {min_distance}")
print(f"Distancia máxima: {max_distance}")

score = silhouette_score(embeddings_2d, cluster_labels)
print(f"Silhouette Score: {score}")

# Wrap every image embedding together with its page and its HDBSCAN label.
# embeddings_2d and cluster_labels hold one row per image, while
# relationships holds one image count per page, so a running position is
# needed to walk the per-image arrays. The page is read from all_pages,
# the list relationships was built from (row_data is ordered differently).
pages_embeddings_info = []
embedding_position = 0
for page_position, image_count in enumerate(relationships):
    for _ in range(image_count):
        pages_embeddings_info.append(
            embedding_data.emb_data(
                embeddings_2d[embedding_position],
                page_position,  # same second argument as before: the page index
                all_pages[page_position],
                cluster_labels[embedding_position],
            )
        )
        embedding_position += 1

In [None]:
# Same projection, this time coloured by the index of the page that owns each image.
projected_x = embeddings_2d[:, 0]
projected_y = embeddings_2d[:, 1]
plt.scatter(projected_x, projected_y, s=5, cmap='viridis', c=cluster_relationships)
plt.title('Images Associated with Each Text')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.colorbar(label='Cluster Label')
plt.show()

In [None]:
# Show only the embeddings that HDBSCAN assigned to one cluster.
cluster_to_print = 0
facts_labels = []
embeddings_selected = [
    info for info in pages_embeddings_info
    if info.get_cluster() == cluster_to_print
]

selected_x = [fact.get_embedding()[0] for fact in embeddings_selected]
selected_y = [fact.get_embedding()[1] for fact in embeddings_selected]
plt.scatter(selected_x, selected_y, s=10)

plt.title(f'Cluster {cluster_to_print} | Hybrid Embeddings')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

# List the label and the page link behind every selected point.
for emb in embeddings_selected:
    print(f"{emb.get_label()} - {emb.get_page()['link']}")

In [None]:
# Groups the images listed in `facts_labels` by the page they belong to, plots
# the selected points coloured by that grouping and prints image/page details.
# NOTE(review): `facts_labels` and `embeddings_selected` come from earlier
# cells. The cell directly above leaves `facts_labels` empty and fills
# `embeddings_selected` with `emb_data` objects, while this cell indexes each
# entry with [0]/[1] — confirm which cell is meant to run before this one.
pages_cluster = [] 
relation_cluster = []

# Give every distinct page a consecutive group number, in order of first appearance.
fact_index = 0
for img_index in facts_labels:
    if image_relationships[img_index]['page'] not in pages_cluster:
        pages_cluster.append(image_relationships[img_index]['page'])
        relation_cluster.append(fact_index)
        fact_index += 1
    else:
        # NOTE(review): relation_cluster holds one entry per image but is indexed
        # here with a position in pages_cluster (one entry per page) — verify.
        relation_cluster.append(relation_cluster[pages_cluster.index(image_relationships[img_index]['page'])])

x_fact_selected = [page[0] for page in embeddings_selected]
y_fact_selected = [page[1] for page in embeddings_selected]

# Only the points at positions 0 and 1 of embeddings_2d get a text annotation.
for x, y, label in zip(embeddings_2d[:, 0], embeddings_2d[:, 1], range(len(cluster_relationships))):
    if label == 1 or label == 0:
        plt.annotate(label, (x, y), textcoords="offset points", xytext=(0, 3), ha='center')
plt.scatter(x_fact_selected, y_fact_selected, c=relation_cluster, cmap='viridis', s=10)
plt.title('Images Associated with Each Text')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.colorbar(label='Cluster Label')
plt.show()

# Print the image URL and page link of every selected image; page_index is
# incremented each time the owning page differs from the previous entry's page.
prev = -1
page_index = 0
for aux, index in enumerate(facts_labels):
    if prev == -1:
        prev = cluster_relationships[index]
    if prev != cluster_relationships[index]:
        prev = cluster_relationships[index]
        page_index += 1
    print(f"[{index}]   Imagen: {image_relationships[index]['image']['image']}")
    print(f"[{index}]   Pagina: {image_relationships[index]['page']['link']}")
    print(f"[{index}]   indice de página: {cluster_relationships[index]}")
    print(f"[{index}]   indice: {page_index}")
    print("")

In [None]:
# Keeps the entries of `embeddings_selected` whose value in `clusters` equals
# `hdbscan_cluster_to_print`, plots them and prints their image/page details.
# NOTE(review): `clusters` is not defined in any visible cell — confirm where
# it comes from before running this cell.
hdbscan_cluster_to_print = 0
second_embeddings_selected = []
second_facts_labels = []

for index, page in enumerate(embeddings_selected):
    if clusters[index] == hdbscan_cluster_to_print:
        second_embeddings_selected.append(embeddings_selected[index])
        second_facts_labels.append(facts_labels[index])

# Entries are indexed with [0]/[1] here — presumably (x, y) pairs; verify
# against the cell that filled `embeddings_selected`.
x_fact_selected = [page[0] for page in second_embeddings_selected]
y_fact_selected = [page[1] for page in second_embeddings_selected]

# Every point of the full projection is annotated with its position.
for x, y, label in zip(embeddings_2d[:, 0], embeddings_2d[:, 1], range(len(cluster_relationships))):
        plt.annotate(label, (x, y), textcoords="offset points", xytext=(0, 3), ha='center')

plt.scatter(x_fact_selected, y_fact_selected, s=10)
plt.title('Clusters using specific text and image embeddings')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.colorbar(label='Cluster Label')
plt.show()

# Print the image URL, page link and page index of every retained image.
for aux, index in enumerate(second_facts_labels):
    print(f"[{index}]   Imagen: {image_relationships[index]['image']['image']}")
    print(f"[{index}]   Pagina: {image_relationships[index]['page']['link']}")
    print(f"[{index}]   indice de página: {cluster_relationships[index]}")
    print("")

In [None]:
# A news item with several images has one embedding per image, so the same
# item may end up in more than one cluster.
fact_index = 53

print(cluster_relationships)
print(cluster_labels.tolist())
if fact_index not in cluster_relationships:
    print("No se encontró la noticia en los clusters")
    exit()

# Positions of the embeddings that belong to the requested news item.
matching_positions = [
    position
    for position, owner in enumerate(cluster_relationships)
    if owner == fact_index
]
for position in matching_positions:
    print(position)
# Distinct HDBSCAN labels assigned to those embeddings.
hdbscan_clusters_to_print = list(set(cluster_labels[position] for position in matching_positions))

if not hdbscan_clusters_to_print:
    print("No se encontraron clusters asociados a la noticia")
    exit()

print("clusters de HDBSCAN asociados a la noticia: ")
for cluster in hdbscan_clusters_to_print:
    print(f"[{cluster}]")

# Only the first of those clusters is inspected below.
focus_cluster = hdbscan_clusters_to_print[0]
facts_labels = [
    position
    for position in range(len(embeddings_2d))
    if cluster_labels[position] == focus_cluster
]
facts_selected = [embeddings_2d[position].tolist() for position in facts_labels]
text_cluster = [cluster_relationships[position] for position in facts_labels]

focus_x = [fact[0] for fact in facts_selected]
focus_y = [fact[1] for fact in facts_selected]
plt.scatter(focus_x, focus_y, c=text_cluster, cmap='viridis', s=10)
# Annotate every point with its position in the embedding matrix.
for x, y, label in zip(focus_x, focus_y, facts_labels):
    plt.annotate(label, (x, y), textcoords="offset points", xytext=(0, 3), ha='center')
plt.title(f'Focusing on Cluster {focus_cluster}')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.colorbar(label='Cluster Label')
plt.show()


json_data_combined = json_data_fast_check_1_fixed["pages"] + json_data_fast_check_2_fixed["pages"]
# Image URL, page link and page index of every embedding in the inspected cluster.
for index in facts_labels:
    relation = image_relationships[index]
    print(f"[{index}]   Imagen: {relation['image']['image']}")
    print(f"[{index}]   Pagina: {relation['page']['link']}")
    print(f"[{index}]   indice de página: {cluster_relationships[index]}")
    print("")

In [None]:
# Highlight, on the 2-D projection, the embeddings whose page link is listed
# in `links`.
# NOTE(review): `links` is not defined in any visible cell — it has to be
# set before this cell is run.
facts_selected = []
facts_labels = []
color_index = []
embeddings_to_show_by_links = []
for emb in pages_embeddings_info:
    embeddings_to_show_by_links.append(emb)
    is_listed = emb.get_page()["link"] in links
    if is_listed:
        facts_labels.append(emb.get_label())
    color_index.append(2 if is_listed else 0)

colors = ['#e2a6a6', '#3e63ef', '#36fa32']

# Draw the background points (group 0) and the highlighted ones (group 1)
# with one scatter call per group instead of one call per point. A point is
# highlighted when its label matches the label of a listed page.
for c_index in (0, 1):
    group_points = [
        emb.get_embedding()
        for emb in embeddings_to_show_by_links
        if (emb.get_label() in facts_labels) == bool(c_index)
    ]
    if group_points:
        plt.scatter(
            [point[0] for point in group_points],
            [point[1] for point in group_points],
            s=10,
            c=colors[c_index],
            zorder=c_index,
        )

# The embeddings plotted in this notebook come from CLIP, not from RoBERTa.
plt.title('CLIP | Focusing on Specific News')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

for emb in embeddings_to_show_by_links:
    print(f"{emb.get_label()} - {emb.get_page()['link']}")

In [None]:
# Highlight a hand-picked set of positions on the 2-D projection.
# NOTE(review): the values below are compared against row positions of
# embeddings_2d; the repeated numbers look like page indices — confirm
# which kind of index is intended.
fact_index = [25, 25, 25, 86, 86, 94, 94, 101, 110, 130, 130]

facts_selected = []
facts_labels = list(range(len(embeddings_2d)))
highlighted_positions = set(fact_index)
# 2 marks a highlighted point, 0 a background point.
color_index = [2 if index in highlighted_positions else 0 for index in facts_labels]

colors = ['#e2a6a6', '#3e63ef', '#36fa32']

# One scatter call per colour group instead of one call per point.
color_codes = np.asarray(color_index)
for code in (0, 2):
    in_group = color_codes == code
    if in_group.any():
        plt.scatter(
            embeddings_2d[in_group, 0],
            embeddings_2d[in_group, 1],
            s=10,
            c=colors[code],
            zorder=code,
        )

# The embeddings plotted in this notebook come from CLIP, not from RoBERTa.
plt.title('CLIP | Focusing on Specific News')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

# NOTE(review): facts_labels holds every embedding position, so this prints
# the link of each page whose index is below len(embeddings_2d), not only
# the highlighted ones — confirm that this is intended.
known_positions = set(facts_labels)
for index, page in enumerate(row_data["pages"]):
    if index in known_positions:
        print(f"[{index}] link: {page['link']}")