# Multimodal Model


In [None]:
import torch
from transformers import CLIPProcessor, CLIPModel
import requests
from io import BytesIO
import numpy as np
import hdbscan
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score
import json
from umap import UMAP
from PIL import Image

In [None]:
# NOTE(review): this flag is not read by any cell visible in this notebook.
# Presumably it was meant to choose between loading the saved embeddings and
# recomputing them -- confirm before relying on it.
use_embeddings_data = True

In [None]:
def _load_json(path):
    """Return the object decoded from the UTF-8 JSON file at *path*."""
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


# Each name is bound only after its file has been read successfully. No empty
# placeholder dict is created first, so a missing or malformed file fails
# here instead of surfacing later as KeyError: 'pages'.
json_data_fact_checking = _load_json("../image_model/json_data_fact_checking.json")
json_data_fast_check_1_fixed = _load_json("../image_model/json_data_fast_check_1_fixed.json")
json_data_fast_check_2_fixed = _load_json("../image_model/json_data_fast_check_2_fixed.json")

In [None]:
# Load the pretrained CLIP model and its matching preprocessor from a single
# checkpoint name so both always refer to the same weights.
clip_checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

In [None]:
def extract_embeddings(pages, batch_size=8, request_timeout=30):
    """Build joint CLIP image+text embeddings for every image in *pages*.

    Every image of a page is downloaded and run through the module-level
    ``processor`` and ``model`` together with the first 77 characters of the
    page text. The image embedding and the text embedding are concatenated
    per image.

    Args:
        pages: Iterable of dicts. Each has an ``"images"`` list whose items
            are dicts with the image URL under ``"image"``, and a ``"text"``
            string.
        batch_size: Number of images sent through the model per forward
            pass; tune to the available memory.
        request_timeout: Seconds to wait for each image download.

    Returns:
        A 2-D tensor with one row per image, in page order and then image
        order. Each row is the image embedding followed by the text
        embedding. Pages without images contribute no rows; if no page has
        images, a tensor with zero rows is returned.

    Raises:
        requests.RequestException: If an image cannot be downloaded. A
            failed image is never skipped silently, because rows are matched
            back to images by position elsewhere in this notebook.
    """
    image_embeds = []
    text_embeds = []

    for page in pages:
        if not page["images"]:
            continue

        images = []
        for image_info in page["images"]:
            # Without a timeout one stalled server blocks the whole run.
            response = requests.get(image_info["image"], timeout=request_timeout)
            # Fail on HTTP errors instead of trying to decode an error page.
            response.raise_for_status()
            images.append(Image.open(BytesIO(response.content)).convert("RGB"))

        # Coarse length limit on the page text; the same text is paired with
        # every image of the page.
        page_text = page["text"][:77]

        for start in range(0, len(images), batch_size):
            batch_images = images[start:start + batch_size]
            batch_texts = [page_text] * len(batch_images)

            with torch.no_grad():
                # 77 characters can still tokenize to more tokens than the
                # text encoder accepts, so the tokenizer also truncates.
                inputs = processor(
                    images=batch_images,
                    text=batch_texts,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                )
                outputs = model(**inputs)
            image_embeds.append(outputs.image_embeds)
            text_embeds.append(outputs.text_embeds)

    if not image_embeds:
        # torch.cat rejects an empty list; return a zero-row tensor whose
        # width matches the non-empty case (image + text projections).
        return torch.empty((0, 2 * model.config.projection_dim))

    all_image_embeds = torch.cat(image_embeds, dim=0)
    all_text_embeds = torch.cat(text_embeds, dim=0)
    return torch.cat((all_image_embeds, all_text_embeds), dim=1)

In [None]:
# Embed every image of the first Fast Check dump (one row per image).
# This downloads each image, so the cell can take a while.
embeddings_fast_check_1 = extract_embeddings(json_data_fast_check_1_fixed["pages"])

In [None]:
# Embed every image of the second Fast Check dump (one row per image).
embeddings_fast_check_2 = extract_embeddings(json_data_fast_check_2_fixed["pages"])

In [None]:
# Embed every image of the Fact Checking dump (one row per image).
embeddings_fact_checking = extract_embeddings(json_data_fact_checking["pages"])

In [None]:
# Stack the three per-source matrices row-wise. The order (fast_check_1,
# fast_check_2, fact_checking) must match the order in which
# image_relationships is built later, because rows are looked up by index.
multimodal_embeddings = torch.cat((embeddings_fast_check_1, embeddings_fast_check_2, embeddings_fact_checking), dim=0)

In [None]:
# Persist each per-source embedding tensor and the combined one so later
# runs can reload them instead of recomputing.
tensors_by_file = {
    'multimodal_embeddings_fast_check_1.pt': embeddings_fast_check_1,
    'multimodal_embeddings_fast_check_2.pt': embeddings_fast_check_2,
    'multimodal_embeddings_fact_checking.pt': embeddings_fact_checking,
    'multimodal_embeddings.pt': multimodal_embeddings,
}
for file_name, tensor in tensors_by_file.items():
    torch.save(tensor, file_name)

In [None]:
# Reload the combined embeddings from 'multimodal_embeddings.pt' (the file
# written by the save cell) so the extraction cells can be skipped.
multimodal_embeddings = torch.load('multimodal_embeddings.pt')

In [None]:
# Pages of the three datasets, in the same order in which their embeddings
# are concatenated (fast_check_1, fast_check_2, fact_checking).
all_pages = [
    *json_data_fast_check_1_fixed["pages"],
    *json_data_fast_check_2_fixed["pages"],
    *json_data_fact_checking["pages"],
]

# Number of images on each page, one entry per page.
relationships = [len(entry["images"]) for entry in all_pages]

# One record per image, pairing it with the page it came from.
image_relationships = [
    {"image": picture, "page": entry}
    for entry in all_pages
    for picture in entry["images"]
]

# Total number of images across all pages.
amount = sum(relationships)

# For every image, the index of its page within all_pages. Pages without
# images contribute no entries.
cluster_relationships = [
    page_index
    for page_index, image_count in enumerate(relationships)
    for _ in range(image_count)
]

In [None]:
# Project the concatenated image+text embeddings to 2-D so they can be
# clustered and plotted; the fixed random_state seeds UMAP.
umap_model = UMAP(n_components=2, random_state=42)
# .cpu() is a no-op for CPU tensors and keeps .numpy() working if the
# embeddings were produced or loaded on another device.
embeddings_2d = umap_model.fit_transform(multimodal_embeddings.detach().cpu().numpy())

# Cluster the 2-D projection. HDBSCAN labels points it cannot assign to any
# cluster as -1 (noise).
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=1, cluster_selection_epsilon=0.41)
cluster_labels = clusterer.fit_predict(embeddings_2d)

plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=cluster_labels, cmap='viridis', s=10)
plt.title('Results using text and image embeddings')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.colorbar(label='Cluster Label')
plt.show()

distances = pairwise_distances(embeddings_2d, metric='euclidean')
n_points = distances.shape[0]
# The diagonal holds each point's zero distance to itself. Averaging over the
# n * (n - 1) ordered pairs of distinct points keeps those zeros from
# dragging the mean down.
if n_points > 1:
    average_distance = distances.sum() / (n_points * (n_points - 1))
else:
    average_distance = float("nan")
# Smallest non-zero distance; undefined when every point coincides.
positive_distances = distances[distances > 0]
min_distance = np.min(positive_distances) if positive_distances.size else float("nan")
max_distance = np.max(distances)

print(f"Distancia promedio: {average_distance}")
print(f"Distancia mínima: {min_distance}")
print(f"Distancia máxima: {max_distance}")

# Score only the points that belong to a real cluster; otherwise the noise
# label -1 would be treated as one more cluster and distort the result.
clustered = cluster_labels != -1
n_clusters = len(set(cluster_labels[clustered].tolist()))
# silhouette_score needs at least 2 clusters and fewer clusters than samples.
if 2 <= n_clusters < int(clustered.sum()):
    score = silhouette_score(embeddings_2d[clustered], cluster_labels[clustered])
else:
    score = float("nan")
print(f"Silhouette Score: {score}")

In [None]:
# Cluster label to zoom in on.
hdbscan_cluster_to_print = 0

# Row indices (into embeddings_2d / image_relationships) of the points that
# HDBSCAN assigned to the chosen cluster, and their 2-D coordinates.
in_cluster = cluster_labels == hdbscan_cluster_to_print
facts_labels = [row for row, selected in enumerate(in_cluster) if selected]
facts_selected = [embeddings_2d[row].tolist() for row in facts_labels]

xs = [point[0] for point in facts_selected]
ys = [point[1] for point in facts_selected]

plt.scatter(xs, ys, s=10)
# Tag every point with its row index so it can be matched to the listing
# printed below.
for x, y, label in zip(xs, ys, facts_labels):
    plt.annotate(label, (x, y), textcoords="offset points", xytext=(0, 3), ha='center')
plt.title(f'Focusing on Cluster {hdbscan_cluster_to_print}')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

# For each plotted point, print the image URL and the link of its page.
for row in facts_labels:
    match = image_relationships[row]
    print(f"[{row}]   Imagen: {match['image']['image']}")
    print(f"[{row}]   Pagina: {match['page']['link']}")
    print("")