In [None]:
from hatespace.analysis.visualizations import softmax_kde_plot
from hatespace.analysis import IronmarchAnalysis

: 

In [3]:
import os
import json

import torch
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from hatespace.models import TransformerArchetypal, ArchetypalHead, Tokenizer

In [7]:
def load_model_from_experiment(
    experiment_name: str,
    checkpoint_path: str = "checkpoints/archetypal",
    load_last_model: bool = False,
) -> TransformerArchetypal:
    """Rebuild a trained archetypal model from an experiment directory.

    The experiment directory is ``checkpoint_path/experiment_name``. Its
    ``configuration.json`` supplies ``latent_dim_size`` for the archetypal
    head; the transformer itself is initialised from ``"roberta-base"``.

    Args:
        experiment_name: Name of the experiment sub-directory.
        checkpoint_path: Directory that contains the experiment directories.
        load_last_model: When true, weights are read from the ``"model"``
            entry of ``checkpoint.pt``; otherwise ``best_model.pt`` is loaded
            directly as a state dict.

    Returns:
        The model with weights restored (mapped to CPU) and switched to
        evaluation mode.
    """
    experiment_dir = os.path.join(checkpoint_path, experiment_name)
    with open(os.path.join(experiment_dir, "configuration.json"), "r") as config_file:
        configuration = json.load(config_file)

    # NOTE(review): 512 and 768 presumably mirror the tokenizer max length and
    # the roberta-base hidden size -- confirm against ArchetypalHead.
    head = ArchetypalHead(512, 768, configuration["latent_dim_size"])
    model = TransformerArchetypal.from_pretrained(
        "roberta-base",
        inner_embedder=head,
        tokenizer=Tokenizer("roberta-base", max_length=512),
    )

    # The two files differ in layout: the running checkpoint nests the weights
    # under "model", while the best-model file is the state dict itself.
    weights_file = "checkpoint.pt" if load_last_model else "best_model.pt"
    loaded = torch.load(os.path.join(experiment_dir, weights_file), map_location="cpu")
    state_dict = loaded["model"] if load_last_model else loaded
    model.load_state_dict(state_dict)

    model.eval()
    return model

In [30]:
# Restore the last saved checkpoint of the chosen experiment and move it to
# the second GPU; the checkpoint root is resolved relative to this notebook.
experiment_name = "patient-quail"
checkpoint_path = os.path.join(os.pardir, os.pardir, "checkpoints", "archetypal")
model = load_model_from_experiment(
    experiment_name,
    checkpoint_path=checkpoint_path,
    load_last_model=True,
)
model.to("cuda:1")
# Stand-alone tokenizer used to encode batches before they are fed to `model`.
tokenizer = Tokenizer("roberta-base", max_length=512)

In [28]:
from hatespace.datasets import IronMarch, DataLoader

# Load the corpus from disk and iterate it in its stored order (no shuffling).
dataset = IronMarch("../../data/iron_march")
dataloader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=False,
    num_workers=10,
)


Loading IronMarch dataset...
Formatting posts...


100%|██████████| 217757/217757 [00:28<00:00, 7667.60it/s] 


In [33]:
# Embed every post in the dataset and collect the vectors keyed by post id.
embeddings = {}
# Read the device from the model so this cell follows wherever the model was
# placed instead of repeating a hard-coded device string.
device = next(model.parameters()).device
# Inference only: without no_grad() every forward pass records an autograd
# graph that is immediately thrown away, costing GPU memory and time.
with torch.no_grad():
    for batch in tqdm(dataloader):
        tokens = tokenizer(batch['data'])
        tokens = {k: v.to(device) for k, v in tokens.items()}
        embedding = model(**tokens).embeddings.detach().cpu().numpy()
        # Named `post_id` so the builtin `id` is not shadowed in the kernel.
        for idx, post_id in enumerate(batch['id']):
            embeddings[post_id] = embedding[idx]


  0%|          | 0/27220 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

100%|██████████| 27220/27220 [36:21<00:00, 12.48it/s] 


In [34]:
import csv

# Persist the embeddings as a two-column CSV: the post id and the vector
# rendered as a Python-style list literal.
# newline="" is required by the csv module; without it the writer's own line
# endings get translated again and blank rows appear on some platforms.
with open("embeddings.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "embedding"])
    for post_id, embedding in embeddings.items():
        # tolist() yields plain Python floats. list(embedding) keeps NumPy
        # scalars, whose repr under NumPy 2 is "np.float32(...)", which would
        # make the column impossible to parse back as a list of numbers.
        writer.writerow([post_id, embedding.tolist()])