In [1]:
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, BertTokenizer, BertModel
import torch



In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

import argparse
import json
import logging
import os
import random
import re
from turtle import distance
import warnings
from adjustText import adjust_text
from typing import Tuple, Iterator, List, Dict
from tqdm.notebook import tqdm
import torch.nn.functional as F

from os.path import join as pjoin
from collections import defaultdict
from scipy.stats import linregress
from torch.optim import Adam, AdamW

os.environ['PYTHONIOENCODING']='UTF-8'
os.environ['CUDA_LAUNCH_BLOCKING']=str(1)

## Notes
- fasttext installation does not work, neither with conda nor with pip
- GloVe and Word2Vec can be installed, but several things words not part of the corpus

In [3]:
# Enable automatic reloading of modules before executing code
%load_ext autoreload
%autoreload 2


# Now you can use wildcard import
import plotting as pl
from models import model as md
import utils as ut

In [20]:
# create logger
logger = logging.getLogger('ooo-modernbert')
logger.setLevel(logging.INFO)

# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)

# create formatter
formatter = logging.Formatter(
    fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%Y/%m/%d %H:%M:%S'
)

# add formatter to ch
ch.setFormatter(formatter)

# add ch to logger
logger.addHandler(ch)

In [6]:
model_id = "answerdotai/ModernBERT-base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).to(device)

In [11]:
tokenized_input1 = tokenizer.encode("woman", return_tensors="pt").to(device)
output1 = model(tokenized_input1)
e1 = output1.last_hidden_state[0]
e1 = torch.mean(e1, axis=0)

In [12]:
tokenized_input2 = tokenizer.encode("man", return_tensors="pt").to(device)
output2 = model(tokenized_input2)
e2 = output2.last_hidden_state[0]
e2 = torch.mean(e2, axis=0)

In [13]:
tokenized_input3 = tokenizer.encode("headphones", return_tensors="pt").to(device)
output3 = model(tokenized_input3)
e3 = output3.last_hidden_state[0]
e3 = torch.mean(e3, axis=0)

In [14]:
torch.sum(e1*e2)

tensor(624.7878, grad_fn=<SumBackward0>)

In [15]:
torch.sum(e1*e3)

tensor(531.2306, grad_fn=<SumBackward0>)

In [16]:
torch.sum(e2*e3)

tensor(556.2291, grad_fn=<SumBackward0>)

In [21]:
tbl_labels = pd.read_csv("data/unique_id.txt", delimiter="\\", header=None)
tbl_labels["label_id"] = np.arange(1, tbl_labels.shape[0]+1)
tbl_labels.columns = ["label", "label_id"]
new_order = ["label_id", "label"]
tbl_labels = tbl_labels[new_order]

In [23]:
l_embeddings = ut.load_avg_embeddings("Word2Vec", device)

In [13]:
l_embeddings = []
for prompt in tbl_labels["label"]:
    tokenized_input = tokenizer.encode(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model(tokenized_input)
    embedding = output.last_hidden_state[0]
    emb_flat = torch.mean(embedding, axis=0).detach().numpy()
    l_embeddings.append(emb_flat)

In [7]:
tokenized_input1 = tokenizer.encode("man", return_tensors="pt").to(device)
output1 = model(tokenized_input1)
e1 = output1.last_hidden_state[:, 0, :]

In [8]:
tokenized_input2 = tokenizer.encode("woman", return_tensors="pt").to(device)
output2 = model(tokenized_input2)
e2 = output2.last_hidden_state[:, 0, :]

In [9]:
tokenized_input3 = tokenizer.encode("forest", return_tensors="pt").to(device)
output3 = model(tokenized_input3)
e3 = output3.last_hidden_state[:, 0, :]

In [10]:
ut.compute_similarities(e1, e2, e3, method="odd_one_out")

(tensor([540.5472], grad_fn=<SumBackward1>),
 tensor([1202.3362], grad_fn=<SumBackward1>),
 tensor([483.3616], grad_fn=<SumBackward1>))

# Setup Modeling

In [25]:
task = "odd_one_out"
modality = "behavioral"
triplets_dir = "./data/"
lr = learning_rate = 0.001
lmbda = 0.0005
temperature = 1
num_threads = 6
device = "cpu"
batch_size = 50
sampling_method = "normal"
rnd_seed = 42
p = None
results_dir = './results/'
plots_dir = './plots/'
epochs = 10
distance_metric = "dot"

In [38]:
#load triplets into memory
train_triplets, test_triplets = ut.load_data_ID(device=device, triplets_dir=triplets_dir, testcase = False)
n_items = ut.get_nitems(train_triplets)


...Could not find any .npy files for current modality.
...Now searching for .txt files.



In [39]:
n_participants = len(np.unique(train_triplets.numpy()[:,3]))
array_embeddings = np.array(l_embeddings)
embed_dim = array_embeddings.shape[1]
tensor_avg_reps = torch.Tensor(array_embeddings)

In [40]:
embed_dim

300

In [41]:
array_embeddings.shape

(1854, 300)

In [42]:
#load train and test mini-batches
train_batches, val_batches = ut.load_batches_ID(
    train_triplets=train_triplets,
    test_triplets=test_triplets,
    average_reps=tensor_avg_reps,
    n_items=n_items,
    batch_size=batch_size,
    sampling_method=sampling_method,
    rnd_seed=rnd_seed,
    p=p,
    method="embedding",
    within_subjects=True
)

In [43]:
epochs = 2
lmbda = .01
lr = 0.01

In [44]:
temperature = torch.tensor(temperature).clone().detach()
model_weight = md.Weighted_Embedding(
    embed_size=embed_dim,
    num_participants=n_participants,
    init_weights=True
)
model_weight.to(device)
optim = Adam(model_weight.parameters(), lr=lr)

  temperature = torch.tensor(temperature).clone().detach()


In [45]:
print(f'...Creating PATHs')
print()
if results_dir == './results/':
    results_dir = os.path.join(results_dir, modality, str(lmbda), f'seed{rnd_seed:02d}')
if not os.path.exists(results_dir):
    os.makedirs(results_dir)

if plots_dir == './plots/':
    plots_dir = os.path.join(plots_dir, modality, str(lmbda), f'seed{rnd_seed}')
if not os.path.exists(plots_dir):
    os.makedirs(plots_dir)

model_dir = os.path.join(results_dir, 'model')

...Creating PATHs



In [46]:
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
start = 0
train_accs, val_accs = [], []
train_losses, val_losses = [], []
loglikelihoods = []
nneg_d_over_time = []

iter = 0
results = {}
logger.info(f'Optimization started for lambda: {lmbda}\n')

print(f'Optimization started for lambda: {lmbda}\n')
for epoch in tqdm(range(start, epochs)):
    model_weight.train()
    batch_llikelihoods = torch.zeros(len(train_batches))
    batch_closses = torch.zeros(len(train_batches))
    batch_losses_train = torch.zeros(len(train_batches))
    batch_accs_train = torch.zeros(len(train_batches))
    for i, batch in enumerate(train_batches):
        optim.zero_grad() #zero out gradients
        d = batch[0].to(device)
        ids = batch[1].to(device)
        logits = model_weight(d, ids)
        anchor, positive, negative = torch.unbind(torch.reshape(logits, (-1, 3, embed_dim)), dim=1)
        tri_loss = ut.trinomial_loss(anchor, positive, negative, task, temperature, distance_metric)
        l1_pen_ID = md.l1_regularization(model_weight, "individual_slopes.weight", "few").to(device) #L1-norm to enforce sparsity (many 0s)
        complexity_loss_ID = (lmbda/n_participants) * l1_pen_ID
        loss = tri_loss + complexity_loss_ID
        
        loss.backward()
        optim.step()
        
        batch_losses_train[i] += loss.item()
        batch_llikelihoods[i] += loss.item()
        batch_accs_train[i] += ut.choice_accuracy(anchor, positive, negative, task, distance_metric)
        iter += 1

    avg_llikelihood = torch.mean(batch_llikelihoods).item()
    avg_train_loss = torch.mean(batch_losses_train).item()
    avg_train_acc = torch.mean(batch_accs_train).item()
    
    loglikelihoods.append(avg_llikelihood)
    train_losses.append(avg_train_loss)
    train_accs.append(avg_train_acc)

2025/02/03 16:40:58 - ooo-modernbert - INFO - Optimization started for lambda: 0.01

2025/02/03 16:40:58 - ooo-modernbert - INFO - Optimization started for lambda: 0.01



Optimization started for lambda: 0.01



  0%|          | 0/2 [00:00<?, ?it/s]

In [47]:
train_accs

[0.4155126214027405, 0.412482351064682]

In [48]:
train_losses

[1.1096850633621216, 1.1079800128936768]

In [49]:
ut.validation(model_weight, val_batches, task, device, level_explanation="ID")

(1.090592622756958, 0.4242890775203705)

In [30]:
avg_llikelihood = torch.mean(batch_llikelihoods).item()
avg_train_loss = torch.mean(batch_losses_train).item()
avg_train_acc = torch.mean(batch_accs_train).item()

loglikelihoods.append(avg_llikelihood)
train_losses.append(avg_train_loss)
train_accs.append(avg_train_acc)

################################################
################ validation ####################
################################################

avg_val_loss, avg_val_acc = ut.validation(
    model_weight, val_batches, task, device, level_explanation="ID")
val_losses.append(avg_val_loss)
val_accs.append(avg_val_acc)

In [31]:
avg_val_loss

3.1451358795166016

In [None]:
file_path = r"results\ID-on-embeddings\answerdotai\ModernBERT-based\0.001\few\seed549\model\model_epoch0002.tar"

os.path.isfile(file_path)

In [None]:
m_terminal = torch.load(
    r"results\ID-on-embeddings\answerdotai\ModernBERT-based\0.001\few\seed549\model\model_epoch0002.tar"
)

In [None]:
plt.hist(m_terminal["model_state_dict"]["individual_slopes.weight"].detach().numpy()[0])

In [None]:
[plt.hist(m_terminal["model_state_dict"]["individual_slopes.weight"].detach().numpy()[i]) for i in range(0, 10)]

In [None]:
logits = m_terminal(d, ids)
anchor, positive, negative = torch.unbind(torch.reshape(logits, (-1, 3, embed_dim)), dim=1)
tri_loss = ut.trinomial_loss(anchor, positive, negative, task, temperature, distance_metric)
l1_pen_ID = md.l1_regularization(m_terminal, "individual_slopes.weight", "few").to(device) #L1-norm to enforce sparsity (many 0s)
complexity_loss_ID = (lmbda/n_participants) * l1_pen_ID
loss = tri_loss + complexity_loss_ID


In [None]:
[plt.hist(model_weight.individual_slopes(torch.LongTensor([id])).detach().numpy()[-1]) for id in range(0, 10)]

In [None]:

results\ID-on-embeddings\answerdotai\ModernBERT-based\0.001\few\seed549\model\model_epoch0002.tar