In [1]:
import torch
from transformers import BertTokenizerFast, BertModel

In [2]:

# Load the pretrained uncased BERT tokenizer and encoder; later cells reuse both objects.
PRETRAINED_NAME = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_NAME)
model = BertModel.from_pretrained(PRETRAINED_NAME)

# Smoke test: encode a single sentence as PyTorch tensors.
test_sentence = "This piece of music sounds relaxing."
inputs = tokenizer(test_sentence, return_tensors="pt")

# Inference only, so run the forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

# Final-layer activations, laid out as (batch_size, sequence_length, hidden_size).
last_hidden_state = outputs.last_hidden_state
print("Shape of the hidden state:", last_hidden_state.shape)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Shape of the hidden state: torch.Size([1, 9, 768])


In [4]:
# Template containing a placeholder, and the phrase that will take its place
template_sentence = "This piece of music sounds [MASK]."
target_word = "warm and a little bit wet"

# Substitute the phrase for the placeholder
filled_sentence = template_sentence.replace("[MASK]", target_word)
print("Replaced sentence:", filled_sentence)

# Encode the filled sentence. The offset mapping is popped so that `inputs`
# holds only the entries that are later unpacked into the model call.
inputs = tokenizer(filled_sentence, return_tensors="pt", return_offsets_mapping=True)
offset_mapping = inputs.pop("offset_mapping")
sentence_ids = inputs["input_ids"].squeeze()
tokens = tokenizer.convert_ids_to_tokens(sentence_ids)
print("Tokenized result:", tokens)

# Tokenize the phrase on its own, without special tokens, to get the pieces to look for
target_token_ids = tokenizer(target_word, add_special_tokens=False)["input_ids"]
target_tokens = tokenizer.convert_ids_to_tokens(target_token_ids)
print("Tokens possibly corresponding to the target word:", target_tokens)

# Find contiguous token sequence for the target word
def find_sublist(tokens, sub_tokens):
    """
    Locate the first contiguous run of ``tokens`` that matches ``sub_tokens``.

    Tokens are compared after removing one leading WordPiece continuation
    marker (``"##"``), so ``"##ing"`` and ``"ing"`` are treated as equal.

    Args:
        tokens: Sequence of token strings to search in.
        sub_tokens: Sequence of token strings to search for.

    Returns:
        A ``(start, end)`` tuple giving the half-open index range of the first
        match, or ``None`` when there is no match or ``sub_tokens`` is empty.
    """
    def strip_continuation(token):
        # str.lstrip("##") treats its argument as a character set and would strip
        # every leading '#'; remove exactly the two-character prefix instead.
        return token[2:] if token.startswith("##") else token

    wanted = [strip_continuation(piece) for piece in sub_tokens]
    n = len(wanted)
    if n == 0:
        # An empty needle would "match" at (0, 0) and select zero tokens, which
        # callers would then average into NaNs. Report it as not found.
        return None

    # Normalise the haystack once instead of on every window.
    candidates = [strip_continuation(token) for token in tokens]
    for i in range(len(candidates) - n + 1):
        if candidates[i:i + n] == wanted:
            return i, i + n
    return None

# Search the sentence tokens for the phrase tokens and report the span that was found
match = find_sublist(tokens, target_tokens)
if match is None:
    print("Target word not found in the token list.")
else:
    start_idx, end_idx = match  # half-open range; the last matched token is end_idx - 1
    print(f"Index range of the target word: {start_idx} to {end_idx-1}")

Replaced sentence: This piece of music sounds warm and a little bit wet.
Tokenized result: ['[CLS]', 'this', 'piece', 'of', 'music', 'sounds', 'warm', 'and', 'a', 'little', 'bit', 'wet', '.', '[SEP]']
Tokens possibly corresponding to the target word: ['warm', 'and', 'a', 'little', 'bit', 'wet']
Index range of the target word: 6 to 11


In [5]:
# Run the encoder on the filled sentence without tracking gradients
with torch.no_grad():
    outputs = model(**inputs)

# Shape: (1, sequence_length, hidden_size)
last_hidden_state = outputs.last_hidden_state
print("Shape of hidden state:", last_hidden_state.shape)

# Mean-pool the hidden states of the target tokens (skipped when no span was found)
if match:
    token_states = last_hidden_state.squeeze()  # drop the size-1 batch axis
    target_vectors = token_states[start_idx:end_idx]  # (num_target_tokens, hidden_size)
    print(target_vectors.shape)
    avg_vector = target_vectors.mean(dim=0)
    print("Shape of averaged vector for target word/phrase:", avg_vector.shape)

Shape of hidden state: torch.Size([1, 14, 768])
torch.Size([6, 768])
Shape of averaged vector for target word/phrase: torch.Size([768])


In [7]:
# Display the mean-pooled vector of the target phrase computed in the previous cell.
avg_vector

tensor([-4.5907e-01, -3.7284e-01,  2.6824e-01,  4.0523e-01, -2.1251e-01,
         2.2506e-01, -1.6994e-01, -1.0682e-01,  6.3423e-02, -1.9893e-01,
         6.2298e-01, -3.0936e-01, -6.1058e-02,  8.7082e-01, -2.3974e-01,
         5.7228e-01,  1.9898e-01, -1.9546e-01,  3.1596e-01,  4.6010e-01,
         3.2079e-01, -2.7677e-01, -8.5889e-01,  8.5149e-01,  4.4705e-01,
        -1.1331e-01, -1.0588e-01, -8.7015e-02,  6.7179e-03, -3.3243e-02,
         7.4687e-01,  9.1954e-02,  2.4284e-02, -1.8500e-01, -4.0211e-01,
        -2.0087e-01, -7.3337e-02, -2.6264e-01, -5.2921e-01,  8.7437e-02,
        -7.1026e-01, -3.6878e-01, -4.3237e-01, -1.6795e-01, -1.7243e-02,
        -2.1803e-01,  4.6858e-01, -2.7176e-01, -4.1526e-01, -5.1694e-01,
         2.2680e-01, -4.1506e-01, -2.7317e-01, -2.5298e-01,  2.8225e-01,
         5.0876e-01,  3.7347e-01, -6.5608e-01, -4.7929e-01,  8.3925e-02,
        -5.8221e-02,  1.0977e-01,  8.7295e-02, -1.1505e+00,  1.0869e-01,
         2.7554e-01, -1.1075e-01, -2.9218e-01, -5.1

In [4]:
import torch.nn.functional as F

def extract_target_vector(template, target_word):
    """
    Fill the ``[MASK]`` placeholder in ``template`` with ``target_word`` and return
    the mean last-layer hidden state of the tokens covering the inserted text.

    The token span is located from the tokenizer's character offsets at the
    position of the first placeholder, so the vector always comes from the
    inserted phrase even if the same word also appears elsewhere in the template.
    If the template contains no placeholder, the function falls back to searching
    the tokenized sentence for the phrase's own word pieces (first match wins).

    Relies on the module-level ``tokenizer`` (a fast tokenizer, since offset
    mapping is requested) and ``model``.

    Args:
        template: Sentence containing the literal placeholder ``"[MASK]"``.
        target_word: Word or phrase substituted for every placeholder occurrence.

    Returns:
        A 1-D tensor: the mean over the selected token vectors of
        ``last_hidden_state``.

    Raises:
        ValueError: If ``target_word`` is empty or blank, or if no token of the
            sentence can be attributed to it.
    """
    placeholder = "[MASK]"
    if not target_word or not target_word.strip():
        # A blank phrase selects zero tokens; averaging them would silently yield NaNs.
        raise ValueError("Target word must contain at least one non-whitespace character.")

    filled_sentence = template.replace(placeholder, target_word)
    inputs = tokenizer(filled_sentence, return_tensors="pt", return_offsets_mapping=True)
    # Popped so that `inputs` holds only the entries unpacked into the model call.
    offset_mapping = inputs.pop("offset_mapping")

    char_start = template.find(placeholder)
    if char_start >= 0:
        # Text before the first placeholder is unchanged by replace(), so the phrase
        # occupies [char_start, char_end) in the filled sentence.
        char_end = char_start + len(target_word)
        token_indices = [
            idx
            for idx, (tok_start, tok_end) in enumerate(offset_mapping[0].tolist())
            # Zero-width offsets are skipped (special tokens are expected to have them).
            if tok_end > tok_start and tok_start < char_end and tok_end > char_start
        ]
        match = (token_indices[0], token_indices[-1] + 1) if token_indices else None
    else:
        # No placeholder: look for the phrase's word pieces among the sentence tokens.
        def strip_continuation(token):
            # Remove exactly one "##" prefix; str.lstrip("##") would strip every '#'.
            return token[2:] if token.startswith("##") else token

        sentence_ids = inputs["input_ids"][0].tolist()
        sentence_pieces = [
            strip_continuation(t) for t in tokenizer.convert_ids_to_tokens(sentence_ids)
        ]
        target_ids = tokenizer(target_word, add_special_tokens=False)["input_ids"]
        target_pieces = [
            strip_continuation(t) for t in tokenizer.convert_ids_to_tokens(target_ids)
        ]
        width = len(target_pieces)
        match = None
        if width:
            match = next(
                (
                    (i, i + width)
                    for i in range(len(sentence_pieces) - width + 1)
                    if sentence_pieces[i:i + width] == target_pieces
                ),
                None,
            )

    if match is None:
        raise ValueError("Target word tokens not found in the sentence.")
    start_idx, end_idx = match

    # Forward pass to obtain hidden states; no gradients are needed for extraction.
    with torch.no_grad():
        outputs = model(**inputs)

    # last_hidden_state is (1, sequence_length, hidden_size); index the single batch
    # entry explicitly rather than squeezing every size-1 dimension.
    target_vectors = outputs.last_hidden_state[0, start_idx:end_idx]
    return target_vectors.mean(dim=0)

# Template whose placeholder receives each word in turn
template_sentence = "I make this music sound [MASK] by manipulating its frequency content."

word1, word2 = "warm", "heat"
vector1 = extract_target_vector(template_sentence, word1)
vector2 = extract_target_vector(template_sentence, word2)

# Give each 1-D vector a leading batch axis, then compare the pair
cos_sim = F.cosine_similarity(vector1[None, :], vector2[None, :])
print(word1, "and", word2, "cosine similarity:", cos_sim.item())

warm and heat cosine similarity: 0.7890943288803101


In [6]:
# Carrier sentences used to embed a target word in context; extend the list as needed.
# Earlier template set, kept for reference:
# templates = [
#     "This piece of music exhibits a [MASK] timbre that shapes its overall emotional character.",
#     "The sound texture of this track is distinctly [MASK], giving it a unique mood.",
#     "The overall vibe of this music feels [MASK].",
#     "The sound of this track is [MASK].",
#     "I want to make the timbre of this audio sound [MASK].",
#     "The producer has processed this audio to make it [MASK]!"
# ]
templates = [
    # Short, generic framings
    "This sound feels [MASK].",
    "The overall tone of this music is [MASK].",
    # Emotion- and texture-oriented framings
    "This audio conveys a [MASK] emotion, shaping the listener's perception.",
    "The resonance and texture of this recording are distinctly [MASK].",
    # Production-oriented framings
    "The spectral balance and tonal quality of this audio feel [MASK], defining its timbre.",
    "The post-processing techniques have enhanced the sound, making it [MASK].",
]

# Compute target word vectors across multiple templates
def compute_avg_vector(target_word, templates):
    """
    Average the contextual vector of ``target_word`` over several templates.

    Each template is passed to ``extract_target_vector`` together with
    ``target_word``; the resulting vectors are stacked along a new leading
    axis and averaged over that axis.
    """
    per_template = []
    for sentence_template in templates:
        per_template.append(extract_target_vector(sentence_template, target_word))
    stacked = torch.stack(per_template, dim=0)  # one row per template
    return stacked.mean(dim=0)

# Words to compare, each embedded via every template and then averaged
target_word1, target_word2 = "airy", "heat"
vector1 = compute_avg_vector(target_word1, templates)
vector2 = compute_avg_vector(target_word2, templates)

# Add a leading batch axis to both vectors before measuring their cosine similarity
batched_pair = (vector1.unsqueeze(0), vector2.unsqueeze(0))
cos_sim = F.cosine_similarity(*batched_pair)
print(target_word1, "and", target_word2, "cosine similarity:", cos_sim.item())

airy and heat cosine similarity: 0.6337920427322388


In [7]:
# Display the template-averaged vector computed for target_word1.
vector1

tensor([-5.0819e-01, -1.5756e-01,  6.3468e-01,  1.8915e-02,  1.3520e-01,
         1.2840e-01, -5.1622e-01,  1.9215e-02,  1.7372e-01, -4.0330e-01,
        -2.5292e-01, -8.3522e-01, -1.3366e-01,  7.3927e-01,  4.9502e-02,
         8.6539e-01,  7.8783e-02, -2.7989e-01, -3.2030e-01, -2.2198e-01,
         1.4575e-01, -8.1287e-02, -3.7693e-01,  5.8363e-01,  2.1823e-01,
         1.5318e-01,  6.4739e-02,  5.1906e-01, -3.5769e-01, -4.3241e-01,
         2.3595e-01,  9.1247e-01, -3.3900e-01, -6.1250e-01,  1.4143e-01,
        -9.6173e-02, -3.6341e-01, -3.1309e-01, -1.2245e-01,  1.2732e-01,
        -5.2788e-01, -5.2272e-01,  3.5619e-01, -2.8133e-01, -1.1093e-01,
        -1.3878e-01, -1.7597e-03, -3.7658e-01, -4.2558e-01, -4.8814e-01,
        -3.5316e-01, -1.6276e-03,  1.4911e-02, -1.2625e-01, -1.3389e-02,
         5.6943e-01, -2.0746e-02, -7.1797e-01,  8.7373e-02, -3.8926e-01,
        -1.9736e-01, -7.0809e-01,  1.9106e-01, -6.4776e-01, -2.2754e-01,
         1.9925e-01, -2.1189e-01, -1.0870e+00, -4.8