In [1]:
import os
import copy
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.preprocessing import normalize
from sklearn.model_selection import KFold
import plotly.graph_objects as go

In [2]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [3]:
# Load pre-trained model tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
@torch.no_grad()
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    outputs = bert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :]


def cosine_similarity(v1, v2):
    if isinstance(v1, np.ndarray) and isinstance(v2, np.ndarray):
        # If inputs are NumPy arrays
        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    elif isinstance(v1, torch.Tensor) and isinstance(v2, torch.Tensor):
        # If inputs are PyTorch tensors
        return torch.nn.functional.cosine_similarity(v1.unsqueeze(0), v2.unsqueeze(0), dim=1).item()
    else:
        # If inputs are mixed types, convert to PyTorch tensors
        v1 = torch.tensor(v1) if isinstance(v1, np.ndarray) else v1
        v2 = torch.tensor(v2) if isinstance(v2, np.ndarray) else v2
        return torch.nn.functional.cosine_similarity(v1.unsqueeze(0), v2.unsqueeze(0), dim=1).item()


def rotate_vector(vector, axis, angle):
    """Rotate a vector around an axis by a given angle."""
    axis = axis / np.linalg.norm(axis)
    a = np.cos(angle / 2)
    b, c, d = -axis * np.sin(angle / 2)
    rotation_matrix = np.array([
        [a*a+b*b-c*c-d*d, 2*(b*c-a*d), 2*(b*d+a*c)],
        [2*(b*c+a*d), a*a+c*c-b*b-d*d, 2*(c*d-a*b)],
        [2*(b*d-a*c), 2*(c*d+a*b), a*a+d*d-b*b-c*c]
    ])
    return np.dot(rotation_matrix, vector)

## 1. Analyze semantic shifts via vector rotations

In [5]:
# Get embeddings for our words
words = ['happy', 'excited', 'ecstatic']
embeddings = {word: get_embedding(word) for word in words}

# Stack embeddings and move to CPU for PCA
embeddings_stack = torch.stack(list(embeddings.values())).cpu().numpy()
embeddings_2d = embeddings_stack.reshape(3, 768)  # Reshape to (3, 768)

# Use PCA to reduce to 3D for rotation and visualization
pca = PCA(n_components=3)
reduced_embeddings = pca.fit_transform(embeddings_2d)

# Define rotation axis (from 'happy' to 'ecstatic')
rotation_axis = reduced_embeddings[2] - reduced_embeddings[0]
rotation_axis = rotation_axis / np.linalg.norm(rotation_axis)

# Perform rotations
num_steps = 20
max_angle = np.pi/2  # 90 degrees
rotated_vectors = []
for i in range(num_steps):
    angle = (i / (num_steps - 1)) * max_angle
    rotated = rotate_vector(reduced_embeddings[0], rotation_axis, angle)
    rotated_vectors.append(rotated)


# Get embeddings for our words
#words = ['happy', 'excited', 'ecstatic']
words = ['useful', 'effective', 'appropiate']
embeddings = {word: get_embedding(word) for word in words}

# Stack embeddings and move to CPU for PCA
embeddings_stack = torch.stack(list(embeddings.values())).cpu().numpy()
embeddings_2d = embeddings_stack.reshape(3, 768)  # Reshape to (3, 768)

# Use PCA to reduce to 3D for rotation and visualization
pca = PCA(n_components=3)
reduced_embeddings = pca.fit_transform(embeddings_2d)

# Define rotation axis (from 'happy' to 'ecstatic')
rotation_axis = reduced_embeddings[2] - reduced_embeddings[0]
rotation_axis = rotation_axis / np.linalg.norm(rotation_axis)

# Perform rotations
num_steps = 20
max_angle = np.pi/2  # 90 degrees
rotated_vectors = []
for i in range(num_steps):
    angle = (i / (num_steps - 1)) * max_angle
    rotated = rotate_vector(reduced_embeddings[0], rotation_axis, angle)
    rotated_vectors.append(rotated)



In [6]:
# Create Plotly figure
fig = go.Figure()

# Add original word vectors
for i, word in enumerate(words):
    fig.add_trace(go.Scatter3d(
        x=[reduced_embeddings[i][0]],
        y=[reduced_embeddings[i][1]],
        z=[reduced_embeddings[i][2]],
        mode='markers',
        name=word,
        marker=dict(size=8)
    ))

# Add rotated vectors
rotated_vectors_array = np.array(rotated_vectors)
fig.add_trace(go.Scatter3d(
    x=rotated_vectors_array[:, 0],
    y=rotated_vectors_array[:, 1],
    z=rotated_vectors_array[:, 2],
    mode='markers',
    showlegend=False,
    marker=dict(
        size=4,
        color=np.arange(num_steps),
        colorscale='Plotly3',
        opacity=0.8
    )
))

fig.add_trace(go.Scatter3d(
    x=[rotated_vectors_array[:, 0][-1]],
    y=[rotated_vectors_array[:, 1][-1]],
    z=[rotated_vectors_array[:, 2][-1]],
    mode='markers',
    name=f'Rotated "{words[0]}"',
    marker=dict(
        size=5,
        color='pink',
        opacity=1
    )
))

# Add arrow to show rotation
fig.add_trace(go.Scatter3d(
    x=[reduced_embeddings[0][0], rotated_vectors[-1][0]],
    y=[reduced_embeddings[0][1], rotated_vectors[-1][1]],
    z=[reduced_embeddings[0][2], rotated_vectors[-1][2]],
    mode='lines',
    line=dict(color='red', width=5),
    name='Rotation Path'
))

# Update layout
fig.update_layout(
    title=f'Rotating "{words[0]}]" in BERT Embedding Space',
    scene=dict(
        xaxis_title='PCA Component 1',
        yaxis_title='PCA Component 2',
        zaxis_title='PCA Component 3'
    ),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)

# Show the plot
fig.show()

# Print similarities for each rotation step
for i, rotated in enumerate(rotated_vectors):
    print(f"Rotation step {i+1}/{num_steps}:")
    for word in words:
        similarity = cosine_similarity(rotated, reduced_embeddings[words.index(word)])
        print(f"  Similarity to '{word}': {similarity:.4f}")
    print()

Rotation step 1/20:
  Similarity to 'useful': 1.0000
  Similarity to 'effective': 0.6261
  Similarity to 'appropiate': -0.8986

Rotation step 2/20:
  Similarity to 'useful': 0.9997
  Similarity to 'effective': 0.6267
  Similarity to 'appropiate': -0.8988

Rotation step 3/20:
  Similarity to 'useful': 0.9988
  Similarity to 'effective': 0.6284
  Similarity to 'appropiate': -0.8992

Rotation step 4/20:
  Similarity to 'useful': 0.9974
  Similarity to 'effective': 0.6311
  Similarity to 'appropiate': -0.9000

Rotation step 5/20:
  Similarity to 'useful': 0.9954
  Similarity to 'effective': 0.6350
  Similarity to 'appropiate': -0.9011

Rotation step 6/20:
  Similarity to 'useful': 0.9929
  Similarity to 'effective': 0.6399
  Similarity to 'appropiate': -0.9025

Rotation step 7/20:
  Similarity to 'useful': 0.9898
  Similarity to 'effective': 0.6459
  Similarity to 'appropiate': -0.9042

Rotation step 8/20:
  Similarity to 'useful': 0.9863
  Similarity to 'effective': 0.6528
  Similarity to

## 2. Represent analogies as rotations

### 2.1  90º rotation

In [7]:
def compute_analogy_rotation(a, b):
    """Compute the rotation matrix that transforms a to b."""
    v = np.cross(a, b)
    s = np.linalg.norm(v)
    c = np.dot(a, b)
    v_x = np.array([
        [0, -v[2], v[1]],
        [v[2], 0, -v[0]],
        [-v[1], v[0], 0]
    ])
    R = np.eye(3) + v_x + np.dot(v_x, v_x) * ((1 - c) / (s ** 2))
    return R

def apply_analogy(analogy_rotation, c):
    """Apply the analogy rotation to vector c."""
    return np.dot(analogy_rotation, c)

In [8]:
import torch
import numpy as np
from sklearn.decomposition import PCA

# Define our analogy pairs
analogies = [
    ("king", "man", "woman", "queen"),
    ("france", "paris", "japan", "tokyo"),
    ("walk", "walked", "run", "ran")
]

results = []

for a_word, b_word, c_word, d_word in analogies:
    # Get embeddings
    a = get_embedding(a_word)
    b = get_embedding(b_word)
    c = get_embedding(c_word)
    d = get_embedding(d_word)

    # Stack embeddings and move to CPU for PCA
    embeddings_stack = torch.stack([a, b, c, d]).squeeze(1).cpu().numpy()

    # Use PCA to reduce to 3D for rotation and visualization
    pca = PCA(n_components=3)
    reduced_embeddings = pca.fit_transform(embeddings_stack)

    # Unpack reduced embeddings
    a_reduced, b_reduced, c_reduced, d_reduced = reduced_embeddings

    # Compute and apply analogy rotation
    analogy_rotation = compute_analogy_rotation(a_reduced, b_reduced)
    predicted_d = apply_analogy(analogy_rotation, c_reduced)

    # Store results
    results.append({
        'words': (a_word, b_word, c_word, d_word),
        'embeddings': (a_reduced, b_reduced, c_reduced, d_reduced, predicted_d)
    })

    # Print similarities
    print(f"\nAnalogy: {a_word} is to {b_word} as {c_word} is to {d_word}")
    print(f"Similarity between predicted and actual '{d_word}': {cosine_similarity(predicted_d, d_reduced):.4f}")

    # Find closest word to predicted_d
    predicted_d_full = pca.inverse_transform(predicted_d.reshape(1, -1)).flatten()
    similarities = [(word, cosine_similarity(predicted_d_full, get_embedding(word).cpu().numpy().flatten()))
                    for word in [a_word, b_word, c_word, d_word]]
    closest_word = max(similarities, key=lambda x: x[1])[0]
    print(f"Closest word to prediction: {closest_word}")

# Cosine similarity function that handles both NumPy arrays and PyTorch tensors
def cosine_similarity(v1, v2):
    if isinstance(v1, np.ndarray) and isinstance(v2, np.ndarray):
        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    elif isinstance(v1, torch.Tensor) and isinstance(v2, torch.Tensor):
        return torch.nn.functional.cosine_similarity(v1.unsqueeze(0), v2.unsqueeze(0), dim=1).item()
    else:
        v1 = torch.tensor(v1) if isinstance(v1, np.ndarray) else v1
        v2 = torch.tensor(v2) if isinstance(v2, np.ndarray) else v2
        return torch.nn.functional.cosine_similarity(v1.unsqueeze(0), v2.unsqueeze(0), dim=1).item()

# Implement these functions if you haven't already
def compute_analogy_rotation(a, b):
    # Implement the rotation computation
    # This should return a rotation matrix or parameters
    diff = b - a
    return diff / np.linalg.norm(diff)

def apply_analogy(rotation, c):
    # Apply the rotation to vector c
    # This should return the rotated vector
    return c + rotation


Analogy: king is to man as woman is to queen
Similarity between predicted and actual 'queen': -0.0327
Closest word to prediction: king

Analogy: france is to paris as japan is to tokyo
Similarity between predicted and actual 'tokyo': -0.6694
Closest word to prediction: france

Analogy: walk is to walked as run is to ran
Similarity between predicted and actual 'ran': -0.6469
Closest word to prediction: walk


In [9]:
# Visualization
for i, result in enumerate(results):
    a_word, b_word, c_word, d_word = result['words']
    a, b, c, d, predicted_d = result['embeddings']

    fig = go.Figure()

    # Plot original words
    for word, embed, color in zip([a_word, b_word, c_word, d_word], [a, b, c, d],
                                  ['blue', 'red', 'green', 'purple']):
        fig.add_trace(go.Scatter3d(x=[embed[0]], y=[embed[1]], z=[embed[2]],
                                   mode='markers', name=word, marker=dict(size=8, color=color)))

    # Plot predicted D
    fig.add_trace(go.Scatter3d(x=[predicted_d[0]], y=[predicted_d[1]], z=[predicted_d[2]],
                               mode='markers', name='Predicted D',
                               marker=dict(size=8, color='orange', symbol='x')))

    # Plot rotation path
    fig.add_trace(go.Scatter3d(x=[c[0], predicted_d[0]], y=[c[1], predicted_d[1]], z=[c[2], predicted_d[2]],
                               mode='lines', name='Rotation Path', line=dict(color='black', width=3)))

    fig.update_layout(
        title=f"Analogy: {a_word} is to {b_word} as {c_word} is to {d_word}",
        scene=dict(xaxis_title='PCA 1', yaxis_title='PCA 2', zaxis_title='PCA 3'),
        width=800, height=800
    )

    fig.show()

### 2.2 Simple aritmethic transformation

In [10]:
def vector_arithmetic_analogy(a, b, c):
    """Compute d in the analogy a:b::c:d using vector arithmetic."""
    return b - a + c

# Define our analogy pairs
analogies = [
    ("king", "man", "woman", "queen"),
    ("france", "paris", "japan", "tokyo"),
    ("walk", "walked", "run", "ran")
]

results = []

for a_word, b_word, c_word, d_word in analogies:
    # Get embeddings
    a = get_embedding(a_word)
    b = get_embedding(b_word)
    c = get_embedding(c_word)
    d = get_embedding(d_word)

    # Stack embeddings and move to CPU for PCA
    embeddings_stack = torch.stack([a, b, c, d]).squeeze(1).cpu().numpy()

    # Use PCA to reduce to 3D for visualization
    pca = PCA(n_components=3)
    reduced_embeddings = pca.fit_transform(embeddings_stack)

    # Unpack reduced embeddings
    a_reduced, b_reduced, c_reduced, d_reduced = reduced_embeddings

    # Compute analogy using vector arithmetic in the original space
    predicted_d_full = vector_arithmetic_analogy(a.cpu(), b.cpu(), c.cpu())

    # Project the predicted vector to the reduced space
    predicted_d = pca.transform(predicted_d_full.cpu().numpy().reshape(1, -1))[0]

    # Store results
    results.append({
        'words': (a_word, b_word, c_word, d_word),
        'embeddings': (a_reduced, b_reduced, c_reduced, d_reduced, predicted_d)
    })

    # Print similarities
    print(f"\nAnalogy: {a_word} is to {b_word} as {c_word} is to {d_word}")
    print(f"Similarity between predicted and actual '{d_word}': {cosine_similarity(predicted_d, d_reduced):.4f}")

    # Find closest word to predicted_d
    similarities = [(word, cosine_similarity(predicted_d_full.cpu().numpy().flatten(), get_embedding(word).cpu().numpy().flatten()))
                    for word in [a_word, b_word, c_word, d_word]]
    closest_word = max(similarities, key=lambda x: x[1])[0]
    print(f"Closest word to prediction: {closest_word}")

# Ensure your cosine_similarity function can handle both NumPy arrays and PyTorch tensors
def cosine_similarity(v1, v2):
    if isinstance(v1, np.ndarray) and isinstance(v2, np.ndarray):
        return np.dot(v1.flatten(), v2.flatten()) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    elif isinstance(v1, torch.Tensor) and isinstance(v2, torch.Tensor):
        return torch.nn.functional.cosine_similarity(v1.flatten().unsqueeze(0), v2.flatten().unsqueeze(0), dim=1).item()
    else:
        v1 = torch.tensor(v1) if isinstance(v1, np.ndarray) else v1
        v2 = torch.tensor(v2) if isinstance(v2, np.ndarray) else v2
        return torch.nn.functional.cosine_similarity(v1.flatten().unsqueeze(0), v2.flatten().unsqueeze(0), dim=1).item()


Analogy: king is to man as woman is to queen
Similarity between predicted and actual 'queen': -0.5078
Closest word to prediction: man

Analogy: france is to paris as japan is to tokyo
Similarity between predicted and actual 'tokyo': 0.4933
Closest word to prediction: japan

Analogy: walk is to walked as run is to ran
Similarity between predicted and actual 'ran': -0.0456
Closest word to prediction: run


In [11]:
# Visualization
for i, result in enumerate(results):
    a_word, b_word, c_word, d_word = result['words']
    a, b, c, d, predicted_d = result['embeddings']

    fig = go.Figure()

    # Plot original words
    for word, embed, color in zip([a_word, b_word, c_word, d_word], [a, b, c, d],
                                  ['blue', 'red', 'green', 'purple']):
        fig.add_trace(go.Scatter3d(x=[embed[0]], y=[embed[1]], z=[embed[2]],
                                   mode='markers', name=word, marker=dict(size=8, color=color)))

    # Plot predicted D
    fig.add_trace(go.Scatter3d(x=[predicted_d[0]], y=[predicted_d[1]], z=[predicted_d[2]],
                               mode='markers', name='Predicted D',
                               marker=dict(size=6, color='orange', symbol='x')))

    # Plot vector arithmetic path
    fig.add_trace(go.Scatter3d(x=[b[0], a[0], c[0], predicted_d[0]],
                               y=[b[1], a[1], c[1], predicted_d[1]],
                               z=[b[2], a[2], c[2], predicted_d[2]],
                               mode='lines', name='Vector Path',
                               line=dict(color='black', width=3, dash='dash')))

    fig.update_layout(
        title=f"Analogy: {a_word} is to {b_word} as {c_word} is to {d_word}",
        scene=dict(xaxis_title='PCA 1', yaxis_title='PCA 2', zaxis_title='PCA 3'),
        width=800, height=800
    )

    fig.show()

### 2.2. Non-linear transformation

In [12]:
class OrthogonalAnalogyTransform(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.A = nn.Parameter(torch.randn(input_dim, input_dim))
        self.b = nn.Parameter(torch.zeros(input_dim))

    def forward(self, a, b, c):
        # Cayley transform to ensure orthogonality
        W = torch.matrix_exp(self.A - self.A.t())

        diff = b - a
        transformed_diff = torch.matmul(W, diff.unsqueeze(-1)).squeeze(-1) + self.b
        return c + transformed_diff


In [13]:
# Prepare data
analogies = [
    ("king", "man", "woman", "queen"),
    ("france", "paris", "japan", "tokyo"),
    ("walk", "walked", "run", "ran"),
    ("cat", "kitten", "dog", "puppy"),
    ("good", "better", "bad", "worse")
]

# Get embeddings for all words
all_words = list(set([word for analogy in analogies for word in analogy]))
embeddings = {word: get_embedding(word).to(device) for word in all_words}

# Convert embeddings to PyTorch tensors
embed_dim = list(embeddings.values())[0].shape[1]


In [15]:
# Prepare data
X = []
y = []
for a, b, c, d in analogies:
    X.append((embeddings[a], embeddings[b], embeddings[c]))
    y.append(embeddings[d])

# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize model and optimizer
embed_dim = 768  # We know this from the embedding shape
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = OrthogonalAnalogyTransform(embed_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Training parameters
n_epochs = 30
patience = 5  # Number of epochs to wait for improvement before stopping
best_val_loss = float('inf')
epochs_without_improvement = 0

# Training loop
for epoch in range(n_epochs):
    model.train()
    total_loss = 0
    for (a, b, c), d in zip(X_train, y_train):
        optimizer.zero_grad()
        predicted_d = model(a.unsqueeze(0), b.unsqueeze(0), c.unsqueeze(0)).squeeze(0)
        loss = criterion(predicted_d, d)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for (a, b, c), d in zip(X_val, y_val):
            predicted_d = model(a.unsqueeze(0), b.unsqueeze(0), c.unsqueeze(0)).squeeze(0)
            val_loss += criterion(predicted_d, d).item()
    val_loss /= len(X_val)

    print(f"Epoch {epoch+1}, Train Loss: {total_loss:.4f}, Val Loss: {val_loss:.4f}")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0
        # Save the best model
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print(f"Early stopping triggered after {epoch+1} epochs")
            break

# Load the best model for testing
model.load_state_dict(torch.load('best_model.pth'))

# Test on unseen analogies
test_analogies = [
    ("rich", "richer", "poor", "poorer"),
    ("fast", "faster", "slow", "slower"),
    ("young", "younger", "old", "older")
]

model.eval()
with torch.no_grad():
    for a_word, b_word, c_word, d_word in test_analogies:
        a = embeddings[a_word] if a_word in embeddings else F.normalize(get_embedding(a_word).squeeze(0), p=2, dim=0).to(device)
        b = embeddings[b_word] if b_word in embeddings else F.normalize(get_embedding(b_word).squeeze(0), p=2, dim=0).to(device)
        c = embeddings[c_word] if c_word in embeddings else F.normalize(get_embedding(c_word).squeeze(0), p=2, dim=0).to(device)
        d = embeddings[d_word] if d_word in embeddings else F.normalize(get_embedding(d_word).squeeze(0), p=2, dim=0).to(device)

        predicted_d = model(a.unsqueeze(0), b.unsqueeze(0), c.unsqueeze(0)).squeeze(0)

        print(f"\nTest Analogy: {a_word} is to {b_word} as {c_word} is to {d_word}")
        print(f"Shape of predicted_d: {predicted_d.shape}")
        print(f"Shape of d: {d.shape}")

        similarity = F.cosine_similarity(predicted_d.unsqueeze(0), d.unsqueeze(0), dim=0).mean().item()
        print(f"Similarity between predicted and actual '{d_word}': {similarity:.4f}")

        # Find closest word to predicted_d
        similarities = []
        for word, emb in embeddings.items():
            sim = F.cosine_similarity(predicted_d, emb, dim=0).mean().item()
            similarities.append((word, sim))

        closest_word = max(similarities, key=lambda x: x[1])[0]
        print(f"Closest word to prediction: {closest_word}")

# Check orthogonality of the learned transformation
with torch.no_grad():
    W = torch.matrix_exp(model.A - model.A.t())
    WWT = torch.mm(W, W.t())
    I = torch.eye(embed_dim, device=device)
    orthogonality_error = torch.norm(WWT - I)
    print(f"\nOrthogonality error of learned transformation: {orthogonality_error:.4e}")


Epoch 1, Train Loss: 0.1705, Val Loss: 0.0419
Epoch 2, Train Loss: 0.1264, Val Loss: 0.0420
Epoch 3, Train Loss: 0.0937, Val Loss: 0.0422
Epoch 4, Train Loss: 0.0681, Val Loss: 0.0423
Epoch 5, Train Loss: 0.0495, Val Loss: 0.0423
Epoch 6, Train Loss: 0.0371, Val Loss: 0.0422
Early stopping triggered after 6 epochs



You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Test Analogy: rich is to richer as poor is to poorer
Shape of predicted_d: torch.Size([768])
Shape of d: torch.Size([768])
Similarity between predicted and actual 'poorer': 0.4115
Closest word to prediction: worse

Test Analogy: fast is to faster as slow is to slower
Shape of predicted_d: torch.Size([768])
Shape of d: torch.Size([768])
Similarity between predicted and actual 'slower': 0.4505
Closest word to prediction: man

Test Analogy: young is to younger as old is to older
Shape of predicted_d: torch.Size([768])
Shape of d: torch.Size([768])
Similarity between predicted and actual 'older': 0.3880
Closest word to prediction: kitten

Orthogonality error of learned transformation: 5.7421e-03


In [18]:
def visualize_analogy(a_word, b_word, c_word, d_word, predicted_d, embeddings):
    def get_embedding_np(word):
        if word in embeddings:
            return embeddings[word].cpu().numpy()
        else:
            return get_embedding(word).cpu().numpy()

    embeddings_list = [get_embedding_np(word) for word in [a_word, b_word, c_word, d_word]]
    embeddings_list.append(predicted_d.cpu().numpy())

    # Ensure all embeddings have the same shape
    max_dim = max(emb.shape[0] for emb in embeddings_list)
    embeddings_np = np.array([np.pad(emb, (0, max_dim - emb.shape[0])) for emb in embeddings_list])

    pca = PCA(n_components=3)
    reduced_embeddings = pca.fit_transform(embeddings_np)
    a, b, c, d, pred_d = reduced_embeddings

    fig = go.Figure()

    for word, embed, color in zip([a_word, b_word, c_word, d_word], [a, b, c, d],
                                  ['blue', 'red', 'green', 'purple']):
        fig.add_trace(go.Scatter3d(x=[embed[0]], y=[embed[1]], z=[embed[2]],
                                   mode='markers', name=word, marker=dict(size=8, color=color)))

    fig.add_trace(go.Scatter3d(x=[pred_d[0]], y=[pred_d[1]], z=[pred_d[2]],
                               mode='markers', name='Predicted D',
                               marker=dict(size=8, color='orange', symbol='star')))

    fig.add_trace(go.Scatter3d(x=[c[0], pred_d[0]], y=[c[1], pred_d[1]], z=[c[2], pred_d[2]],
                               mode='lines', name='Transformation Path',
                               line=dict(color='black', width=3, dash='dash')))

    fig.update_layout(
        title=f"Test Analogy: {a_word} is to {b_word} as {c_word} is to {d_word}",
        scene=dict(xaxis_title='PCA 1', yaxis_title='PCA 2', zaxis_title='PCA 3'),
        width=800, height=800
    )

    fig.show()

# Visualize the first test analogy
a_word, b_word, c_word, d_word = test_analogies[0]
a = embeddings[a_word] if a_word in embeddings else F.normalize(get_embedding(a_word).squeeze(0), p=2, dim=0).to(device)
b = embeddings[b_word] if b_word in embeddings else F.normalize(get_embedding(b_word).squeeze(0), p=2, dim=0).to(device)
c = embeddings[c_word] if c_word in embeddings else F.normalize(get_embedding(c_word).squeeze(0), p=2, dim=0).to(device)

with torch.no_grad():
    predicted_d = model(a.unsqueeze(0), b.unsqueeze(0), c.unsqueeze(0)).squeeze(0)

visualize_analogy(a_word, b_word, c_word, d_word, predicted_d, embeddings)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (5, 768) + inhomogeneous part.