In [9]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import torch
from transformers import AutoTokenizer, AutoModel
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
# Load pre-trained model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
def get_embedding(word, context):
    inputs = tokenizer(context, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    word_tokens = tokenizer.tokenize(word)
    word_ids = tokenizer.convert_tokens_to_ids(word_tokens)
    word_index = inputs.input_ids[0].tolist().index(word_ids[0])
    return outputs.last_hidden_state[0, word_index, :].numpy()

def context_transformation(context, T_type):
    if T_type == "tense_change":
        # Simple tense change: present to past
        return context.replace("is", "was")
    elif T_type == "add_adjectives":
        # Add an adjective before the first noun
        words = context.split()
        for i, word in enumerate(words):
            if word.lower() in ["house", "car", "tree", "book"]:  # Example nouns
                words.insert(i, "beautiful")
                break
        return " ".join(words)
    else:
        return context  # No transformation if type is not recognized

def meaning_transformation_F(M_w_c, T):
    # Implement the F function
    # This is a placeholder linear transformation
    np.random.seed(0)  # For reproducibility
    A_T = np.random.rand(768, 768)  # For BERT's 768-dim embeddings
    b_T = np.random.rand(768)
    return np.dot(A_T, M_w_c) + b_T

# Test the covariance
word = "house"
original_context = "The house is big."
transformed_context = context_transformation(original_context, "tense_change")

print(f"Original context: {original_context}")
print(f"Transformed context: {transformed_context}")

M_w_c = get_embedding(word, original_context)
M_w_Tc = get_embedding(word, transformed_context)

predicted_M_w_Tc = meaning_transformation_F(M_w_c, "tense_change")

# Compare M_w_Tc and predicted_M_w_Tc
similarity = np.dot(M_w_Tc, predicted_M_w_Tc) / (np.linalg.norm(M_w_Tc) * np.linalg.norm(predicted_M_w_Tc))
print(f"Cosine similarity between actual and predicted transformed embeddings: {similarity}")

# Additional analysis
print(f"\nEuclidean distance between original and transformed embeddings: {np.linalg.norm(M_w_c - M_w_Tc)}")
print(f"Euclidean distance between transformed and predicted embeddings: {np.linalg.norm(M_w_Tc - predicted_M_w_Tc)}")

# Visualize the first few dimensions of the embeddings
print("\nFirst 10 dimensions of embeddings:")
print("Original:    ", M_w_c[:10])
print("Transformed: ", M_w_Tc[:10])
print("Predicted:   ", predicted_M_w_Tc[:10])

Original context: The house is big.
Transformed context: The house was big.
Cosine similarity between actual and predicted transformed embeddings: 0.031218016020476986

Euclidean distance between original and transformed embeddings: 3.9046308994293213
Euclidean distance between transformed and predicted embeddings: 157.35978777902625

First 10 dimensions of embeddings:
Original:     [ 1.1914905   0.20965098  0.6337387  -0.4308313   0.9291093  -0.42965057
 -0.0608275   0.21266815 -0.7103103  -0.48505405]
Transformed:  [ 1.0322326   0.44214618  0.2158463  -0.39376912  0.7595385  -0.3879153
  0.02685729  0.18928054 -0.52789253 -0.50278306]
Predicted:    [-1.34185679 -5.98520063  0.53370562 -6.33069443 -3.62910611 -0.52153571
 -0.58382468 -6.89555114 -0.46753446 -0.76195684]


In [4]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def normalized_euclidean(a, b):
    a_norm = a / np.linalg.norm(a)
    b_norm = b / np.linalg.norm(b)
    return np.linalg.norm(a_norm - b_norm)

def manhattan_distance(a, b):
    return np.sum(np.abs(a - b))

# Using the same embeddings from your previous result
M_w_c = np.array([ 1.1914905, 0.20965098, 0.6337387, -0.4308313, 0.9291093, -0.42965057, -0.0608275, 0.21266815, -0.7103103, -0.48505405])
M_w_Tc = np.array([ 1.0322326, 0.44214618, 0.2158463, -0.39376912, 0.7595385, -0.3879153, 0.02685729, 0.18928054, -0.52789253, -0.50278306])

# Extend these to 768 dimensions for this example (you would use the full embeddings in practice)
M_w_c = np.tile(M_w_c, 77)[:768]
M_w_Tc = np.tile(M_w_Tc, 77)[:768]

print(f"Cosine Similarity: {cosine_similarity(M_w_c, M_w_Tc)}")
print(f"Normalized Euclidean Distance: {normalized_euclidean(M_w_c, M_w_Tc)}")
print(f"Manhattan Distance: {manhattan_distance(M_w_c, M_w_Tc)}")

# Dimension Reduction
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(np.vstack([M_w_c, M_w_Tc]))
pca_distance = np.linalg.norm(embeddings_2d[0] - embeddings_2d[1])
print(f"PCA-reduced Euclidean Distance: {pca_distance}")

# Relative Distance
random_vec = np.random.rand(768)
relative_distance = np.linalg.norm(M_w_c - M_w_Tc) / np.linalg.norm(M_w_c - random_vec)
print(f"Relative Distance: {relative_distance}")

Cosine Similarity: 0.9646599941462117
Normalized Euclidean Distance: 0.26585712649386883
Manhattan Distance: 105.23078883
PCA-reduced Euclidean Distance: 5.020251923686917
Relative Distance: 0.22875415606293376


In [5]:
def evaluate_transformation(word, context, T_type):
    original_context = context
    transformed_context = context_transformation(context, T_type)

    M_w_c = get_embedding(word, original_context)
    M_w_Tc = get_embedding(word, transformed_context)
    predicted_M_w_Tc = meaning_transformation_F(M_w_c, T_type)

    cosine_sim = np.dot(M_w_Tc, predicted_M_w_Tc) / (np.linalg.norm(M_w_Tc) * np.linalg.norm(predicted_M_w_Tc))
    euclidean_dist = np.linalg.norm(M_w_Tc - predicted_M_w_Tc)

    return cosine_sim, euclidean_dist

# Test with multiple words
words = ["house", "car", "run", "eat", "think"]
contexts = [
    "The {} is big.",
    "The {} is fast.",
    "They {} quickly.",
    "They {} healthy food.",
    "They {} deeply about the problem."
]

results = []
for word, context in zip(words, contexts):
    context = context.format(word)
    cosine_sim, euclidean_dist = evaluate_transformation(word, context, "tense_change")
    results.append((word, cosine_sim, euclidean_dist))

# Analyze results
for word, cosine_sim, euclidean_dist in results:
    print(f"{word}: Cosine Similarity = {cosine_sim:.4f}, Euclidean Distance = {euclidean_dist:.4f}")

# Calculate average performance
avg_cosine = np.mean([r[1] for r in results])
avg_euclidean = np.mean([r[2] for r in results])
print(f"\nAverage: Cosine Similarity = {avg_cosine:.4f}, Euclidean Distance = {avg_euclidean:.4f}")

house: Cosine Similarity = 0.0312, Euclidean Distance = 157.3598
car: Cosine Similarity = 0.0016, Euclidean Distance = 148.0092
run: Cosine Similarity = 0.0344, Euclidean Distance = 157.0768
eat: Cosine Similarity = 0.0218, Euclidean Distance = 144.7155
think: Cosine Similarity = 0.0079, Euclidean Distance = 150.5001

Average: Cosine Similarity = 0.0194, Euclidean Distance = 151.5323


## Infer function F from similar tense changes

In [6]:
def get_embedding(word, context):
    # Tokenize the full context
    inputs = tokenizer(context, return_tensors="pt")

    # Tokenize the word alone
    word_tokens = tokenizer.tokenize(word)

    # Get tokens for the context
    context_tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[0])

    # Find the position of the word in the context
    start_index = None
    for i, token in enumerate(context_tokens):
        if token in word_tokens or any(wt in token for wt in word_tokens):
            start_index = i
            break

    if start_index is None:
        raise ValueError(f"Could not find '{word}' or any of its subwords in the context")

    # Get the embedding
    with torch.no_grad():
        outputs = model(**inputs)

    return outputs.last_hidden_state[0, start_index, :].numpy()

In [7]:
# Generate a larger dataset
words_and_contexts = [
    ("run", "They {} quickly in the park."),
    ("eat", "We {} delicious food at the restaurant."),
    ("think", "She {} deeply about the problem."),
    ("play", "Children {} happily in the playground."),
    ("work", "He {} diligently on the project."),
    ("study", "Students {} hard for their exams."),
    ("dance", "The couple {} gracefully at the ball."),
    ("sing", "The choir {} beautifully during the concert."),
    ("write", "The author {} a new novel every year."),
    ("read", "She {} interesting books in her free time."),
    ("swim", "They {} in the ocean every summer."),
    ("cook", "He {} delicious meals for his family."),
    ("paint", "The artist {} stunning landscapes."),
    ("teach", "She {} mathematics at the university."),
    ("build", "The company {} new offices downtown."),
    ("grow", "These plants {} well in sunlight."),
    ("fly", "The birds {} south for the winter."),
    ("sleep", "The baby {} peacefully in the crib."),
    ("laugh", "We {} at the comedian's jokes."),
    ("cry", "The child {} when he lost his toy."),
    ("house", "The {} is spacious and comfortable."),
    ("car", "The {} is fast and efficient."),
    ("tree", "The {} is tall and provides shade."),
    ("book", "The {} is interesting and informative."),
    ("computer", "The {} is powerful and user-friendly."),
    ("phone", "The {} is sleek and has many features."),
    ("chair", "The {} is comfortable and well-designed."),
    ("table", "The {} is sturdy and spacious."),
    ("picture", "The {} is beautiful and well-framed."),
    ("window", "The {} is large and lets in plenty of light."),
    ("happy", "She {} about her recent promotion."),
    ("sad", "He {} about the loss of his pet."),
    ("excited", "They {} about the upcoming vacation."),
    ("tired", "I {} after working long hours."),
    ("hungry", "We {} before dinner time."),
    ("thirsty", "She {} after the long run."),
    ("angry", "He {} about the unfair treatment."),
    ("surprised", "They {} by the unexpected news."),
    ("confused", "I {} by the complex instructions."),
    ("worried", "She {} about her upcoming exam.")
]

In [8]:
def get_embedding(word, context):
    # Tokenize the full context
    inputs = tokenizer(context, return_tensors="pt")

    # Tokenize the word alone
    word_tokens = tokenizer.tokenize(word)

    # Get word IDs for the context
    word_ids = inputs.word_ids()[1:-1]  # Exclude special tokens

    # Find the position of the word in the context
    start_index = None
    for i, word_id in enumerate(word_ids):
        if word_id is not None and inputs.tokens()[i+1] in word_tokens:
            start_index = i + 1  # +1 to account for the [CLS] token
            break

    if start_index is None:
        raise ValueError(f"Could not find '{word}' or any of its subwords in the context")

    # Get the embedding
    with torch.no_grad():
        outputs = model(**inputs)

    return outputs.last_hidden_state[0, start_index, :].numpy()

# Test the function
test_word = "house"
test_context = "The house is spacious and comfortable."
try:
    embedding = get_embedding(test_word, test_context)
    print(f"Successfully got embedding for '{test_word}'")
except ValueError as e:
    print(f"Error: {str(e)}")

# Now let's use this function in our main loop
X = []  # Original embeddings
y = []  # Transformed embeddings

for word, context in words_and_contexts:
    if word in ["house", "car", "tree", "book", "computer", "phone", "chair", "table", "picture", "window"]:
        context = context.format(word)
    else:
        context = context.format(word)

    try:
        original_embedding = get_embedding(word, context)
        transformed_context = context_transformation(context, "tense_change")
        transformed_embedding = get_embedding(word, transformed_context)

        X.append(original_embedding)
        y.append(transformed_embedding)
    except ValueError as e:
        print(f"Error processing word '{word}': {str(e)}")
        continue

X = np.array(X)
y = np.array(y)

# Continue with the rest of your script (training models, evaluating, etc.)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Train a neural network model
nn_model = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000)
nn_model.fit(X_train, y_train)

# Evaluate the models
def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    cosine_sims = [np.dot(y[i], y_pred[i]) / (np.linalg.norm(y[i]) * np.linalg.norm(y_pred[i])) for i in range(len(y))]
    mse = mean_squared_error(y, y_pred)
    return np.mean(cosine_sims), mse

print("Linear Model Performance:")
print("Training:", evaluate_model(linear_model, X_train, y_train))
print("Testing:", evaluate_model(linear_model, X_test, y_test))

print("\nNeural Network Model Performance:")
print("Training:", evaluate_model(nn_model, X_train, y_train))
print("Testing:", evaluate_model(nn_model, X_test, y_test))

# Test on new words
new_words_and_contexts = [
    ("jump", "Athletes {} over high bars."),
    ("smell", "The flowers {} wonderful in the garden."),
    ("understand", "Students {} the complex theory."),
    ("believe", "He {} in the power of positive thinking."),
    ("mountain", "The {} is covered in snow year-round.")
]

for new_word, new_context in new_words_and_contexts:
    #new_context = new_context.format("is" if new_word == "mountain" else new_word)
    new_context = new_context.format(new_word)
    new_embedding = get_embedding(new_word, new_context)
    new_transformed_context = context_transformation(new_context, "tense_change")
    actual_transformed_embedding = get_embedding(new_word, new_transformed_context)

    linear_prediction = linear_model.predict([new_embedding])[0]
    nn_prediction = nn_model.predict([new_embedding])[0]

    print(f"\nNew word '{new_word}':")
    print(f"Linear Model Prediction - Cosine Similarity: {np.dot(actual_transformed_embedding, linear_prediction) / (np.linalg.norm(actual_transformed_embedding) * np.linalg.norm(linear_prediction)):.4f}")
    print(f"Neural Network Prediction - Cosine Similarity: {np.dot(actual_transformed_embedding, nn_prediction) / (np.linalg.norm(actual_transformed_embedding) * np.linalg.norm(nn_prediction)):.4f}")

Successfully got embedding for 'house'
Error processing word 'surprised': Could not find 'surprised' or any of its subwords in the context
Linear Model Performance:
Training: (1.0, 8.0217185e-14)
Testing: (0.7558552, 0.10528054)

Neural Network Model Performance:
Training: (0.9871122, 0.005890542)
Testing: (0.6972883, 0.12724763)

New word 'jump':
Linear Model Prediction - Cosine Similarity: 0.6447
Neural Network Prediction - Cosine Similarity: 0.5568

New word 'smell':
Linear Model Prediction - Cosine Similarity: 0.8047
Neural Network Prediction - Cosine Similarity: 0.7472

New word 'understand':
Linear Model Prediction - Cosine Similarity: 0.6854
Neural Network Prediction - Cosine Similarity: 0.6126

New word 'believe':
Linear Model Prediction - Cosine Similarity: 0.6890
Neural Network Prediction - Cosine Similarity: 0.6142

New word 'mountain':
Linear Model Prediction - Cosine Similarity: 0.6510
Neural Network Prediction - Cosine Similarity: 0.5849
