In [1]:
from datadreamer import DataDreamer
from datadreamer.llms import OpenAI
from datadreamer.steps import DataFromPrompt, Embed, CosineSimilarity, concat, HFHubDataSource
from datadreamer.embedders import SentenceTransformersEmbedder
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import numpy as np
import matplotlib.pyplot as plt
import math
# from scipy.spatial.distance import cosine

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "train" split of the probing dataset as a DataDreamer step.
# The session writes to ./output; per the run log, a step that was already
# run and saved is reloaded from disk instead of being executed again.
with DataDreamer("./output"):
    stel_dataset = HFHubDataSource(
        # Step name as it appears in the DataDreamer log.
        # NOTE(review): presumably also the key for the on-disk cache — confirm
        # before renaming.
        "Lexical Features",
        path="jjz5463/probing_dataset_4.0",
        split="train"
    )


[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Lexical Features' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Done. ✨ Results in folder: ./output


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [3]:
# Sanity check: show the first record to see which fields every row carries
# (attributes / feature / positive / negative in the captured output).
print(stel_dataset.output[0])

{'attributes': {'length': '10-20 words', 'point_of_view': 'second-person', 'sentence_type': 'Exclamation', 'tense': 'past', 'topic': 'Cloning a drive with CCC in block copy mode', 'voice': 'active voice'}, 'feature': 'formal', 'positive': 'You skillfully cloned a drive using CCC in block copy mode!', 'negative': 'Dude, you totally smashed cloning that drive with CCC in block copy mode!'}


In [4]:
# Sentence-embedding checkpoints compared in this notebook.
WEGMANN_MODEL_NAME = "AnnaWegmann/Style-Embedding"
# Labelled "BERT" throughout this notebook's step and variable names.
BERT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"


def embed_column(step_name, column, model_name):
    """Create an ``Embed`` step for one text column of ``stel_dataset``.

    Replaces four copy-pasted ``Embed(...)`` calls that differed only in the
    step name, the source column and the embedding model. Call it inside a
    ``with DataDreamer(...)`` block, as the steps below do.

    Args:
        step_name: Name given to the DataDreamer step (shown in the run log).
        column: Column of ``stel_dataset.output`` to embed, e.g. ``"positive"``.
        model_name: Checkpoint name passed to ``SentenceTransformersEmbedder``.

    Returns:
        The created ``Embed`` step. Its input texts are exposed under the
        output column ``"sentences"`` and the vectors under ``"embeddings"``.
    """
    return Embed(
        name=step_name,
        inputs={"texts": stel_dataset.output[column]},
        args={
            "embedder": SentenceTransformersEmbedder(model_name=model_name),
            "truncate": True,
        },
        outputs={"texts": "sentences", "embeddings": "embeddings"},
    )


with DataDreamer("./output"):
    # Step names are kept exactly as before so previously saved results
    # continue to be picked up by name.
    wegmann_pos_embedded_data = embed_column(
        "Wegmann Embeddings for Positive Examples", "positive", WEGMANN_MODEL_NAME
    )
    wegmann_neg_embedded_data = embed_column(
        "Wegmann Embeddings for Negative Examples", "negative", WEGMANN_MODEL_NAME
    )
    bert_pos_embedded_data = embed_column(
        "BERT Feature Embeddings for Positive Examples", "positive", BERT_MODEL_NAME
    )
    bert_neg_embedded_data = embed_column(
        "BERT Feature Embeddings for Negative Examples", "negative", BERT_MODEL_NAME
    )

[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Wegmann Embeddings for Positive Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Wegmann Embeddings for Negative Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'BERT Feature Embeddings for Positive Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'BERT Feature Embeddings for Negative Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Done. ✨ Results in folder: ./output


In [5]:
def _wegmann_block(start, stop):
    """Return (positive, negative, paired) Wegmann embeddings for rows start:stop.

    ``positive`` and ``negative`` are NumPy arrays built from the matching
    slice of each step's "embeddings" column; ``paired`` is a list of
    ``(positive_row, negative_row)`` tuples in the same row order.
    """
    pos_block = np.array(wegmann_pos_embedded_data.output["embeddings"][start:stop])
    neg_block = np.array(wegmann_neg_embedded_data.output["embeddings"][start:stop])
    return pos_block, neg_block, list(zip(pos_block, neg_block))


# NOTE(review): the slices assume the dataset rows are grouped by feature in
# consecutive blocks of 100 (formal, complex, contraction, number) — confirm
# against the dataset's "feature" column.
pos_formal_embeddings, neg_formal_embeddings, paired_formal_embeddings = _wegmann_block(0, 100)
pos_complex_embeddings, neg_complex_embeddings, paired_complex_embeddings = _wegmann_block(100, 200)
pos_contraction_embeddings, neg_contraction_embeddings, paired_contraction_embeddings = _wegmann_block(200, 300)
pos_number_embeddings, neg_number_embeddings, paired_number_embeddings = _wegmann_block(300, 400)

In [10]:
def compute_accuracy(
        paired_embeddings: list,
) -> float:
    """Score how well embeddings separate positive from negative examples.

    Every unordered pair of entries ``(i, j)`` is compared once. With cosine
    similarity ``sim``, the "same-polarity" distance is
    ``(1 - sim(pos_i, pos_j))**2 + (1 - sim(neg_i, neg_j))**2`` and the
    "cross-polarity" distance is
    ``(1 - sim(pos_i, neg_j))**2 + (1 - sim(neg_i, pos_j))**2``.
    A comparison scores 1 when the same-polarity distance is strictly smaller,
    0.5 when the two distances are exactly equal, and 0 otherwise.

    Args:
        paired_embeddings: Sequence of ``(positive, negative)`` 1-D embedding
            vectors. At least two entries are required.

    Returns:
        The mean score over all ``n * (n - 1) / 2`` comparisons, in ``[0, 1]``.

    Raises:
        ValueError: If fewer than two pairs are given (there is nothing to
            compare, and the mean would divide by zero).
    """
    n_pairs = len(paired_embeddings)
    if n_pairs < 2:
        raise ValueError(
            f"compute_accuracy needs at least 2 embedding pairs, got {n_pairs}"
        )

    # Normalise each vector once up front so the dot products below are cosine
    # similarities; doing this inside the nested loop repeats the work O(n^2) times.
    unit_pairs = []
    for pos, neg in paired_embeddings:
        pos = np.asarray(pos)
        neg = np.asarray(neg)
        unit_pairs.append((pos / np.linalg.norm(pos), neg / np.linalg.norm(neg)))

    score = 0.0
    for i, (anchor_pos, anchor_neg) in enumerate(unit_pairs):
        # Only look at later entries so each unordered pair is counted once.
        for alt_pos, alt_neg in unit_pairs[i + 1:]:
            same_dist = (
                math.pow(1 - np.dot(anchor_pos, alt_pos), 2)
                + math.pow(1 - np.dot(anchor_neg, alt_neg), 2)
            )
            cross_dist = (
                math.pow(1 - np.dot(anchor_pos, alt_neg), 2)
                + math.pow(1 - np.dot(anchor_neg, alt_pos), 2)
            )
            if same_dist == cross_dist:
                score += 0.5  # exact tie: counted as a coin flip
            elif same_dist < cross_dist:
                score += 1

    return score / (n_pairs * (n_pairs - 1) / 2)

Formal embeddings accuracy with Wegmann: 0.9105050505050505
4507 0 443


In [None]:
# Pairwise formality check on the Wegmann embeddings, with a breakdown of how
# many comparisons were correct, exact ties ("rand"), or incorrect.
formal_accuracy = 0  # running score: 1 per correct comparison, 0.5 per tie
correct = 0
rand = 0
incorrect = 0

# Normalise every embedding once so the dot products below are true cosine
# similarities. The previous version computed these unit vectors but then took
# dot products of the raw embeddings, so the (1 - sim)^2 distances were not
# cosine-based.
unit_formal_pairs = [
    (pos / np.linalg.norm(pos), neg / np.linalg.norm(neg))
    for pos, neg in paired_formal_embeddings
]

for i, (norm_anchor_pos, norm_anchor_neg) in enumerate(unit_formal_pairs):
    # Only compare against later entries so each unordered pair is seen once.
    for norm_alt_pos, norm_alt_neg in unit_formal_pairs[i + 1:]:
        sim1 = np.dot(norm_anchor_pos, norm_alt_pos)  # positive vs positive
        sim2 = np.dot(norm_anchor_neg, norm_alt_neg)  # negative vs negative
        sim3 = np.dot(norm_anchor_pos, norm_alt_neg)  # positive vs negative
        sim4 = np.dot(norm_anchor_neg, norm_alt_pos)  # negative vs positive

        same_dist = math.pow(1 - sim1, 2) + math.pow(1 - sim2, 2)
        cross_dist = math.pow(1 - sim3, 2) + math.pow(1 - sim4, 2)
        if same_dist == cross_dist:
            formal_accuracy += 0.5
            rand += 1
        elif same_dist < cross_dist:
            formal_accuracy += 1
            correct += 1
        else:
            incorrect += 1

print(f"Formal embeddings accuracy with Wegmann: {formal_accuracy / (len(paired_formal_embeddings) * (len(paired_formal_embeddings) - 1) / 2)}")
print(correct, rand, incorrect)