In [1]:
from datadreamer import DataDreamer
from datadreamer.llms import OpenAI
from datadreamer.steps import DataFromPrompt, Embed, CosineSimilarity, concat, HFHubDataSource
from datadreamer.embedders import SentenceTransformersEmbedder
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import numpy as np
import matplotlib.pyplot as plt
import math
import re
import pandas as pd
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the probing dataset from the Hugging Face Hub inside a DataDreamer session
# rooted at ./output. The resulting step is kept in the notebook-global
# `stel_dataset`; later cells read its "positive" and "negative" columns.
with DataDreamer("./output"):
    stel_dataset = HFHubDataSource(
        "Lexical Features",
        path="jjz5463/probing_dataset_5.0",
        split="train"
    )

[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Lexical Features' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Done. ✨ Results in folder: ./output


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [8]:
def compute_embeddings(dataset_pos, dataset_neg, model: str):
    """Embed the positive and negative texts with the given sentence-transformers model.

    Both embedding steps run inside a single DataDreamer session rooted at
    ``./output``. Each step is named after the model (with ``/`` replaced by a
    space) and whether it covers the positive or the negative examples.

    Args:
        dataset_pos: Texts passed as the ``texts`` input of the positive step.
        dataset_neg: Texts passed as the ``texts`` input of the negative step.
        model: Model identifier handed to ``SentenceTransformersEmbedder``.

    Returns:
        A ``(pos_embedded_data, neg_embedded_data)`` tuple of the two ``Embed`` steps.
    """

    def _embed_step(step_name, texts):
        # A fresh embedder instance is built for every step, as each step
        # receives its own ``args`` dictionary.
        return Embed(
            name=step_name,
            inputs={"texts": texts},
            args={
                "embedder": SentenceTransformersEmbedder(model_name=model),
                "truncate": True,
            },
            outputs={"texts": "sentences", "embeddings": "embeddings"},
        )

    with DataDreamer("./output"):
        pos_embedded_data = _embed_step(
            f"{model.replace('/', ' ')} Embeddings for Positive Examples",
            dataset_pos,
        )
        neg_embedded_data = _embed_step(
            f"{model.replace('/', ' ')} Embeddings for Negative Examples",
            dataset_neg,
        )
    return pos_embedded_data, neg_embedded_data

def convert_embeddings(pos_embedded_data, neg_embedded_data, group_size: int = 100):
    """Split the embeddings into four consecutive feature groups of (pos, neg) pairs.

    The ``"embeddings"`` output column of each step is cut into four
    back-to-back slices of ``group_size`` rows. In order, the slices are the
    formal, complex, contraction and number groups. Within a slice, the i-th
    positive embedding is paired with the i-th negative embedding.

    Args:
        pos_embedded_data: Object whose ``.output["embeddings"]`` supports slicing
            and holds the positive-example embeddings.
        neg_embedded_data: Same, for the negative-example embeddings.
        group_size: Number of consecutive rows per feature group. Defaults to
            100, the layout this function previously hard-coded.

    Returns:
        A 4-tuple ``(paired_formal, paired_complex, paired_contraction,
        paired_number)``; each element is a list of ``(pos, neg)`` tuples of
        NumPy arrays.

    Raises:
        ValueError: If ``group_size`` is not a positive integer.
    """
    if group_size <= 0:
        raise ValueError(f"group_size must be positive, got {group_size}")

    pos_column = pos_embedded_data.output["embeddings"]
    neg_column = neg_embedded_data.output["embeddings"]

    paired_groups = []
    # Four groups, in order: formal, complex, contraction, number.
    for group_index in range(4):
        start = group_index * group_size
        stop = start + group_size
        # Slice first, then convert, so only the rows of this group are materialized.
        pos_group = np.array(pos_column[start:stop])
        neg_group = np.array(neg_column[start:stop])
        paired_groups.append(list(zip(pos_group, neg_group)))

    (
        paired_formal_embeddings,
        paired_complex_embeddings,
        paired_contraction_embeddings,
        paired_number_embeddings,
    ) = paired_groups
    return paired_formal_embeddings, paired_complex_embeddings, paired_contraction_embeddings, paired_number_embeddings

def compute_accuracy(paired_embeddings: list):
    """Score how often matching-style sentences are closer than crossed ones.

    Every unordered combination of two pairs ``(A1, A2)`` and ``(S1, S2)`` is
    one task. With cosine similarities ``sim``, the matched assignment has
    distance ``(1 - sim(A1, S1))^2 + (1 - sim(A2, S2))^2`` and the crossed
    assignment has ``(1 - sim(A1, S2))^2 + (1 - sim(A2, S1))^2``. A task scores
    1 when the matched distance is strictly smaller, 0.5 on an exact tie and 0
    otherwise.

    Args:
        paired_embeddings: List of ``(pos, neg)`` embedding vectors (1-D NumPy arrays).

    Returns:
        The summed score divided by the number of pair combinations,
        ``n * (n - 1) / 2``.

    Raises:
        ZeroDivisionError: If fewer than two pairs are given, since there is no
            combination to score.
    """
    # Normalize each vector once up front instead of once per inner-loop
    # iteration; the dot products below are then cosine similarities.
    unit_pairs = [
        (pos / np.linalg.norm(pos), neg / np.linalg.norm(neg))
        for pos, neg in paired_embeddings
    ]

    accuracy = 0
    for i, (anchor_pos, anchor_neg) in enumerate(unit_pairs):
        for alt_pos, alt_neg in unit_pairs[i + 1:]:
            sim1 = np.dot(anchor_pos, alt_pos)
            sim2 = np.dot(anchor_neg, alt_neg)
            sim3 = np.dot(anchor_pos, alt_neg)
            sim4 = np.dot(anchor_neg, alt_pos)

            matched_distance = math.pow(1 - sim1, 2) + math.pow(1 - sim2, 2)
            crossed_distance = math.pow(1 - sim3, 2) + math.pow(1 - sim4, 2)

            if matched_distance == crossed_distance:
                # Exact tie: credit half a point.
                accuracy += 0.5
            elif matched_distance < crossed_distance:
                accuracy += 1

    return accuracy / (len(paired_embeddings) * (len(paired_embeddings) - 1) / 2)

def STEL_benchmark(dataset_pos, dataset_neg, model):
    """Run the full benchmark for one model and return the per-feature accuracies.

    Embeds the positive and negative texts with ``compute_embeddings``, splits
    them into the four feature groups with ``convert_embeddings`` and scores
    each group with ``compute_accuracy``.

    Returns:
        ``(formal_accuracy, complex_accuracy, contraction_accuracy,
        number_accuracy, avg_accuracy)``, where the last entry is the
        arithmetic mean of the first four.
    """
    embedded_pos, embedded_neg = compute_embeddings(dataset_pos, dataset_neg, model)
    formal_pairs, complex_pairs, contraction_pairs, number_pairs = convert_embeddings(
        embedded_pos, embedded_neg
    )

    # Score the groups in a fixed order: formal, complex, contraction, number.
    formal_accuracy, complex_accuracy, contraction_accuracy, number_accuracy = (
        compute_accuracy(group_pairs)
        for group_pairs in (formal_pairs, complex_pairs, contraction_pairs, number_pairs)
    )

    avg_accuracy = (formal_accuracy + complex_accuracy + contraction_accuracy + number_accuracy) / 4
    return formal_accuracy, complex_accuracy, contraction_accuracy, number_accuracy, avg_accuracy

def STEL_print(model):
    """Benchmark ``model`` on the global ``stel_dataset`` and print one line per accuracy.

    Reads the ``"positive"`` and ``"negative"`` columns of ``stel_dataset.output``
    and prints the formal, complex, contraction, number and average accuracies.
    """
    positive_texts = stel_dataset.output["positive"]
    negative_texts = stel_dataset.output["negative"]
    formal_accuracy, complex_accuracy, contraction_accuracy, number_accuracy, avg_accuracy = STEL_benchmark(
        positive_texts, negative_texts, model
    )

    report_lines = [
        f"Formal Accuracy for {model} Embeddings: {formal_accuracy}",
        f"Complex Accuracy for {model} Embeddings: {complex_accuracy}",
        f"Contraction Accuracy for {model} Embeddings: {contraction_accuracy}",
        f"Number Accuracy for {model} Embeddings: {number_accuracy}",
        f"Average Accuracy for {model} Embeddings: {avg_accuracy}",
    ]
    for report_line in report_lines:
        print(report_line)

def STEL_table(model):
    """Benchmark ``model`` on the global ``stel_dataset`` and print the results as a table.

    The printed DataFrame has a ``Metric`` column with the five accuracy labels
    and a ``"<model> Embeddings"`` column with the corresponding values.
    """
    positive_texts = stel_dataset.output["positive"]
    negative_texts = stel_dataset.output["negative"]
    scores = STEL_benchmark(positive_texts, negative_texts, model)
    formal_accuracy, complex_accuracy, contraction_accuracy, number_accuracy, avg_accuracy = scores

    metric_labels = ['Formal Accuracy', 'Complex Accuracy', 'Contraction Accuracy', 'Number Accuracy', 'Average Accuracy']
    metric_values = [formal_accuracy, complex_accuracy, contraction_accuracy, number_accuracy, avg_accuracy]
    results_table = pd.DataFrame({
        'Metric': metric_labels,
        f'{model} Embeddings': metric_values,
    })
    print(results_table)


In [9]:
# Print one results table per embedding model, in this order.
for benchmark_model in (
    "AnnaWegmann/Style-Embedding",
    "sentence-transformers/all-mpnet-base-v2",
    "google-bert/bert-base-uncased",
    "google-bert/bert-base-cased",
    "google-bert/bert-base-multilingual-cased",
):
    STEL_table(benchmark_model)

[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'AnnaWegmann Style-Embedding Embeddings for Positive Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'AnnaWegmann Style-Embedding Embeddings for Negative Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Done. ✨ Results in folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'sentence-transformers all-mpnet-base-v2 Embeddings for Positive Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'sentence-transformers all-mpnet-base-v2 Embeddings for Negative Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea

                 Metric  AnnaWegmann/Style-Embedding Embeddings
0       Formal Accuracy                                0.885253
1      Complex Accuracy                                0.721212
2  Contraction Accuracy                                0.983636
3       Number Accuracy                                0.973333
4      Average Accuracy                                0.890859


[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'google-bert bert-base-uncased Embeddings for Positive Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'google-bert bert-base-uncased Embeddings for Negative Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Done. ✨ Results in folder: ./output


                 Metric  sentence-transformers/all-mpnet-base-v2 Embeddings
0       Formal Accuracy                                           0.823232 
1      Complex Accuracy                                           0.618384 
2  Contraction Accuracy                                           0.878182 
3       Number Accuracy                                           0.925859 
4      Average Accuracy                                           0.811414 


[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'google-bert bert-base-cased Embeddings for Positive Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'google-bert bert-base-cased Embeddings for Negative Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Done. ✨ Results in folder: ./output


                 Metric  google-bert/bert-base-uncased Embeddings
0       Formal Accuracy                                  0.973131
1      Complex Accuracy                                  0.881818
2  Contraction Accuracy                                  0.977980
3       Number Accuracy                                  0.985859
4      Average Accuracy                                  0.954697


[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'google-bert bert-base-multilingual-cased Embeddings for Positive Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'google-bert bert-base-multilingual-cased Embeddings for Negative Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Done. ✨ Results in folder: ./output


                 Metric  google-bert/bert-base-cased Embeddings
0       Formal Accuracy                                0.968081
1      Complex Accuracy                                0.852929
2  Contraction Accuracy                                0.986667
3       Number Accuracy                                0.997374
4      Average Accuracy                                0.951263
                 Metric  google-bert/bert-base-multilingual-cased Embeddings
0       Formal Accuracy                                           0.954949  
1      Complex Accuracy                                           0.793333  
2  Contraction Accuracy                                           0.997778  
3       Number Accuracy                                           0.983636  
4      Average Accuracy                                           0.932424  


In [5]:
# `import STEL.src.eval_style_models` binds only the top-level name `STEL`, so the
# bare name `eval_style_models` used below would be undefined. Import the
# submodule under its own name instead.
from STEL.src import eval_style_models
import style_similarity

# NOTE(review): the recorded run of this cell failed with
# "ModuleNotFoundError: No module named 'to_add_const'". Presumably STEL's own
# modules expect the STEL checkout directory on sys.path — confirm and set up
# the path before relying on this cell.
eval_style_models.eval_sim(style_objects=[style_similarity.WordLengthSimilarity()])

ModuleNotFoundError: No module named 'to_add_const'