In [2]:
from datadreamer import DataDreamer
from datadreamer.llms import OpenAI
from datadreamer.steps import DataFromPrompt, Embed, CosineSimilarity, concat, HFHubDataSource
from datadreamer.embedders import SentenceTransformersEmbedder
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import numpy as np
import matplotlib.pyplot as plt
import math
import re
import pandas as pd
import os
import tabulate

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the "train" split of the `jjz5463/full_set_features_2.0` dataset as a
# DataDreamer step named "Lexical Features" (presumably fetched from the
# Hugging Face Hub, going by the step class name -- confirm).
# Later cells read its "feature", "positive" and "negative" columns through
# `stel_dataset.output`.
with DataDreamer("./output"):  # session folder; the run log shows step results being reloaded from it
    stel_dataset = HFHubDataSource(
        "Lexical Features",
        path="jjz5463/full_set_features_2.0",
        split="train"
    )

[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: ./output


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Lexical Features' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Done. ✨ Results in folder: ./output


In [4]:
def compute_embeddings(
        dataset_pos, dataset_neg, model: str
):
    """Embed the positive and the negative texts with one sentence-transformers model.

    Two DataDreamer ``Embed`` steps are run inside a ``DataDreamer("./output")``
    session, positive examples first. Each step gets its own
    ``SentenceTransformersEmbedder`` and is called with ``truncate=True``.
    The step outputs are renamed to the columns ``"sentences"`` and
    ``"embeddings"``.

    Args:
        dataset_pos: Texts of the positive examples (passed as the ``texts`` input).
        dataset_neg: Texts of the negative examples (passed as the ``texts`` input).
        model: Model name handed to ``SentenceTransformersEmbedder``. Any ``/``
            in it is replaced by a space when building the step names.

    Returns:
        Tuple ``(pos_embedded_data, neg_embedded_data)`` holding the two
        ``Embed`` step objects.
    """
    step_prefix = model.replace('/', ' ')

    def run_embed_step(step_name, texts):
        # A new embedder instance is created for every step.
        embedder = SentenceTransformersEmbedder(model_name=model)
        return Embed(
            name=step_name,
            inputs={"texts": texts},
            args={"embedder": embedder, "truncate": True},
            outputs={"texts": "sentences", "embeddings": "embeddings"},
        )

    with DataDreamer("./output"):
        pos_embedded_data = run_embed_step(
            f"{step_prefix} Embeddings for Positive Examples", dataset_pos
        )
        neg_embedded_data = run_embed_step(
            f"{step_prefix} Embeddings for Negative Examples", dataset_neg
        )
    return pos_embedded_data, neg_embedded_data

def convert_embeddings(pos_embedded_data, neg_embedded_data, group_size: int = 100):
    """Split the embedded rows into consecutive groups of (positive, negative) pairs.

    Rows are taken in order, ``group_size`` at a time. Row ``k`` of the
    positive embeddings is paired with row ``k`` of the negative embeddings.
    Trailing rows that do not fill a whole group are ignored.

    Args:
        pos_embedded_data: Object whose ``.output`` supports ``len()`` and has
            an ``"embeddings"`` column that can be sliced.
        neg_embedded_data: Same, for the negative examples. It must have the
            same number of rows as ``pos_embedded_data``.
        group_size: Number of consecutive rows per group. Defaults to 100,
            the value that was previously hard-coded.

    Returns:
        A list with one entry per group. Each entry is a list of
        ``(pos_embedding, neg_embedding)`` tuples of NumPy arrays.

    Raises:
        ValueError: If ``group_size`` is not positive, or if the two inputs
            have a different number of rows (``zip`` would otherwise drop the
            unmatched rows silently).
    """
    if group_size <= 0:
        raise ValueError("group_size must be a positive integer")

    n_rows = len(pos_embedded_data.output)
    n_neg_rows = len(neg_embedded_data.output)
    if n_neg_rows != n_rows:
        raise ValueError(
            f"positive and negative embeddings differ in length: {n_rows} vs {n_neg_rows}"
        )

    # Look the embedding columns up once; the loop below only slices them.
    pos_column = pos_embedded_data.output["embeddings"]
    neg_column = neg_embedded_data.output["embeddings"]

    paired_embeddings = []
    for group in range(n_rows // group_size):
        start, stop = group * group_size, (group + 1) * group_size
        pos_embeddings = np.array(pos_column[start:stop])
        neg_embeddings = np.array(neg_column[start:stop])
        paired_embeddings.append(list(zip(pos_embeddings, neg_embeddings)))
    return paired_embeddings

def compute_accuracy(paired_embeddings: list):
    """Score how often matching sides are closer than mismatched sides.

    Each entry of ``paired_embeddings`` is a ``(pos, neg)`` pair of embedding
    vectors. For every unordered pair of entries ``(i, j)``, cosine
    similarities are computed between the four vectors and two costs are
    compared:

    * same-side:  ``(1 - sim(pos_i, pos_j))**2 + (1 - sim(neg_i, neg_j))**2``
    * cross-side: ``(1 - sim(pos_i, neg_j))**2 + (1 - sim(neg_i, pos_j))**2``

    A comparison scores 1 when the same-side cost is strictly smaller, 0.5 on
    an exact tie and 0 otherwise.

    Args:
        paired_embeddings: Sequence of ``(pos, neg)`` NumPy vectors with at
            least two entries.

    Returns:
        The mean score over all ``n * (n - 1) / 2`` comparisons, as a float.

    Raises:
        ValueError: If fewer than two entries are given, since there is then
            nothing to compare.
    """
    n_entries = len(paired_embeddings)
    if n_entries < 2:
        raise ValueError(
            "compute_accuracy needs at least two (pos, neg) pairs to compare, "
            f"got {n_entries}"
        )

    # Normalise every vector once up front rather than on each comparison.
    unit_pairs = [
        (pos / np.linalg.norm(pos), neg / np.linalg.norm(neg))
        for pos, neg in paired_embeddings
    ]

    accuracy = 0
    for i in range(n_entries):
        norm_anchor_pos, norm_anchor_neg = unit_pairs[i]
        for j in range(i + 1, n_entries):
            norm_alt_pos, norm_alt_neg = unit_pairs[j]
            sim1 = np.dot(norm_anchor_pos, norm_alt_pos)
            sim2 = np.dot(norm_anchor_neg, norm_alt_neg)
            sim3 = np.dot(norm_anchor_pos, norm_alt_neg)
            sim4 = np.dot(norm_anchor_neg, norm_alt_pos)
            # math.pow is kept on purpose: it evaluates in double precision
            # whatever the embedding dtype, so tie detection is unchanged.
            same_side = math.pow(1 - sim1, 2) + math.pow(1 - sim2, 2)
            cross_side = math.pow(1 - sim3, 2) + math.pow(1 - sim4, 2)
            if same_side == cross_side:
                accuracy += 0.5
            elif same_side < cross_side:
                accuracy += 1
    return accuracy / (n_entries * (n_entries - 1) / 2)

def STEL_benchmark(dataset_pos, dataset_neg, model):
    """Run the benchmark for one embedding model.

    The positive and negative texts are embedded with ``compute_embeddings``,
    grouped by ``convert_embeddings`` and each group is scored with
    ``compute_accuracy``.

    Args:
        dataset_pos: Texts of the positive examples.
        dataset_neg: Texts of the negative examples.
        model: Embedding model name forwarded to ``compute_embeddings``.

    Returns:
        Tuple ``(accuracies, avg_accuracy)``: the list of per-group scores and
        their ``np.mean``.
    """
    embedded = compute_embeddings(dataset_pos, dataset_neg, model)
    groups = convert_embeddings(*embedded)
    accuracies = [compute_accuracy(group) for group in groups]
    return accuracies, np.mean(accuracies)

def STEL_categories():
    """Return the ``feature`` value found at the first row of every block of 100 rows.

    Reads the notebook-level ``stel_dataset``. One label is produced per
    complete block of 100 rows; a trailing partial block yields no label.
    """
    n_blocks = len(stel_dataset.output) // 100
    return [stel_dataset.output['feature'][100 * block] for block in range(n_blocks)]

def STEL_table(model):
    """Benchmark ``model`` on ``stel_dataset`` and print the scores as a markdown table.

    The table has one row per category returned by ``STEL_categories`` plus a
    final ``average`` row. The score column is titled ``"<model> Embeddings"``.

    Args:
        model: Embedding model name forwarded to ``STEL_benchmark``.

    Returns:
        None. The table is written to stdout.
    """
    scores, mean_score = STEL_benchmark(
        stel_dataset.output["positive"],
        stel_dataset.output["negative"],
        model,
    )
    row_labels = STEL_categories() + ['average']
    score_column = scores + [mean_score]
    table = pd.DataFrame({
        'Metric': row_labels,
        f'{model} Embeddings': score_column,
    })
    print(table.to_markdown())


In [5]:
# Benchmark each embedding model in turn; every call prints one markdown table.
for stel_model_name in (
    "AnnaWegmann/Style-Embedding",
    "sentence-transformers/all-mpnet-base-v2",
    "google-bert/bert-base-uncased",
    "google-bert/bert-base-cased",
    "google-bert/bert-base-multilingual-cased",
):
    STEL_table(stel_model_name)

[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'AnnaWegmann Style-Embedding Embeddings for Positive Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'AnnaWegmann Style-Embedding Embeddings for Negative Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Done. ✨ Results in folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'sentence-transformers all-mpnet-base-v2 Embeddings for Positive Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'sentence-transformers all-mpnet-base-v2 Embeddings for Negative Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea

|    | Metric                                                 |   AnnaWegmann/Style-Embedding Embeddings |
|---:|:-------------------------------------------------------|-----------------------------------------:|
|  0 | Polite / Impolite                                      |                                 0.698586 |
|  1 | With Humor / Without Humor                             |                                 0.727879 |
|  2 | With sarcasm / Without sarcasm                         |                                 0.763232 |
|  3 | With metaphor / Without metaphor                       |                                 0.624646 |
|  4 | Offensive / Non-Offensive                              |                                 0.807879 |
|  5 | Positive / Negative                                    |                                 0.545051 |
|  6 | Active / Passive                                       |                                 0.643232 |
|  7 | Certain / Uncertain           

[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'google-bert bert-base-uncased Embeddings for Positive Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'google-bert bert-base-uncased Embeddings for Negative Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Done. ✨ Results in folder: ./output


|    | Metric                                                 |   sentence-transformers/all-mpnet-base-v2 Embeddings |
|---:|:-------------------------------------------------------|-----------------------------------------------------:|
|  0 | Polite / Impolite                                      |                                             0.853333 |
|  1 | With Humor / Without Humor                             |                                             0.746061 |
|  2 | With sarcasm / Without sarcasm                         |                                             0.718182 |
|  3 | With metaphor / Without metaphor                       |                                             0.814949 |
|  4 | Offensive / Non-Offensive                              |                                             0.913333 |
|  5 | Positive / Negative                                    |                                             0.967677 |
|  6 | Active / Passive                         

[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'google-bert bert-base-cased Embeddings for Positive Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'google-bert bert-base-cased Embeddings for Negative Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Done. ✨ Results in folder: ./output


|    | Metric                                                 |   google-bert/bert-base-uncased Embeddings |
|---:|:-------------------------------------------------------|-------------------------------------------:|
|  0 | Polite / Impolite                                      |                                   0.928889 |
|  1 | With Humor / Without Humor                             |                                   0.885657 |
|  2 | With sarcasm / Without sarcasm                         |                                   0.828283 |
|  3 | With metaphor / Without metaphor                       |                                   0.893333 |
|  4 | Offensive / Non-Offensive                              |                                   0.926869 |
|  5 | Positive / Negative                                    |                                   0.899394 |
|  6 | Active / Passive                                       |                                   0.871313 |
|  7 | Certain / Un

[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'google-bert bert-base-multilingual-cased Embeddings for Positive Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'google-bert bert-base-multilingual-cased Embeddings for Negative Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Done. ✨ Results in folder: ./output


|    | Metric                                                 |   google-bert/bert-base-cased Embeddings |
|---:|:-------------------------------------------------------|-----------------------------------------:|
|  0 | Polite / Impolite                                      |                                 0.941818 |
|  1 | With Humor / Without Humor                             |                                 0.881414 |
|  2 | With sarcasm / Without sarcasm                         |                                 0.78101  |
|  3 | With metaphor / Without metaphor                       |                                 0.881818 |
|  4 | Offensive / Non-Offensive                              |                                 0.938586 |
|  5 | Positive / Negative                                    |                                 0.891515 |
|  6 | Active / Passive                                       |                                 0.890707 |
|  7 | Certain / Uncertain           