In [5]:
from datadreamer import DataDreamer
from datadreamer.llms import OpenAI
from datadreamer.steps import DataFromPrompt, Embed, CosineSimilarity, concat, HFHubDataSource
from datadreamer.embedders import SentenceTransformersEmbedder
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import numpy as np
import matplotlib.pyplot as plt
import math
import re
import pandas as pd
import os
import tabulate

In [6]:
# Load the benchmark data inside a DataDreamer session rooted at ./output.
# The step is named "Lexical Features" and reads the "train" split of the
# Hugging Face Hub dataset "jjz5463/full_set_features_2.0".
# `stel_dataset` is a notebook-level global: the helper functions defined
# further down read its "positive", "negative" and "feature" columns.
with DataDreamer("./output"):
    stel_dataset = HFHubDataSource(
        "Lexical Features",
        path="jjz5463/full_set_features_2.0",
        split="train"
    )

[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'Lexical Features' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Done. ✨ Results in folder: ./output


In [7]:
def compute_embeddings(
        dataset_pos, dataset_neg, model: str
):
    """Run one ``Embed`` step for the positive texts and one for the negative texts.

    Both steps are created inside a single ``DataDreamer("./output")`` session,
    each with its own ``SentenceTransformersEmbedder(model_name=model)`` and
    ``truncate=True``. The step names are derived from ``model`` with every
    ``/`` replaced by a space.

    Args:
        dataset_pos: Value passed as the ``texts`` input of the positive step.
        dataset_neg: Value passed as the ``texts`` input of the negative step.
        model: Model identifier handed to ``SentenceTransformersEmbedder``.

    Returns:
        Tuple ``(pos_embedded_data, neg_embedded_data)`` of the two ``Embed``
        step objects, in that order.
    """
    # (step name, texts) for each side; positive first so it is also run first.
    step_specs = (
        (f"{model.replace('/', ' ')} Embeddings for Positive Examples", dataset_pos),
        (f"{model.replace('/', ' ')} Embeddings for Negative Examples", dataset_neg),
    )

    embedded_steps = []
    with DataDreamer("./output"):
        for step_name, texts in step_specs:
            embedded_steps.append(
                Embed(
                    name=step_name,
                    inputs={"texts": texts},
                    args={
                        "embedder": SentenceTransformersEmbedder(model_name=model),
                        "truncate": True,
                    },
                    outputs={"texts": "sentences", "embeddings": "embeddings"},
                )
            )

    pos_embedded_data, neg_embedded_data = embedded_steps
    return pos_embedded_data, neg_embedded_data

def convert_embeddings(pos_embedded_data, neg_embedded_data, group_size: int = 100):
    """Split the embedded rows into consecutive blocks of aligned (pos, neg) pairs.

    Block ``k`` is built from rows ``[k * group_size, (k + 1) * group_size)`` of
    each argument's ``output["embeddings"]``. The number of blocks is
    ``len(pos_embedded_data.output) // group_size``, so a trailing partial
    block is dropped.

    Args:
        pos_embedded_data: Object whose ``.output`` supports ``len()`` and
            ``["embeddings"][start:stop]``.
        neg_embedded_data: Object whose ``.output`` supports
            ``["embeddings"][start:stop]``; row ``r`` is paired with row ``r``
            of ``pos_embedded_data``.
        group_size: Rows per block. Defaults to 100, the value that used to be
            hard-coded here.

    Returns:
        A list with one entry per block; each entry is a list of
        ``(pos_row, neg_row)`` tuples of numpy arrays.

    Raises:
        ValueError: If ``group_size`` is not a positive integer.
    """
    if group_size <= 0:
        raise ValueError(f"group_size must be a positive integer, got {group_size!r}")

    n_groups = len(pos_embedded_data.output) // group_size
    paired_embeddings = []
    for start in range(0, n_groups * group_size, group_size):
        stop = start + group_size
        pos_embeddings = np.array(pos_embedded_data.output["embeddings"][start:stop])
        neg_embeddings = np.array(neg_embedded_data.output["embeddings"][start:stop])
        # zip pairs row r of the positive block with row r of the negative block.
        paired_embeddings.append(list(zip(pos_embeddings, neg_embeddings)))
    return paired_embeddings

def compute_accuracy_STEL(paired_embeddings: list):
    """Score one block of (pos, neg) embedding pairs on the STEL ordering task.

    For every unordered combination of two pairs (anchor ``i``, alternative
    ``j > i``) the L2-normalised embeddings are compared by cosine similarity:

    * matched:  ``(1 - cos(anchor_pos, alt_pos))**2 + (1 - cos(anchor_neg, alt_neg))**2``
    * crossed:  ``(1 - cos(anchor_pos, alt_neg))**2 + (1 - cos(anchor_neg, alt_pos))**2``

    A combination scores 1 when ``matched < crossed``, 0.5 when they are
    exactly equal, and 0 otherwise.

    Args:
        paired_embeddings: List of ``(pos, neg)`` tuples of 1-D numpy arrays.

    Returns:
        The mean score over all ``n * (n - 1) / 2`` combinations as a float,
        or ``nan`` when fewer than two pairs are supplied (there is nothing to
        compare, and dividing by zero combinations used to raise
        ``ZeroDivisionError``).
    """
    n_pairs = len(paired_embeddings)
    if n_pairs < 2:
        return float("nan")

    # Normalise each vector once instead of once per (i, j) combination.
    unit_pairs = [
        (pos / np.linalg.norm(pos), neg / np.linalg.norm(neg))
        for pos, neg in paired_embeddings
    ]

    accuracy = 0
    for i, (anchor_pos, anchor_neg) in enumerate(unit_pairs):
        for alt_pos, alt_neg in unit_pairs[i + 1:]:
            matched = (
                math.pow(1 - np.dot(anchor_pos, alt_pos), 2)
                + math.pow(1 - np.dot(anchor_neg, alt_neg), 2)
            )
            crossed = (
                math.pow(1 - np.dot(anchor_pos, alt_neg), 2)
                + math.pow(1 - np.dot(anchor_neg, alt_pos), 2)
            )
            if matched == crossed:
                # Exact tie: credit as a coin flip.
                accuracy += 0.5
            elif matched < crossed:
                accuracy += 1
    return accuracy / (n_pairs * (n_pairs - 1) / 2)

def compute_accuracy_STEL_or_content(paired_embeddings: list):
    """Score one block of (pos, neg) embedding pairs on the STEL-or-content task.

    For every unordered combination of two pairs (anchor ``i``, alternative
    ``j > i``) two cosine similarities of L2-normalised embeddings are compared:

    * ``cos(anchor_pos, alt_pos)``     - anchor vs. the other pair's positive
    * ``cos(anchor_pos, anchor_neg)``  - anchor vs. its own negative

    A combination scores 1 when the first is strictly larger, 0.5 when both are
    exactly equal, and 0 otherwise. The alternative pair's negative embedding
    does not take part in the comparison.

    Args:
        paired_embeddings: List of ``(pos, neg)`` tuples of 1-D numpy arrays.

    Returns:
        The mean score over all ``n * (n - 1) / 2`` combinations as a float,
        or ``nan`` when fewer than two pairs are supplied (there is nothing to
        compare, and dividing by zero combinations used to raise
        ``ZeroDivisionError``).
    """
    n_pairs = len(paired_embeddings)
    if n_pairs < 2:
        return float("nan")

    # Normalise each vector once instead of once per (i, j) combination.
    unit_pairs = [
        (pos / np.linalg.norm(pos), neg / np.linalg.norm(neg))
        for pos, neg in paired_embeddings
    ]

    accuracy = 0
    for i, (anchor_pos, anchor_neg) in enumerate(unit_pairs):
        # Depends only on the anchor, so compute it once per anchor.
        own_negative_sim = np.dot(anchor_pos, anchor_neg)
        for alt_pos, _ in unit_pairs[i + 1:]:
            other_positive_sim = np.dot(anchor_pos, alt_pos)
            if other_positive_sim == own_negative_sim:
                # Exact tie: credit as a coin flip.
                accuracy += 0.5
            elif other_positive_sim > own_negative_sim:
                accuracy += 1
    return accuracy / (n_pairs * (n_pairs - 1) / 2)

def STEL_benchmark(dataset_pos, dataset_neg, model, type='STEL'):
    """Embed both text sets with ``model`` and score every block of pairs.

    Args:
        dataset_pos: Positive texts, forwarded to ``compute_embeddings``.
        dataset_neg: Negative texts, forwarded to ``compute_embeddings``.
        model: Model identifier, forwarded to ``compute_embeddings``.
        type: Which scorer to apply to each block: ``'STEL'`` uses
            ``compute_accuracy_STEL``; ``'STEL-or-content'`` uses
            ``compute_accuracy_STEL_or_content``.

    Returns:
        Tuple ``(accuracies, avg_accuracy)``: a list with one score per block
        returned by ``convert_embeddings`` and the ``np.mean`` of that list.

    Raises:
        ValueError: If ``type`` is not one of the two supported names.
            Previously an unknown name produced an empty list and a NaN
            average, and only after the embeddings had been computed.
    """
    # Validate before doing any embedding work so a typo fails immediately.
    if type not in ('STEL', 'STEL-or-content'):
        raise ValueError(
            f"Unknown benchmark type {type!r}; expected 'STEL' or 'STEL-or-content'"
        )
    score_block = (
        compute_accuracy_STEL if type == 'STEL' else compute_accuracy_STEL_or_content
    )

    pos_embedded_data, neg_embedded_data = compute_embeddings(dataset_pos, dataset_neg, model)
    paired_embeddings = convert_embeddings(pos_embedded_data, neg_embedded_data)
    # Kept as a plain list: callers append to it.
    accuracies = [score_block(paired) for paired in paired_embeddings]
    avg_accuracy = np.mean(accuracies)
    return accuracies, avg_accuracy
    

def STEL_categories():
    """Return the ``'feature'`` value at the first row of each 100-row block.

    Reads the notebook-level ``stel_dataset``; the number of blocks is
    ``len(stel_dataset.output) // 100``, so a trailing partial block
    contributes no entry.
    """
    n_blocks = len(stel_dataset.output) // 100
    return [
        stel_dataset.output['feature'][first_row]
        for first_row in range(0, n_blocks * 100, 100)
    ]

def STEL_table(model, type='STEL'):
    """Build a per-category accuracy table for one embedding model.

    Runs ``STEL_benchmark`` on the ``"positive"`` and ``"negative"`` columns of
    the notebook-level ``stel_dataset`` and labels the scores with
    ``STEL_categories()``.

    Args:
        model: Model identifier; also used in the score column's header.
        type: Benchmark variant, passed through to ``STEL_benchmark``.

    Returns:
        A DataFrame with a ``'Metric'`` column (the category names followed by
        ``'average'``) and a ``'<model> Embeddings'`` column (the per-block
        accuracies followed by their mean).
    """
    per_category, overall = STEL_benchmark(
        stel_dataset.output["positive"],
        stel_dataset.output["negative"],
        model,
        type,
    )
    labels = STEL_categories()
    return pd.DataFrame({
        'Metric': labels + ['average'],
        f'{model} Embeddings': per_category + [overall],
    })

def merge_dfs(dfs):
    """Join several per-model tables side by side on their ``'Metric'`` column.

    Each frame is re-indexed by ``'Metric'`` on a copy, so the frames passed in
    are left untouched and the function can be called repeatedly on the same
    inputs. (With ``inplace=True`` a second call failed with
    ``KeyError: 'Metric'`` because the column had already been moved into the
    index.)

    Args:
        dfs: Iterable of DataFrames, each containing a ``'Metric'`` column.

    Returns:
        A DataFrame indexed by ``'Metric'`` holding the remaining columns of
        every input frame, concatenated along the column axis.
    """
    indexed = [df.set_index('Metric') for df in dfs]
    merged_df = pd.concat(indexed, axis=1)
    return merged_df


In [9]:
# Score every model on the STEL-or-content variant, in the order listed, then
# print a single Markdown table with one column per model.
STEL_OR_CONTENT_MODELS = (
    "AnnaWegmann/Style-Embedding",
    "sentence-transformers/all-mpnet-base-v2",
    "google-bert/bert-base-uncased",
    "google-bert/bert-base-cased",
    "google-bert/bert-base-multilingual-cased",
    "distilbert/distilbert-base-multilingual-cased",
)
dfs = [
    STEL_table(model_name, type='STEL-or-content')
    for model_name in STEL_OR_CONTENT_MODELS
]
# Keep the per-model names available at notebook level.
(
    wegmann_table,
    mpnet_base_table,
    bert_base_uncased_table,
    bert_base_cased_table,
    bert_base_multilingual_table,
    distilbert_base_multilingual_table,
) = dfs
merged_dfs = merge_dfs(dfs)
print(merged_dfs.to_markdown())

[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'AnnaWegmann Style-Embedding Embeddings for Positive Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'AnnaWegmann Style-Embedding Embeddings for Negative Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Done. ✨ Results in folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: ./output
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'sentence-transformers all-mpnet-base-v2 Embeddings for Positive Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'sentence-transformers all-mpnet-base-v2 Embeddings for Negative Examples' results loaded from disk. 🙌 It was previously run and saved.
[ [35m🤖 Data[33mDr[31mea

| Metric                                                 |   AnnaWegmann/Style-Embedding Embeddings |   sentence-transformers/all-mpnet-base-v2 Embeddings |   google-bert/bert-base-uncased Embeddings |   google-bert/bert-base-cased Embeddings |   google-bert/bert-base-multilingual-cased Embeddings |   distilbert/distilbert-base-multilingual-cased Embeddings |
|:-------------------------------------------------------|-----------------------------------------:|-----------------------------------------------------:|-------------------------------------------:|-----------------------------------------:|------------------------------------------------------:|-----------------------------------------------------------:|
| Polite / Impolite                                      |                                0.234949  |                                           0.00040404 |                                0.0294949   |                               0.0529293  |                                