In [1]:
# | default_exp framedimensions
# | export
from sentence_transformers import SentenceTransformer
import numpy as np
import matplotlib.pyplot as plt


class FramingDimensions:
    """Score texts along bipolar axes built from sentence embeddings.

    Every entry of ``dimensions`` is a prompt whose first word (before the
    ``":"``) is the dimension's name. Each ``(pole1, pole2)`` pair in
    ``pole_names`` defines one axis whose vector is
    ``embedding(pole1) - embedding(pole2)``. The axis vectors are not
    re-normalized, so the scores returned by calling the instance are plain
    dot products, not cosine similarities.
    """

    def __init__(self, base_model, dimensions, pole_names, normalize=False):
        """Load the encoder, embed the dimension prompts and build the axes.

        Args:
            base_model: Model name or path handed to ``SentenceTransformer``.
            dimensions: List of prompts; the text before the first ``":"``
                (up to its first space) is used as the dimension name.
            pole_names: Iterable of ``(pole1, pole2)`` name pairs, each name
                matching one of the dimension names.
            normalize: Forwarded as ``normalize_embeddings`` both when
                encoding the prompts here and when encoding texts later.

        Raises:
            ValueError: If a pole name does not match any dimension name.
        """
        self.encoder = SentenceTransformer(base_model)
        self.normalize = normalize
        self.dimensions = dimensions
        self.dim_embs = self.encoder.encode(dimensions, normalize_embeddings=normalize)
        self.pole_names = pole_names
        self.axis_names = [pole1 + "/" + pole2 for pole1, pole2 in pole_names]

        # Parse the names once; they do not change while the axes are built.
        dimension_names = self.get_dimension_names()
        axis_embs = []
        for pole1, pole2 in pole_names:
            p1 = self._pole_index(dimension_names, pole1)
            p2 = self._pole_index(dimension_names, pole2)
            axis_emb = self.dim_embs[p1] - self.dim_embs[p2]
            axis_embs.append(axis_emb)
            # Diagnostic output: a difference of two vectors is generally not
            # unit length, even when both poles are.
            print(f"axis embedding norm: {np.linalg.norm(axis_emb)}")
            # Pole norms are ~1 when normalize=True; otherwise they depend on
            # what the chosen model outputs.
            print(f"pole 1 embedding norm: {np.linalg.norm(self.dim_embs[p1])}")
            print(f"pole 2 embedding norm: {np.linalg.norm(self.dim_embs[p2])}")
        self.axis_embs = np.stack(axis_embs)

    @staticmethod
    def _pole_index(dimension_names, pole):
        """Return the position of ``pole`` in ``dimension_names`` or raise a clear error."""
        try:
            return dimension_names.index(pole)
        except ValueError:
            raise ValueError(
                f"pole name {pole!r} not found among dimension names {dimension_names}"
            ) from None

    def get_dimension_names(self):
        """Return the name of each dimension prompt.

        The name is the first space-separated word of the text that precedes
        the first ``":"`` in the prompt.
        """
        return [prompt.split(":")[0].split(" ")[0] for prompt in self.dimensions]

    def __call__(self, sequence_to_align):
        """Project text embeddings onto every axis.

        Args:
            sequence_to_align: Text (or texts) passed straight to the
                encoder's ``encode``.

        Returns:
            Dict mapping each entry of ``pole_names`` to its score(s). For a
            single string each value is a scalar; for a batch each value is
            presumably an array with one score per text (depends on the shape
            the encoder returns).
        """
        embs = self.encoder.encode(sequence_to_align, normalize_embeddings=self.normalize)
        # No axis is given, so for a batch this is a single norm over the
        # whole embedding array rather than one norm per text.
        print(f"text embedding: {np.linalg.norm(embs)}")
        scores = embs @ self.axis_embs.T
        # Transposing puts axes first so each pole pair gets its own scores.
        named_scores = dict(zip(self.pole_names, scores.T))
        return named_scores

    def visualize(self, align_scores_df, **kwargs):
        """Plot the mean score per axis as a dot between its two poles.

        Args:
            align_scores_df: DataFrame whose columns are ``(pole1, pole2)``
                pairs and whose rows are scored texts.
            **kwargs: Accepted for interface compatibility; currently unused.

        Returns:
            The matplotlib figure. ``pole2`` labels sit on the left y-axis and
            ``pole1`` labels on the right; dots are blue for a positive mean
            and red otherwise, and their area grows with the column variance.
        """
        name_left = align_scores_df.columns.map(lambda pair: pair[1])
        name_right = align_scores_df.columns.map(lambda pair: pair[0])
        bias = align_scores_df.mean()
        color = ["b" if value > 0 else "r" for value in bias]
        # Variance is NaN for a single row; the small offset keeps a dot visible.
        inten = (align_scores_df.var().fillna(0) + 0.001) * 50_000
        bounds = bias.abs().max() * 1.1

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.scatter(x=bias, y=name_left, s=inten, c=color)
        ax.axvline(0)
        ax.set_xlim(-bounds, bounds)  # symmetric so zero stays centered
        ax.invert_yaxis()  # first axis at the top
        # The twin axis mirrors the (inverted) limits and ticks so the right
        # labels line up with the left ones.
        axi = ax.twinx()
        axi.set_ylim(ax.get_ylim())
        axi.set_yticks(ax.get_yticks(), labels=name_right)
        fig.tight_layout()
        return fig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_model = "all-mpnet-base-v2"

# One entry per axis: the prompt for pole 1 followed by the prompt for pole 2.
# Keeping both prompts of an axis side by side means the flat prompt list and
# the pole-name pairs below are derived from a single source.
# NOTE(review): "Degredation" looks like a misspelling of "Degradation"; it is
# left untouched because it is part of the encoded prompt and of the result keys.
axis_prompts = [
    (
        "Care: ...acted with kindness, compassion, or empathy, or nurtured another person.",
        "Harm: ...acted with cruelty, or hurt or harmed another person/animal and caused suffering.",
    ),
    (
        "Fairness: ...acted in a fair manner, promoting equality, justice, or rights.",
        "Cheating: ...was unfair or cheated, or caused an injustice or engaged in fraud.",
    ),
    (
        "Loyalty: ...acted with fidelity, or as a team player, or was loyal or patriotic.",
        "Betrayal: ...acted disloyal, betrayed someone, was disloyal, or was a traitor.",
    ),
    (
        "Authority: ...obeyed, or acted with respect for authority or tradition.",
        "Subversion: ...disobeyed or showed disrespect, or engaged in subversion or caused chaos.",
    ),
    (
        "Sanctity: ...acted in a way that was wholesome or sacred, or displayed purity or sanctity.",
        "Degredation: ...was depraved, degrading, impure, or unnatural.",
    ),
]

# Flatten to the prompt list, in the order pole 1, pole 2, pole 1, pole 2, ...
dimensions = [prompt for pair in axis_prompts for prompt in pair]
# The name of a pole is the text in front of the colon of its prompt.
pole_names = [tuple(prompt.split(":")[0] for prompt in pair) for pair in axis_prompts]

framing_dimen_model = FramingDimensions(base_model, dimensions, pole_names)

axis embedding norm: 1.0412116050720215
pole 1 embedding norm: 1.0
pole 2 embedding norm: 1.0
axis embedding norm: 1.0664228200912476
pole 1 embedding norm: 1.0
pole 2 embedding norm: 1.0
axis embedding norm: 0.889381468296051
pole 1 embedding norm: 1.0
pole 2 embedding norm: 0.9999999403953552
axis embedding norm: 1.1478166580200195
pole 1 embedding norm: 0.9999999403953552
pole 2 embedding norm: 1.0
axis embedding norm: 1.1032711267471313
pole 1 embedding norm: 0.9999999403953552
pole 2 embedding norm: 0.9999999403953552


In [3]:
# Build every model twice, with and without embedding normalization, so the
# norms printed by the constructor can be compared side by side.
# Both result names are rebound on each pass, so after the loop they refer to
# the models built from the last entry of `base_models`.
base_models = ["all-mpnet-base-v2", "paraphrase-MiniLM-L6-v2"]
for model in base_models:
    print(model, "Normalized:", sep="\n")
    normalized_model = FramingDimensions(model, dimensions, pole_names, normalize=True)
    print("Unnormalized:")
    unnormalized_model = FramingDimensions(model, dimensions, pole_names, normalize=False)

all-mpnet-base-v2
Normalized:
axis embedding norm: 1.0412116050720215
pole 1 embedding norm: 1.0
pole 2 embedding norm: 1.0
axis embedding norm: 1.0664228200912476
pole 1 embedding norm: 1.0
pole 2 embedding norm: 1.0
axis embedding norm: 0.889381468296051
pole 1 embedding norm: 1.0
pole 2 embedding norm: 0.9999999403953552
axis embedding norm: 1.1478166580200195
pole 1 embedding norm: 1.0
pole 2 embedding norm: 1.0
axis embedding norm: 1.1032711267471313
pole 1 embedding norm: 1.0
pole 2 embedding norm: 0.9999999403953552
Unnormalized:
axis embedding norm: 1.0412116050720215
pole 1 embedding norm: 1.0
pole 2 embedding norm: 1.0
axis embedding norm: 1.0664228200912476
pole 1 embedding norm: 1.0
pole 2 embedding norm: 1.0
axis embedding norm: 0.889381468296051
pole 1 embedding norm: 1.0
pole 2 embedding norm: 0.9999999403953552
axis embedding norm: 1.1478166580200195
pole 1 embedding norm: 0.9999999403953552
pole 2 embedding norm: 1.0
axis embedding norm: 1.1032711267471313
pole 1 embed

In [4]:
# `normalized_model` is whatever the comparison loop bound last, i.e. the
# normalized variant of the final entry in `base_models` when cells run in order.
sample_text = "I helped an old lady cross the street."
normalized_model(sample_text)

text embedding: 1.0


{('Care', 'Harm'): 0.04329085,
 ('Fairness', 'Cheating'): 0.08081094,
 ('Loyalty', 'Betrayal'): -0.08368337,
 ('Authority', 'Subversion'): 0.14771362,
 ('Sanctity', 'Degredation'): -0.055849385}