In [None]:
import pandas as pd
from openai import OpenAI
from data_loader import get_dataset
from data_loader import BiosDataset, JigsawBias, CustomDataset, CrowSPairs, StereoSet, TwitterAAE, SBICDataset, ImplicitHate, WinoQueer

#import tiktoken
import numpy as np
import os
import pickle
import yaml
import json
from tqdm import tqdm

### (!) Specify your API Key

Create a file `api_key.txt` containing your OpenAI API key in the parent directory of this notebook (the next cell reads it from `../api_key.txt`) to run this notebook.

In [None]:
# Read the OpenAI API key from the key file one directory above this notebook.
# Trailing whitespace (such as the final newline) is removed before use.
with open('../api_key.txt', 'r') as key_file:
    api_key = key_file.read().rstrip()

# Module-level API client; the embedding code in this notebook calls it directly.
client = OpenAI(api_key=api_key)

In [None]:
# Name of the embedding model requested from the API. It is also part of the
# cache file names, so embeddings of different models are stored separately.
embedding_model = "text-embedding-3-large"
# NOTE(review): embedding_encoding and max_tokens are not referenced by any
# other cell visible in this notebook (the tiktoken import is commented out);
# texts appear to be sent to the API untruncated -- confirm this is intended.
embedding_encoding = "cl100k_base"
# NOTE(review): the comment below names text-embedding-3-small, while the
# model selected above is text-embedding-3-large -- confirm the limit that applies.
max_tokens = 800  # the maximum for text-embedding-3-small is 8191
# Directory in which all embedding caches and text dumps are stored.
emb_dir = '../../artifacts/embeddings/'

In [None]:
def get_raw_embeddings(texts, save_file, embedding_model, batch_size=10):
    """Return the raw embedding API responses for ``texts``, one per batch.

    If ``save_file`` already exists, its unpickled content is returned and no
    API request is made. The cached content is NOT validated against ``texts``
    or ``embedding_model`` here. Otherwise ``texts`` is sent to the
    module-level ``client`` in consecutive slices of ``batch_size`` items.

    This function never writes ``save_file``; persisting the result is left to
    the caller.

    Args:
        texts: sliceable sequence of inputs to embed.
        save_file: path of a pickle file holding previously fetched responses.
        embedding_model: model name forwarded to the embeddings endpoint.
        batch_size: number of texts per API request (default 10, the value
            that used to be hard-coded).

    Returns:
        list with one API response object per batch, in input order.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 and the API has to be
            queried.
    """
    if os.path.exists(save_file):
        # NOTE: pickle.load can execute arbitrary code -- only point this at
        # cache files that were produced by this notebook.
        with open(save_file, 'rb') as handle:
            return pickle.load(handle)

    if batch_size < 1:
        # a negative step would silently yield no batches at all
        raise ValueError("batch_size must be >= 1, got %r" % (batch_size,))

    responses = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]
        responses.append(client.embeddings.create(input=batch, model=embedding_model))

    return responses


def get_embeddings(texts, save_file_raw, save_file_np, embedding_model):
    """Return one embedding per entry of ``texts``, using on-disk caches.

    If ``save_file_np`` exists, the embeddings stored there are validated
    (model name and number of rows) and returned. Otherwise the raw API
    responses are obtained through ``get_raw_embeddings``, flattened into a
    numpy array, and both cache files are written.

    The result is the numpy array in both cases. Previously the first
    (computing) call returned a Python list while every later (cached) call
    returned the array.

    Args:
        texts: sequence of inputs; only its length and content are used.
        save_file_raw: pickle path for the raw API responses.
        save_file_np: pickle path for ``{'model': ..., 'embeddings': ndarray}``.
        embedding_model: model name; must match the one stored in the cache.

    Returns:
        the embeddings, one row per text.

    Raises:
        AssertionError: if the cached model differs from ``embedding_model``
            or the number of embeddings differs from ``len(texts)``.
    """
    if os.path.exists(save_file_np):
        # NOTE: pickle.load can execute arbitrary code -- only use cache files
        # that were produced by this notebook.
        with open(save_file_np, 'rb') as handle:
            emb_dict = pickle.load(handle)
        assert emb_dict['model'] == embedding_model
        embeddings = emb_dict['embeddings']
        assert len(embeddings) == len(texts), ("found %i embeddings for %i texts" % (len(embeddings), len(texts)))
        return embeddings

    # Make sure both cache files can be written *before* any API request is
    # made, so a missing directory cannot discard freshly fetched embeddings.
    for path in (save_file_raw, save_file_np):
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    raw_was_cached = os.path.exists(save_file_raw)
    out_chunks = get_raw_embeddings(texts, save_file_raw, embedding_model)

    # Only persist raw responses that were just fetched; re-pickling a cache
    # that was loaded a moment ago is wasted work on large datasets.
    if not raw_was_cached:
        with open(save_file_raw, 'wb') as handle:
            pickle.dump(out_chunks, handle)

    # Release each raw chunk as soon as it has been copied (necessary for
    # larger datasets). Reversing once and popping from the end keeps the
    # original order while avoiding the O(n) cost of list.pop(0) per chunk.
    out_chunks.reverse()
    embeddings = []
    while out_chunks:
        chunk = out_chunks.pop()
        embeddings.extend(elem.embedding for elem in chunk.data)

    assert len(embeddings) == len(texts), ("found %i embeddings for %i texts" % (len(embeddings), len(texts)))

    emb_arr = np.asarray(embeddings)
    del embeddings  # the array now holds the data; drop the list copy

    with open(save_file_np, 'wb') as handle:
        pickle.dump({'model': embedding_model, 'embeddings': emb_arr}, handle)

    return emb_arr


In [None]:
def get_split_emb(data, split, dataset_name, embedding_model):
    """Embed one dataset split and store its texts next to the embeddings.

    All files are placed in the module-level ``emb_dir``; their names are
    derived from ``dataset_name``, ``split`` and (for the embedding files)
    ``embedding_model``. The directory is created if it does not exist yet, so
    a fresh checkout does not fail at the first file write, which would happen
    only after the embedding requests have been made.

    Args:
        data: the texts of the split; passed to ``get_embeddings`` and pickled
            as-is into the text file.
        split: split name used in the log message and in the file names.
        dataset_name: dataset identifier used in the file names.
        embedding_model: model name forwarded to ``get_embeddings``.

    Returns:
        whatever ``get_embeddings`` returns for ``data``.
    """
    print("got %i samples for split %s" % (len(data), split))

    # must exist before get_embeddings tries to write its cache files
    os.makedirs(emb_dir, exist_ok=True)

    save_file_raw = ('%s/%s_%s_%s_raw_output.pickle' % (emb_dir, dataset_name, split, embedding_model))
    save_file_np = ('%s/%s_%s_%s.pickle' % (emb_dir, dataset_name, split, embedding_model))
    embeddings = get_embeddings(data, save_file_raw, save_file_np, embedding_model)

    # The text dump is written only once; an existing file is left untouched.
    text_file = ('%s/%s_%s_text_data.pickle' % (emb_dir, dataset_name, split))
    if not os.path.exists(text_file):
        with open(text_file, 'wb') as handle:
            pickle.dump(data, handle)

    return embeddings

### Embed splits of the datasets

In [None]:
# WinoQueer: embed the 'test' split and hand the result back to the dataset.
dataset_name = 'winoqueer'
dataset = WinoQueer(local_dir='../../data/winoqueer_final.csv')

# get_split yields six values; only the first one (data) is embedded here.
data, _, lbl, group_lbl, cw, gw = dataset.get_split('test')
emb = {'test': get_split_emb(data, 'test', dataset_name, embedding_model)}
dataset.set_preprocessed_data(emb)

In [None]:
# ImplicitHate: embed the 'test' split and hand the result back to the dataset.
dataset_name = 'implicit_hate'
dataset = ImplicitHate(local_dir='../../data/implicit-hate-corpus/')

# get_split yields six values; only the first one (data) is embedded here.
data, _, lbl, group_lbl, cw, gw = dataset.get_split('test')
emb = {'test': get_split_emb(data, 'test', dataset_name, embedding_model)}
dataset.set_preprocessed_data(emb)

In [None]:
# CrowS-Pairs: embed the 'test' split and hand the result back to the dataset.
dataset_name = 'crows_pairs'
dataset = CrowSPairs()

# get_split yields six values; only the first one (data) is embedded here.
data, _, lbl, group_lbl, cw, gw = dataset.get_split('test')
emb = {'test': get_split_emb(data, 'test', dataset_name, embedding_model)}
dataset.set_preprocessed_data(emb)

In [None]:
# StereoSet: this cell embeds the 'val' split (not 'test') and hands the
# result back to the dataset.
dataset_name = 'stereoset'
dataset = StereoSet()

# get_split yields six values; only the first one (data) is embedded here.
data, _, lbl, group_lbl, cw, gw = dataset.get_split('val')
emb = {'val': get_split_emb(data, 'val', dataset_name, embedding_model)}
dataset.set_preprocessed_data(emb)


In [None]:
# TwitterAAE: embed the 'test' split and hand the result back to the dataset.
dataset_name = 'twitterAAE'
dataset = TwitterAAE()

# get_split yields six values; only the first one (data) is embedded here.
data, _, lbl, group_lbl, cw, gw = dataset.get_split('test')
emb = {'test': get_split_emb(data, 'test', dataset_name, embedding_model)}
dataset.set_preprocessed_data(emb)

In [None]:
# SBIC: embed the train, test and dev splits, then attach all of them at once.
dataset_name = 'sbic'
local_dir = '../../data/filtered_sbic_minority_overview.csv'
dataset = SBICDataset(local_dir=local_dir)

emb = dict()
for split in ('train', 'test', 'dev'):
    # only the first returned value (data) is needed for the embedding call
    data, _, lbl, group_lbl, cw, gw = dataset.get_split(split)
    emb[split] = get_split_emb(data, split, dataset_name, embedding_model)

dataset.set_preprocessed_data(emb)

In [None]:
# Jigsaw (single-class option): embed the train, test and dev splits, then
# attach all of them at once.
dataset_name = 'jigsaw'
local_dir = '../../data/jigsaw_bias'
dataset = JigsawBias(local_dir=local_dir, option='single-class')

emb = dict()
for split in ('train', 'test', 'dev'):
    # only the first returned value (data) is needed for the embedding call
    data, _, lbl, group_lbl, cw, gw = dataset.get_split(split)
    emb[split] = get_split_emb(data, split, dataset_name, embedding_model)

dataset.set_preprocessed_data(emb)

In [None]:
# Bios (supervised option): embed the train, test and dev splits, then attach
# all of them at once.
dataset_name = 'bios-supervised'
local_dir = '../../data/bios_huggingface_merge.pkl'
dataset = BiosDataset(local_dir=local_dir, option='supervised')

emb = dict()
for split in ('train', 'test', 'dev'):
    # only the first returned value (data) is needed for the embedding call
    data, _, lbl, group_lbl, cw, gw = dataset.get_split(split)
    emb[split] = get_split_emb(data, split, dataset_name, embedding_model)

dataset.set_preprocessed_data(emb)

In [None]:
# Bios (unsupervised option): embed the train, test and dev splits, then
# attach all of them at once.
dataset_name = 'bios-unsupervised'
# Set the data path explicitly. This cell used to rely on `local_dir` being
# left over from the bios-supervised cell, so running it on its own, or after
# another dataset cell that also assigns `local_dir`, raised a NameError or
# silently loaded the wrong file.
local_dir = '../../data/bios_huggingface_merge.pkl'
dataset = BiosDataset(local_dir=local_dir, option='unsupervised')

emb = {}
for split in ['train', 'test', 'dev']:
    # only the first returned value (data) is needed for the embedding call
    data, _, lbl, group_lbl, cw, gw = dataset.get_split(split)
    emb[split] = get_split_emb(data, split, dataset_name, embedding_model)
dataset.set_preprocessed_data(emb)

### Embed training and test splits of the datasets

### Create dictionary with words/phrases used as defining terms in the experiments

In [None]:
# Pickle file with the term/phrase dictionary that later cells extend and
# write back to this same path.
dict_empty = '../../artifacts/embeddings/word_phrase_dict_empty.pickle'

# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(dict_empty, 'rb') as dict_file:
    word_phrase_emb_dict_empty = pickle.load(dict_file)

In [None]:
# Number of entries in the dictionary right after loading it (shown as the cell output).
len(word_phrase_emb_dict_empty)

In [None]:
# update word phrase dictionary given an experiment config
CONFIG_FILE = 'experiments/configs/new/experiment_config.json'

with open(CONFIG_FILE, 'r') as config_file:
    config = json.load(config_file)

# The experiment config stores the path of the YAML file that holds the
# evaluation setups; that file is parsed next.
eval_setup_file = config['bias_space_eval_config']
with open(eval_setup_file, 'r') as setup_file:
    eval_setups_by_attr = yaml.safe_load(setup_file)

In [None]:
# Register every defining term of every evaluation setup in the dictionary.
# Terms that are not present yet get a None placeholder; existing entries are
# left untouched.
for setup_name, content in eval_setups_by_attr.items():
    defining_terms = content['defining_terms']
    # each setup is expected to define terms for exactly one attribute
    assert len(defining_terms) == 1

    for group_term_dict in defining_terms.values():
        for terms in group_term_dict.values():
            for term in terms:
                word_phrase_emb_dict_empty.setdefault(term, None)

# write the extended dictionary back to the file it was loaded from
with open(dict_empty, 'wb') as handle:
    pickle.dump(word_phrase_emb_dict_empty, handle)

In [None]:
# Number of entries after adding the terms from the evaluation setups (shown as the cell output).
len(word_phrase_emb_dict_empty)

In [None]:
# update word phrase dict with new defining term config
def_term_confg = 'experiments/configs/new/defining_terms.yaml'

with open(def_term_confg, 'r') as ff:
    def_term_dict = yaml.safe_load(ff)

# Gather the terms of every group of every attribute, then drop duplicates.
phrases = []
for terms_per_group in def_term_dict['defining_terms'].values():
    for terms in terms_per_group.values():
        phrases.extend(terms)
phrases = list(set(phrases))

# Phrases that are not known yet get a None placeholder; existing entries are
# left untouched.
for phrase in phrases:
    word_phrase_emb_dict_empty.setdefault(phrase, None)

# write the extended dictionary back to disk
with open(dict_empty, 'wb') as handle:
    pickle.dump(word_phrase_emb_dict_empty, handle)

In [None]:
# Number of entries after adding the phrases from the defining-terms config (shown as the cell output).
len(word_phrase_emb_dict_empty)

In [None]:
# load or create new word/phrase dictionary for the current embedding model
dict_emb = ('../../artifacts/embeddings/word_phrase_dict_%s.pickle' % embedding_model)

if not os.path.exists(dict_emb):
    # nothing stored for this model yet: start with an empty dictionary
    word_phrase_emb_dict = {}
else:
    # load current state of dictionary
    with open(dict_emb, 'rb') as handle:
        loaded_dict = pickle.load(handle)
    # the file records the model that produced it; refuse to mix models
    prev_model = loaded_dict['model']
    assert prev_model == embedding_model
    word_phrase_emb_dict = loaded_dict['emb_dict']


### Embed the terms and phrases from the dictionary

In [None]:
# query word/phrase embedding for current embedding model
# save_dict['emb_dict'] is the very same object as word_phrase_emb_dict, so
# entries added below are part of what gets pickled.
save_dict = {'model': embedding_model, 'emb_dict': word_phrase_emb_dict}

try:
    # Only the keys matter here. The loop no longer binds a variable named
    # `emb`, which used to overwrite the module-level `emb` dict of the
    # dataset cells.
    for term in word_phrase_emb_dict_empty:
        if word_phrase_emb_dict.get(term) is not None:
            # embedding for this term or phrase already exists
            continue

        # call api
        print("call api for %s" % term)
        response = client.embeddings.create(input=[term], model=embedding_model)
        word_phrase_emb_dict[term] = response.data[0].embedding
finally:
    # Save in every case: if a request fails (or the cell is interrupted), the
    # embeddings fetched so far are kept and the next run resumes from there
    # instead of querying them again.
    with open(dict_emb, 'wb') as handle:
        pickle.dump(save_dict, handle)

In [None]:
# Number of terms/phrases held in the per-model embedding dictionary (shown as the cell output).
len(save_dict['emb_dict'])