In [None]:
import pandas as pd
from openai import OpenAI
from pie_data import get_dataset
#import tiktoken
import numpy as np
import os
import pickle
import yaml
import json
from tqdm import tqdm

### (!) Specify your API Key

Create a file `api_key.txt` with your OpenAI API Key to run this notebook

In [None]:
# Read the OpenAI API key from the local `api_key.txt`; trailing whitespace
# (such as the final newline) is removed before the key is used.
with open('api_key.txt', 'r') as key_file:
    api_key = key_file.read().rstrip()

# API client shared by all embedding requests in this notebook
client = OpenAI(api_key=api_key)

In [None]:
# Name of the OpenAI embedding model; it is passed to every API request and is
# part of every cache file name built in the cells below.
embedding_model = "text-embedding-3-small"
# Presumably the tiktoken encoding name for this model; not referenced by the
# cells visible here (the tiktoken import is commented out) - TODO confirm it is still needed.
embedding_encoding = "cl100k_base"
# Not referenced by the cells visible here either.
max_tokens = 800  # the maximum for text-embedding-3-small is 8191

In [None]:
def get_raw_embeddings(texts, save_file, embedding_model, batch_size=10):
    """Return the raw embedding API responses for ``texts``.

    If ``save_file`` exists, the pickled object stored there is returned and
    no API call is made. Otherwise ``texts`` is sent to the embeddings
    endpoint of the module-level ``client`` in consecutive batches. This
    function never writes ``save_file``; persisting the result is left to
    the caller.

    Args:
        texts: Sliceable sequence of input strings.
        save_file: Path of a pickle file holding previously fetched responses.
        embedding_model: Model name passed to the API.
        batch_size: Number of texts sent per request (default 10, the value
            that was previously hard-coded).

    Returns:
        The unpickled content of ``save_file`` if it exists, otherwise a list
        with one API response per batch, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    if os.path.exists(save_file):
        # NOTE: pickle.load can execute arbitrary code; only load cache files you created yourself.
        with open(save_file, 'rb') as handle:
            return pickle.load(handle)

    return [
        client.embeddings.create(input=texts[start:start + batch_size], model=embedding_model)
        for start in tqdm(range(0, len(texts), batch_size))
    ]


def get_embeddings(texts, save_file_raw, save_file_np, embedding_model):
    """Load or compute the embeddings for ``texts`` as a numpy array.

    If ``save_file_np`` exists, the array stored there is validated against
    ``embedding_model`` and ``len(texts)`` and returned. Otherwise the raw API
    responses are obtained via ``get_raw_embeddings``, flattened into one
    embedding per text, converted with ``np.asarray`` and pickled to
    ``save_file_np`` as ``{'model': ..., 'embeddings': ...}``.

    Both code paths return the array form, so callers see the same type on
    the first run and on cached runs.

    Args:
        texts: Sequence of input strings.
        save_file_raw: Pickle path for the raw API responses.
        save_file_np: Pickle path for the final embedding dictionary.
        embedding_model: Model name; stored alongside the embeddings.

    Returns:
        The embeddings, one entry per text (a 2-D ``numpy.ndarray`` when all
        vectors have the same length).

    Raises:
        AssertionError: If the cached model name differs from
            ``embedding_model`` or the number of embeddings does not match
            ``len(texts)``.
    """
    if os.path.exists(save_file_np):
        # NOTE: pickle.load can execute arbitrary code; only load cache files you created yourself.
        with open(save_file_np, 'rb') as handle:
            emb_dict = pickle.load(handle)
        assert emb_dict['model'] == embedding_model, (
            "cached embeddings were created with model %s, expected %s" % (emb_dict['model'], embedding_model))
        embeddings = emb_dict['embeddings']
        assert len(embeddings) == len(texts), ("found %i embeddings for %i texts" % (len(embeddings), len(texts)))
        return embeddings

    # Create the output directories up front: a missing directory should fail
    # here, not after all API requests have already been made.
    for path in (save_file_raw, save_file_np):
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    raw_was_cached = os.path.exists(save_file_raw)
    out_chunks = get_raw_embeddings(texts, save_file_raw, embedding_model)

    # Only write the raw responses when they are new; re-pickling a file that
    # was just loaded would produce the same content.
    if not raw_was_cached:
        with open(save_file_raw, 'wb') as handle:
            pickle.dump(out_chunks, handle)

    # remove chunks from raw embedding list while filling new list (necessary for larger datasets);
    # reversing once and popping from the end keeps the original order without
    # the linear cost of pop(0) on every step
    out_chunks.reverse()
    embeddings = []
    while out_chunks:
        chunk = out_chunks.pop()
        embeddings.extend(elem.embedding for elem in chunk.data)

    assert len(embeddings) == len(texts), ("found %i embeddings for %i texts" % (len(embeddings), len(texts)))

    emb_arr = np.asarray(embeddings)
    with open(save_file_np, 'wb') as handle:
        pickle.dump({'model': embedding_model, 'embeddings': emb_arr}, handle)

    return emb_arr


In [None]:
def get_split_emb(data, split, dataset_name, embedding_model):
    """Embed one dataset split, using cache files under ``embeddings/``.

    The cache file names are built from the dataset name, the split name and
    the embedding model; the actual work is delegated to ``get_embeddings``.

    Args:
        data: The texts of the split.
        split: Split name used in the log line and the file names.
        dataset_name: Dataset name used in the file names.
        embedding_model: Model name used in the file names and for the API.

    Returns:
        Whatever ``get_embeddings`` returns for this split.
    """
    print("got %i samples for split %s" % (len(data), split))

    # both cache files share the same three name components
    name_parts = (dataset_name, split, embedding_model)
    raw_path = 'embeddings/%s_%s_%s_raw_output.pickle' % name_parts
    np_path = 'embeddings/%s_%s_%s.pickle' % name_parts

    return get_embeddings(data, raw_path, np_path, embedding_model)

### Embed training and test splits of the datasets

In [None]:
# bios supervised
dataset_name = 'bios-supervised'
bios_dir = '../../data/bios_huggingface_merge.pkl'
(X_train, y_train, X_test, y_test,
 n_classes, multi_label, class_weights, protected_attr_dict) = get_dataset(dataset_name, local_dir=bios_dir)

# embed (or load the cached embeddings of) the train and test texts
emb = {
    'train': get_split_emb(X_train, 'train', dataset_name, embedding_model),
    'test': get_split_emb(X_test, 'test', dataset_name, embedding_model),
}

In [None]:
# twitterAAE
dataset_name = 'twitterAAE'
(X_train, y_train, X_test, y_test,
 n_classes, multi_label, class_weights, protected_attr_dict) = get_dataset(dataset_name)

# only the test texts are embedded for this dataset
emb = {'test': get_split_emb(X_test, 'test', dataset_name, embedding_model)}

In [None]:
# crowspairs
dataset_name = 'crows_pairs'
(X_train, y_train, X_test, y_test,
 n_classes, multi_label, class_weights, protected_attr_dict) = get_dataset(dataset_name)

# only the test texts are embedded for this dataset
emb = {'test': get_split_emb(X_test, 'test', dataset_name, embedding_model)}

In [None]:
# bios unsupervised
dataset_name = 'bios-unsupervised'
(X_train, y_train, X_test, y_test,
 n_classes, multi_label, class_weights, protected_attr_dict) = get_dataset(dataset_name)

# embed (or load the cached embeddings of) the train and test texts
emb = {
    'train': get_split_emb(X_train, 'train', dataset_name, embedding_model),
    'test': get_split_emb(X_test, 'test', dataset_name, embedding_model),
}

In [None]:
# jigsaw
dataset_name = 'jigsaw'
local_dir = '../../data/jigsaw_bias'
# The data location is passed by keyword, as in the bios-supervised cell, so it
# cannot silently bind to a different positional parameter of get_dataset.
X_train, y_train, X_test, y_test, n_classes, multi_label, class_weights, protected_attr_dict = get_dataset(dataset_name, local_dir=local_dir)

# embed (or load the cached embeddings of) the train and test texts
emb = {}
emb['train'] = get_split_emb(X_train, 'train', dataset_name, embedding_model)
emb['test'] = get_split_emb(X_test, 'test', dataset_name, embedding_model)

### Create dictionary with words/phrases used as defining terms in the experiments

In [None]:
# Pickle file with the model-independent term dictionary; its keys are the
# words/phrases that get embedded further below.
dict_empty = 'embeddings/word_phrase_dict_empty.pickle'

with open(dict_empty, 'rb') as dict_file:
    word_phrase_emb_dict_empty = pickle.load(dict_file)

In [None]:
# update word phrase dictionary given an experiment config
CONFIG_FILE = 'experiments/configs/esann25/experiment_config.json'

with open(CONFIG_FILE, 'r') as config_stream:
    config = json.load(config_stream)

# the experiment config names the YAML file that holds the evaluation setups
eval_setup_file = config['bias_space_eval_config']
with open(eval_setup_file, 'r') as setup_stream:
    eval_setups_by_attr = yaml.safe_load(setup_stream)

In [None]:
# number of terms in the dictionary before the defining terms from the config are added
len(word_phrase_emb_dict_empty)

In [None]:
# Register every defining term of every evaluation setup as a key of the
# model-independent term dictionary.
for attr, content in eval_setups_by_attr.items():
    defining_terms = content['defining_terms']
    assert len(defining_terms) == 1

    for group_terms in defining_terms[0]:
        for term in group_terms:
            # existing entries stay untouched; unseen terms are added with value None
            word_phrase_emb_dict_empty.setdefault(term, None)

# write the extended dictionary back to the file it was loaded from
with open(dict_empty, 'wb') as handle:
    pickle.dump(word_phrase_emb_dict_empty, handle)

In [None]:
# number of terms in the dictionary after the defining terms from the config were added
len(word_phrase_emb_dict_empty)

In [None]:
# load or create new word/phrase dictionary for the current embedding model
dict_emb = 'embeddings/word_phrase_dict_%s.pickle' % embedding_model

if not os.path.exists(dict_emb):
    # nothing saved for this model yet: start with an empty dictionary
    word_phrase_emb_dict = {}
else:
    # load current state of dictionary
    with open(dict_emb, 'rb') as handle:
        loaded_dict = pickle.load(handle)
    # the file records which model produced its embeddings; it must match
    prev_model = loaded_dict['model']
    assert prev_model == embedding_model
    word_phrase_emb_dict = loaded_dict['emb_dict']


### Embed the terms and phrases from the dictionary

In [None]:
# query word/phrase embedding for current embedding model
save_dict = {'model': embedding_model, 'emb_dict': word_phrase_emb_dict}

# Only the keys of the term dictionary are needed. The fetched vector is kept
# in `term_emb` so that the notebook-level `emb` dict holding the dataset
# embeddings is not overwritten by this loop.
for term in word_phrase_emb_dict_empty:
    if word_phrase_emb_dict.get(term) is not None:
        # embedding for this term or phrase already exists
        continue

    # call api
    print("call api for %s" % term)
    term_emb = client.embeddings.create(input=[term], model=embedding_model).data[0].embedding
    # save_dict['emb_dict'] is the same object as word_phrase_emb_dict,
    # so the new entry is visible through both names
    save_dict['emb_dict'][term] = term_emb

# persist the model name together with all term embeddings
with open(dict_emb, 'wb') as handle:
    pickle.dump(save_dict, handle)

In [None]:
# number of terms stored in the saved dictionary for the current embedding model
len(save_dict['emb_dict'])