In [1]:
from datasets import load_dataset
import torch 
from random import randint, seed
import numpy as np
from transformers import AutoModelForCausalLM,AutoTokenizer, pipeline
import evaluate
import matplotlib.pyplot as plt
import gc 
from tqdm.notebook import tqdm

In [2]:
def sample_prompts(dataset, num_examples=100):
    """Draw up to ``num_examples`` distinct items from ``dataset`` at random.

    Indices are drawn with ``random.randint``, so the result is reproducible
    after a call to ``random.seed``.

    Args:
        dataset: Any object supporting ``len()`` and integer indexing.
        num_examples: Number of items requested. Clamped to ``len(dataset)``
            when the dataset is smaller.

    Returns:
        A list of the sampled items, in draw order, with no index repeated.
    """
    size = len(dataset)
    num_examples = min(num_examples, size)

    # Track the *indices* already drawn. The previous version tested the index
    # against the list of sampled items, which never matched, so the same row
    # could be returned more than once.
    chosen_indices = set()
    picks = []
    for _ in range(num_examples):
        pick = randint(0, size - 1)
        while pick in chosen_indices:
            pick = randint(0, size - 1)
        chosen_indices.add(pick)
        picks.append(dataset[pick])
    return picks

In [3]:
# Prompt sources, one per measurement.
toxicity_dataset = load_dataset("allenai/real-toxicity-prompts", split="train")
regard_dataset = load_dataset("AlexaAI/bold", split="train")
honest_dataset = load_dataset(
    "MilaNLProc/honest",
    "en_queer_nonqueer",
    split="honest",
)

# Metric objects: regard in its "compare" configuration, toxicity with its
# default checkpoint, and the English HONEST configuration.
regard = evaluate.load("regard", "compare")
toxicity = evaluate.load("toxicity")
honest = evaluate.load("honest", "en")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/4.38k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.56k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.8k [00:00<?, ?B/s]

Generating honest split: 0 examples [00:00, ? examples/s]

Using default facebook/roberta-hate-speech-dynabench-r4-target checkpoint


In [4]:
# BOLD: keep the first prompt of every entry in the two actor categories.
female_bold = [
    entry["prompts"][0]
    for entry in regard_dataset
    if entry["category"] == "American_actresses"
]
male_bold = [
    entry["prompts"][0]
    for entry in regard_dataset
    if entry["category"] == "American_actors"
]

# HONEST: drop the trailing " [M]." mask so each template becomes an open prompt.
honest_queer = [
    row["template_masked"].replace(" [M].", "")
    for row in honest_dataset
    if row["category"].startswith("queer")
]
honest_nonqueer = [
    row["template_masked"].replace(" [M].", "")
    for row in honest_dataset
    if row["category"].startswith("nonqueer")
]

In [None]:
# Each seed() call pins the random draws of the sample_prompts calls after it.
seed(1000)
test_sample = sample_prompts(toxicity_dataset, num_examples=200)
test_prompts = [record['prompt']['text'] for record in test_sample]

seed(2000)
female_sample = sample_prompts(female_bold, num_examples=100)
male_sample = sample_prompts(male_bold, num_examples=100)

seed(3000)
queer_sample = sample_prompts(honest_queer, num_examples=75)
nonqueer_sample = sample_prompts(honest_nonqueer, num_examples=75)

# Mixtral 8x7B

In [4]:
# Load Mixtral-8x7B-Instruct in half precision; device_map="auto" decides placement.
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [5]:
def mixtral_generate(prompts, honest):
    """Generate continuations for ``prompts`` with the global ``model`` and ``tokenizer``.

    Despite its name, this function only depends on whichever ``model`` and
    ``tokenizer`` are currently bound at module level.

    Args:
        prompts: Sized iterable of prompt strings.
        honest: When truthy, sample 20 short candidates (10 new tokens each)
            per prompt and keep the first 3. Otherwise generate a single
            continuation of up to 40 new tokens.

    Returns:
        A list with one entry per prompt: a list of up to 3 continuation
        strings when ``honest`` is truthy, else a single continuation string.
        The prompt itself is not included in the returned text.
    """
    outputs = []
    for p in tqdm(prompts, total=len(prompts), desc="Generating Texts"):
        # Follow the model's own placement rather than assuming a literal
        # "cuda" device (the model is loaded with device_map="auto").
        inputs = tokenizer(p, return_tensors="pt").to(model.device)
        prompt_len = inputs["input_ids"].shape[1]
        if honest:
            # num_return_sequences > 1 needs a sampling (or beam) strategy;
            # plain greedy decoding rejects it, so sampling is requested here.
            output = model.generate(
                **inputs,
                max_new_tokens=10,
                do_sample=True,
                num_return_sequences=20,
                pad_token_id=tokenizer.eos_token_id,
            )
            # NOTE(review): 20 candidates are generated but only 3 are kept —
            # confirm whether all 20 were meant to be scored.
            generation = tokenizer.batch_decode(
                output[:3, prompt_len:], skip_special_tokens=True
            )
        else:
            output = model.generate(
                **inputs,
                max_new_tokens=40,
                pad_token_id=tokenizer.eos_token_id,
            )
            generation = tokenizer.decode(
                output[0, prompt_len:], skip_special_tokens=True
            )
        # Continuations are cut at the token level above. Stripping the prompt
        # with str.replace removed every occurrence of the prompt and did
        # nothing when the decoded text differed from the original prompt.
        outputs.append(generation)
    return outputs

## Mixtral 8x7B Toxicity

In [38]:
# Single-continuation mode for the toxicity prompts.
model_continuations = mixtral_generate(test_prompts, honest=False)

Generating Texts:   0%|          | 0/200 [00:00<?, ?it/s]

In [40]:
# The aggregated "toxicity_ratio" first, then the unaggregated "toxicity" scores.
mixtral_toxicty_rate = toxicity.compute(
    predictions=model_continuations,
    aggregation="ratio",
)["toxicity_ratio"]
mixtral_toxicty = toxicity.compute(
    predictions=model_continuations,
)["toxicity"]

## Mixtral 8x7B Regard

In [42]:
# One continuation per BOLD prompt, for each of the two groups.
female_continuations = mixtral_generate(female_sample, honest=False)
male_continuations = mixtral_generate(male_sample, honest=False)

Generating Texts:   0%|          | 0/100 [00:00<?, ?it/s]

Generating Texts:   0%|          | 0/100 [00:00<?, ?it/s]

In [43]:
# Male continuations are passed as `data`, female continuations as `references`.
Mixtral_regard_diff = regard.compute(
    data=male_continuations,
    references=female_continuations,
)["regard_difference"]

## Mixtral 8x7B Honest

In this setup, Mixtral could not be run with `num_return_sequences` set to anything other than 1, so it cannot produce several candidate outputs for a single prompt. Since HONEST expects a list of candidate outputs for each prompt, Mixtral is incompatible with the HONEST measurement as implemented here.

In [75]:
# Drop the references, then collect garbage and release cached CUDA memory.
del model, tokenizer

gc.collect()
torch.cuda.empty_cache()

# Phi-2

In [76]:
# Load Phi-2; torch_dtype="auto" defers the dtype choice to the checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    torch_dtype="auto",
    trust_remote_code=True,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/phi-2",
    trust_remote_code=True,
)

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- configuration_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi.py:   0%|          | 0.00/62.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- modeling_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [89]:
def phi_generate(prompts, honest):
    """Generate continuations for ``prompts`` with the global ``model`` and ``tokenizer``.

    Args:
        prompts: Sized iterable of prompt strings.
        honest: When truthy, sample 20 short candidates (10 new tokens each)
            per prompt and keep the first 3, mirroring ``mixtral_generate``.
            Otherwise generate a single continuation of up to 40 new tokens.

    Returns:
        A list with one entry per prompt: a list of up to 3 continuation
        strings when ``honest`` is truthy, else a single continuation string.
        The prompt itself is not included in the returned text.
    """
    outputs = []
    for p in tqdm(prompts, total=len(prompts), desc="Generating Texts"):
        # Follow the model's own placement rather than assuming a literal
        # "cuda" device (the model is loaded with device_map="auto").
        inputs = tokenizer(
            p, return_tensors="pt", return_attention_mask=False
        ).to(model.device)
        prompt_len = inputs["input_ids"].shape[1]
        if honest:
            # num_return_sequences > 1 needs a sampling (or beam) strategy;
            # plain greedy decoding rejects it, so sampling is requested here.
            output = model.generate(
                **inputs,
                max_new_tokens=10,
                do_sample=True,
                num_return_sequences=20,
                pad_token_id=tokenizer.eos_token_id,
            )
            # The previous code read `generation` before assigning it and
            # indexed it like pipeline output; decode the generated ids instead.
            generation = tokenizer.batch_decode(
                output[:3, prompt_len:], skip_special_tokens=True
            )
        else:
            output = model.generate(
                **inputs,
                max_new_tokens=40,
                pad_token_id=tokenizer.eos_token_id,
            )
            # skip_special_tokens keeps end-of-text markers out of the text
            # that is later scored, matching mixtral_generate.
            generation = tokenizer.decode(
                output[0, prompt_len:], skip_special_tokens=True
            )
        outputs.append(generation)
    return outputs

## Phi-2 Toxicity

In [90]:
# Single-continuation mode for the toxicity prompts.
model_continuations = phi_generate(test_prompts, honest=False)

Generating Texts:   0%|          | 0/200 [00:00<?, ?it/s]

In [91]:
# The aggregated "toxicity_ratio" first, then the unaggregated "toxicity" scores.
phi_toxicty_rate = toxicity.compute(
    predictions=model_continuations,
    aggregation="ratio",
)["toxicity_ratio"]
phi_toxicty = toxicity.compute(
    predictions=model_continuations,
)["toxicity"]

## Phi-2 Regard

In [95]:
# One continuation per BOLD prompt, for each of the two groups.
female_continuations = phi_generate(female_sample, honest=False)
male_continuations = phi_generate(male_sample, honest=False)

Generating Texts:   0%|          | 0/100 [00:00<?, ?it/s]

Generating Texts:   0%|          | 0/100 [00:00<?, ?it/s]

In [96]:
# Male continuations are passed as `data`, female continuations as `references`.
Phi_regard_diff = regard.compute(
    data=male_continuations,
    references=female_continuations,
)["regard_difference"]

## Phi-2 Honest

In this setup, Phi-2 could not be run with `num_return_sequences` set to anything other than 1, so it cannot produce several candidate outputs for a single prompt. Since HONEST expects a list of candidate outputs for each prompt, Phi-2 is incompatible with the HONEST measurement as implemented here.

In [99]:
# Drop the references, then collect garbage and release cached CUDA memory.
del model, tokenizer

gc.collect()
torch.cuda.empty_cache()

# LLaMa Pro 8B

In [8]:
# Load LLaMA-Pro-8B-Instruct; device_map="auto" decides placement.
model = AutoModelForCausalLM.from_pretrained(
    "TencentARC/LLaMA-Pro-8B-Instruct",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(
    "TencentARC/LLaMA-Pro-8B-Instruct",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## LLaMa Pro Toxicity

In [9]:
# mixtral_generate reads the global model/tokenizer, which now hold LLaMA-Pro.
model_continuations = mixtral_generate(test_prompts, honest=False)

Generating Texts:   0%|          | 0/200 [00:00<?, ?it/s]

In [10]:
# The aggregated "toxicity_ratio" first, then the unaggregated "toxicity" scores.
LLaMa_toxicty_rate = toxicity.compute(
    predictions=model_continuations,
    aggregation="ratio",
)["toxicity_ratio"]
LLaMa_toxicty = toxicity.compute(
    predictions=model_continuations,
)["toxicity"]

## LLaMa Regard

In [11]:
# One continuation per BOLD prompt, for each of the two groups.
female_continuations = mixtral_generate(female_sample, honest=False)
male_continuations = mixtral_generate(male_sample, honest=False)

Generating Texts:   0%|          | 0/100 [00:00<?, ?it/s]

Generating Texts:   0%|          | 0/100 [00:00<?, ?it/s]

In [12]:
# Male continuations are passed as `data`, female continuations as `references`.
LLaMa_regard_diff = regard.compute(
    data=male_continuations,
    references=female_continuations,
)["regard_difference"]

## LLaMa Honest

In [14]:
# honest=True switches mixtral_generate to its multi-candidate branch.
queer_continuations = mixtral_generate(queer_sample, honest=True)
nonquer_continuations = mixtral_generate(nonqueer_sample, honest=True)

Generating Texts:   0%|          | 0/75 [00:00<?, ?it/s]

Generating Texts:   0%|          | 0/75 [00:00<?, ?it/s]

In [16]:
# Derive the group labels from the actual number of continuations. A
# hard-coded 75 falls out of step with `continuations` whenever fewer prompts
# were sampled (sample_prompts clamps to the dataset size).
groups = (
    ["queer"] * len(queer_continuations)
    + ["nonqueer"] * len(nonquer_continuations)
)
continuations = list(queer_continuations) + list(nonquer_continuations)

LLaMa_honest = honest.compute(
    predictions=continuations,
    groups=groups,
)["honest_score_per_group"]

In [17]:
# Drop the references, then collect garbage and release cached CUDA memory.
del model, tokenizer

gc.collect()
torch.cuda.empty_cache()

In [18]:
# Persist the LLaMa metrics, one file per metric.
# The last entry previously referenced `LLaMa_toxicty_rate_honest`, which is
# never defined; the HONEST result computed in this notebook is `LLaMa_honest`.
# The file name is kept unchanged so existing consumers still find it.
llama_results = {
    "LLaMa_toxicty_rate.txt": LLaMa_toxicty_rate,
    "LLaMa_toxicty.txt": LLaMa_toxicty,
    "LLaMa_regard_diff.txt": LLaMa_regard_diff,
    "LLaMa_toxicty_rate_honest.txt": LLaMa_honest,
}

for filename, value in llama_results.items():
    # Files are opened in append mode, so terminate every record with a
    # newline; otherwise values from repeated runs run together.
    with open(filename, "a") as fq:
        fq.write(str(value) + "\n")