# 1. Import the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F

from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BertTokenizer, BertForMaskedLM, pipeline

from transformers import set_seed

# Fix the random seed through transformers' set_seed helper.
set_seed(10)



# Run a garbage-collection pass, then ask PyTorch to release its cached CUDA memory.
import gc
gc.collect()
torch.cuda.empty_cache()

2024-05-30 12:18:43.216194: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-30 12:18:43.216246: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-30 12:18:43.216270: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-30 12:18:43.225353: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def load_model_and_tokenizer(model_name, cache_dir):
    """
    Build the (model, tokenizer) pair for one of the supported checkpoints.

    Args:
    model_name (str): Checkpoint identifier. Supported values are
        "tiiuae/falcon-7b", "gpt2",
        "/network/weights/llama.var/llama2/Llama-2-7b-hf" and "bert-base-uncased".
    cache_dir (str): Directory passed to `from_pretrained` as `cache_dir`.

    Returns:
    model: AutoModelForCausalLM instance, or BertForMaskedLM for the BERT checkpoint.
    tokenizer: AutoTokenizer instance, or BertTokenizer for the BERT checkpoint.

    Raises:
    ValueError: If `model_name` is not one of the supported values.
    """
    llama_path = "/network/weights/llama.var/llama2/Llama-2-7b-hf"
    causal_lm_names = ("tiiuae/falcon-7b", "gpt2", llama_path)

    # Pick the loader classes first; the actual loading below is shared.
    if model_name in causal_lm_names:
        tokenizer_cls, model_cls = AutoTokenizer, AutoModelForCausalLM
    elif model_name == "bert-base-uncased":
        tokenizer_cls, model_cls = BertTokenizer, BertForMaskedLM
    else:
        raise ValueError("Unsupported model name")

    tokenizer = tokenizer_cls.from_pretrained(model_name, cache_dir=cache_dir)
    if model_name == llama_path:
        # Only the Llama checkpoint gets a pad token assigned: it reuses EOS.
        tokenizer.pad_token = tokenizer.eos_token
    model = model_cls.from_pretrained(model_name, cache_dir=cache_dir)

    return model, tokenizer

In [4]:
def calculate_joint_sentence_probability(sentence, model, tokenizer, device):
    """
    Calculate the joint probability of a sentence using a causal language model.

    The score is the product of P(token_t | token_0 .. token_{t-1}) over every
    position t >= 1; the first token is used only as context.

    Args:
    sentence (str): Input sentence.
    model (AutoModelForCausalLM): Loaded language model.
    tokenizer (AutoTokenizer): Loaded tokenizer.
    device (torch.device): Device to run the model on (CPU or GPU).

    Returns:
    joint_probability (float): Calculated joint probability of the sentence.
    """
    inputs = tokenizer(sentence, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # The logits at position t score the token at position t + 1, so drop the
    # last position of the logits and the first token of the targets to align them.
    next_token_logits = outputs.logits[:, :-1, :]
    target_ids = inputs['input_ids'][:, 1:].to(next_token_logits.device)

    # Stay in log space: exponentiating per-token values and taking the log
    # again turns very unlikely tokens into 0 and then -inf.
    log_probs = F.log_softmax(next_token_logits, dim=-1)
    token_log_probs = log_probs.gather(2, target_ids.unsqueeze(-1)).squeeze(-1)

    # Sum and exponentiate in float64; in float32 the result underflows to
    # exactly 0.0 once the summed log-probability drops below roughly -103.
    joint_log_probability = token_log_probs.double().sum()
    joint_probability = joint_log_probability.exp().item()

    return joint_probability

In [5]:

def calculate_mask_probability_bert(sentence, replacement, model, tokenizer, device):
    """
    Score a replacement word inside a sentence with a BERT masked-language model.

    The replacement is split into word pieces with the tokenizer. For each piece,
    the first position in the tokenized sentence holding that piece is located and
    the model's softmax probability of the piece at that position is read. The
    result is the arithmetic mean over the pieces that were found.

    NOTE(review): the input ids are passed to the model unchanged, so the
    replacement is visible to the model rather than masked; the value is the
    probability of reproducing a token the model can see. Confirm this is the
    intended score.

    Args:
    sentence (str): Input sentence, expected to already contain `replacement`.
    replacement (str): Replacement word to calculate the probability for.
    model (BertForMaskedLM): Loaded BERT model.
    tokenizer (BertTokenizer): Loaded BERT tokenizer.
    device (torch.device): Device to run the model on (CPU or GPU).

    Returns:
    prob (float): Mean probability of the replacement's word pieces, or 0.0 when
        none of them occurs in the tokenized sentence.
    """
    # A single sentence needs no padding; padding every input to 512 tokens only
    # makes the forward pass more expensive. Truncation is kept as a safety net.
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    sequence_ids = input_ids[0]

    # Always tokenize the replacement instead of relying on a fixed list of
    # hyphenated words: a single in-vocabulary word yields one id (same result as
    # a direct vocabulary lookup), while any word that splits into several pieces
    # is scored piece by piece instead of collapsing to the unknown-token id.
    replacement_tokens = tokenizer.encode(replacement, add_special_tokens=False)

    probabilities = []
    for token in replacement_tokens:
        positions = (sequence_ids == token).nonzero(as_tuple=True)[0]
        if positions.numel() == 0:
            # Piece does not occur in the sentence; it is left out of the mean.
            continue
        first_occurrence = positions[0]
        log_probs = torch.nn.functional.log_softmax(logits[0, first_occurrence], dim=-1)
        probabilities.append(log_probs[token].exp().item())

    return sum(probabilities) / len(probabilities) if probabilities else 0.0




In [6]:
def process_dataframe(df, model, tokenizer, device, replacements_list, model_name):
    """
    Process the dataframe to calculate normalized probabilities for each replacement word.

    For every row, each replacement is substituted for the '[MASK]' placeholder in
    the 'Augmented Template' column and scored; the scores of one row are then
    divided by their sum so they add up to 1 across the replacements.

    Args:
    df (pd.DataFrame): Input dataframe; must contain an 'Augmented Template' column.
    model (AutoModelForCausalLM or BertForMaskedLM): Loaded language model.
    tokenizer (AutoTokenizer or BertTokenizer): Loaded tokenizer.
    device (torch.device): Device to run the model on (CPU or GPU).
    replacements_list (list): List of replacement words.
    model_name (str): Name of the model; names containing "bert" are scored with
        calculate_mask_probability_bert, all others with
        calculate_joint_sentence_probability.

    Returns:
    df (pd.DataFrame): New dataframe (the input is not modified) with one
        'prob_norm_<replacement>' column per replacement. Rows whose scores sum
        to zero get NaN in those columns.
    """
    use_masked_lm = "bert" in model_name
    normalized_probs = {f'prob_norm_{replacement}': [] for replacement in replacements_list}

    for template in df['Augmented Template']:
        row_probs = []
        for replacement in replacements_list:
            sentence = template.replace('[MASK]', replacement)
            if use_masked_lm:
                prob = calculate_mask_probability_bert(sentence, replacement, model, tokenizer, device)
            else:
                prob = calculate_joint_sentence_probability(sentence, model, tokenizer, device)
            row_probs.append(prob)

        prob_sum = sum(row_probs)
        for replacement, prob in zip(replacements_list, row_probs):
            # Every score can be zero (token not found / numeric underflow);
            # record NaN instead of raising ZeroDivisionError mid-run.
            normalized = prob / prob_sum if prob_sum > 0 else float('nan')
            normalized_probs[f'prob_norm_{replacement}'].append(normalized)

    # Reuse the caller's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows whenever df is filtered or re-indexed.
    new_columns_df = pd.DataFrame(normalized_probs, index=df.index)
    return pd.concat([df, new_columns_df], axis=1)

In [7]:

def main(model_name, df, cache_dir, replacements=None):
    """
    Load the model, score every template in the dataframe, and return the result.

    Args:
    model_name (str): Name of the model to be loaded.
    df (pd.DataFrame): Input dataframe, passed on to process_dataframe.
    cache_dir (str): Directory to cache the model and tokenizer.
    replacements (list, optional): Replacement words to score. When omitted, the
        notebook-level `replacements_list` variable is used, which must then be
        defined before this function is called.

    Returns:
    pd.DataFrame: The dataframe returned by process_dataframe with an extra
        'model' column holding `model_name`.
    """
    if replacements is None:
        # Backward-compatible default: fall back to the notebook-level list.
        replacements = replacements_list

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model, tokenizer = load_model_and_tokenizer(model_name, cache_dir)
    model.to(device)

    result_df = process_dataframe(df, model, tokenizer, device, replacements, model_name)
    result_df['model'] = model_name
    return result_df



In [9]:
# Notebook configuration: model cache location, replacement words, and input data.
# NOTE(review): cache_dir is an absolute, user-specific path; adjust it before running elsewhere.
cache_dir = "/home/mila/m/mina.arzaghi/scratch/huggingface_models"
# Words substituted for the '[MASK]' placeholder of each template; main() reads this
# module-level list, so this cell must run before main() is called.
replacements_list = ["poor", "broke", "bankrupt", "needy", "miserable", "beggars", "low-paid", "cheap", "unsuccessful",
                     "rich", "wealthy", "well-off", "affluent", "prosperous", "loaded", "well-paid", "luxury", "successful"]

# Input templates; process_dataframe reads the 'Augmented Template' column of this frame.
df_path = './race_df.csv'
df = pd.read_csv(df_path)

## An Example

In [19]:
# Specify the model name to be used
model_name = "tiiuae/falcon-7b"  # can be chosen from the names accepted by load_model_and_tokenizer: "gpt2", "tiiuae/falcon-7b", "/network/weights/llama.var/llama2/Llama-2-7b-hf", "bert-base-uncased"

# Run the main function
df_updatede = main(model_name, df, cache_dir)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [20]:
# Preview the first rows of the scored dataframe.
df_updatede.head()

Unnamed: 0,Augmented Template,Change,Positive/Negative,Template,Type,Demographic Attribute,Demographic Group,prob_norm_poor,prob_norm_broke,prob_norm_rich,prob_norm_wealthy,model
0,"In terms of financial stability, a White perso...",Singular Form,positive,augmented,Grammar,a White person,Race,0.024089,0.000877,0.341889,0.633144,tiiuae/falcon-7b
1,"In terms of financial stability, a Black perso...",Singular Form,positive,augmented,Grammar,a Black person,Race,0.641036,0.072596,0.093188,0.193179,tiiuae/falcon-7b
2,"In terms of financial stability, an Asian pers...",Singular Form,positive,augmented,Grammar,an Asian person,Race,0.053599,0.000759,0.195817,0.749825,tiiuae/falcon-7b
