In [18]:
from transformers import BertForSequenceClassification
from transformers import AutoTokenizer
import pandas as pd
from torch.utils.data import Dataset
import torch
#pd.set_option('display.max_colwidth', None)  # Show full column content
pd.set_option('display.max_columns', None)


In [5]:
BERT_model = BertForSequenceClassification.from_pretrained("../Models/First_BERT/")

In [None]:
file_path = '../Data/Policy_docs/USA/Fairness, Ethics, Accountability, and Transparency/Fairness, Ethics, Accountability, and Transparency.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

policy_text = text
policy_text = policy_text.replace('\n', '')

policy_sentences = policy_text.split('.')
policy_sentences = [s.strip() for s in policy_sentences if s.strip()]
policy_sentences = [s for s in policy_sentences if '@' not in s]

# Add test sentences for comparison (generated by ChatGPT)
policy_sentences.append("The AI system shall treat all genders with equal respect, ensuring that its algorithms and decision-making processes do not reinforce or perpetuate traditional gender stereotypes")
policy_sentences.append("Our AI must be designed to avoid any form of sexual bias, ensuring that outputs and decisions do not discriminate based on sexual orientation or identity")
policy_sentences.append("The system is required to respect diverse religious beliefs, ensuring that no decision or content produced discriminates against or favors any particular faith")
policy_sentences.append("All data sources and model outputs will be routinely audited to prevent racial bias, ensuring that individuals from every racial and ethnic background receive equitable treatment")
policy_sentences.append("Our AI models shall incorporate universal design principles and undergo rigorous testing to ensure that they do not disadvantage individuals with disabilities, thereby promoting accessibility and inclusion")

policy_sentences

["NATIONAL SCIENCE FOUNDATION2415 EISENHOWER AVENUEALEXANDRIA, VIRGINIA 22314NSF 19-016Dear Colleague Letter: Fairness, Ethics, Accountability, andTransparency: Enabling Breakthrough Research to ExpandInclusivity in Computer and Information Science and EngineeringResearchNovember 2, 2018Dear Colleagues:The National Science Foundation's (NSF) Directorate for Computer and Information Scienceand Engineering (CISE) is committed to maximizing the positive consequences of theresearch that it funds through inclusive research approaches",
 "Indeed, a key component ofCISE's mission is to contribute to universal, transparent, and affordable participation in aninformation-based society",
 'Some research practices and methods may carry biases andinequities that can in turn have significant impacts on the scientific community and broadersociety',
 'The increased reliance on computing and information technologies may furtherincrease and automate such biases and inequities',
 'Professional societies,

In [36]:
df = pd.DataFrame(policy_sentences, columns=["comment_text"])

# Load your tokenizer (ensure you're using the same one as before)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize each sentence with a maximum token count of 128
df["tokenized"] = df["comment_text"].apply(
    lambda text: tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=128,
        add_special_tokens=True
    )
)

In [37]:
class SentenceDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe.reset_index(drop=True)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        tokenized = row['tokenized']
        input_ids = torch.tensor(tokenized['input_ids'])
        attention_mask = torch.tensor(tokenized['attention_mask'])
        # Some tokenizers include token_type_ids; if available, include them
        if 'token_type_ids' in tokenized:
            token_type_ids = torch.tensor(tokenized['token_type_ids'])
        else:
            token_type_ids = None

        if token_type_ids is not None:
            return {
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'token_type_ids': token_type_ids
            }
        else:
            return {
                'input_ids': input_ids,
                'attention_mask': attention_mask
            }

# Create the dataset
sentence_dataset = SentenceDataset(df)

In [38]:
sentence_dataset

<__main__.SentenceDataset at 0x1d2c26d7620>

In [39]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=8,
    do_train=False,   # Not needed if only predicting
    do_eval=False
)

trainer = Trainer(
    model=BERT_model,
    args=training_args
)

In [40]:
predictions_output = trainer.predict(sentence_dataset)

In [41]:
probabilities = torch.sigmoid(torch.tensor(predictions_output.predictions))

In [None]:
bias_columns = ['gender_bias', 'sexual_bias', 'religion_bias', 'race_bias', 'disability_bias']
predictions_df = pd.DataFrame(probabilities, columns=bias_columns)

df_with_predictions = pd.concat([df.reset_index(drop=True), predictions_df], axis=1)
#df_with_predictions[bias_columns] = df_with_predictions[bias_columns].mask(df_with_predictions[bias_columns] < 0.1, 0)
df_with_predictions

Unnamed: 0,comment_text,tokenized,gender_bias,sexual_bias,religion_bias,race_bias,disability_bias
0,NATIONAL SCIENCE FOUNDATION2415 EISENHOWER AVE...,"[input_ids, token_type_ids, attention_mask]",0.024321,0.011548,0.023554,0.021925,0.015728
1,"Indeed, a key component ofCISE's mission is to...","[input_ids, token_type_ids, attention_mask]",0.022481,0.014483,0.021137,0.020957,0.019055
2,Some research practices and methods may carry ...,"[input_ids, token_type_ids, attention_mask]",0.01638,0.018324,0.019797,0.033841,0.024535
3,The increased reliance on computing and inform...,"[input_ids, token_type_ids, attention_mask]",0.020726,0.023955,0.021072,0.031313,0.032292
4,"Professional societies, national and global co...","[input_ids, token_type_ids, attention_mask]",0.021534,0.014192,0.021632,0.025622,0.018468
5,"Codes ofethics, for example, have been establi...","[input_ids, token_type_ids, attention_mask]",0.024408,0.011694,0.016915,0.030802,0.019974
6,Somecodes or standards are addressing privacy ...,"[input_ids, token_type_ids, attention_mask]",0.01525,0.024883,0.023107,0.035318,0.02739
7,"Others emphasize theneed to ensure that users,...","[input_ids, token_type_ids, attention_mask]",0.026287,0.014737,0.017973,0.02286,0.021551
8,Standardsand guidelines have also been establi...,"[input_ids, token_type_ids, attention_mask]",0.018079,0.016242,0.031022,0.017905,0.021546
9,"With this Dear Colleague Letter (DCL), CISE in...","[input_ids, token_type_ids, attention_mask]",0.026383,0.014114,0.032447,0.031014,0.028624
