In [None]:
import shutil

folder_path = "/content/AI_Policy_Thesis"  # Update the path if it's located elsewhere

try:
    shutil.rmtree(folder_path)
    print(f"The directory {folder_path} and all its contents have been deleted.")
except FileNotFoundError:
    print(f"The directory {folder_path} does not exist.")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
import os

# Set to a valid working directory
os.chdir("/content")  # Default directory in Colab
print(f"Current working directory: {os.getcwd()}")

In [None]:
# COLAB GIT CLONE
!git clone https://github.com/JochemBus/AI_Policy_Thesis.git

In [1]:
import pandas as pd
from transformers import AutoTokenizer
import json
import torch



#pd.set_option('display.max_colwidth', None)  # Show full column content

pd.set_option('display.max_columns', None)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
bias_df = pd.read_csv("/content/AI_Policy_Thesis/filtered_bias_data.csv")
#bias_df = pd.read_csv("filtered_bias_data.csv")

bias_df

Unnamed: 0,id,comment_text,gender_bias,sexual_bias,religion_bias,race_bias,disability_bias
0,1047401,Yo this guy is LAME! I would be so frustrated ...,0,0,0,0,0
1,6129125,As Jehovah's Witnesses do not believe in blood...,0,0,0,0,0
2,929666,You can read stuff like that in the Old Testam...,0,0,0,0,0
3,5114785,It's_Harry_Mudd_in_reverse.__A._Everything_the...,0,0,0,0,0
4,5255907,Hmmm but in the Gospel Jesus tells us to eat H...,0,0,1,0,0
...,...,...,...,...,...,...,...
6347,4986948,"In the RCC ALL authority, pronouncements, cano...",1,0,1,0,0
6348,524509,Does anyone else see the pattern of promises b...,0,1,0,0,0
6349,972988,Dismantling of all government including those ...,0,0,1,0,0
6350,5450450,We got Girls Gone Wild and Georgia got Hog Wild,1,0,0,0,0


In [None]:
from sklearn.model_selection import train_test_split

# Assuming 'df' is your DataFrame
bias_df, bias_eval = train_test_split(bias_df, test_size=0.3, random_state=42)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
token_distribution_df = pd.DataFrame()
# Compute the token length for each comment
# Using `encode` adds special tokens by default (e.g., [CLS], [SEP])
token_distribution_df['token_length'] = bias_df['comment_text'].apply(
    lambda x: len(tokenizer.encode(x, add_special_tokens=True))
)

print("Token Length Distribution:")
print(token_distribution_df['token_length'].describe())

Token Length Distribution:
count    1000.000000
mean      105.684000
std        67.400972
min         4.000000
25%        47.000000
50%        90.000000
75%       161.250000
max       304.000000
Name: token_length, dtype: float64


In [None]:
# Tokenize with a max token count of 128
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bias_df["tokenized"] = bias_df["comment_text"].apply(lambda text: tokenizer(
    text, 
    padding="max_length", 
    truncation=True, 
    max_length=128,
    add_special_tokens=True
))

In [None]:
bias_df

Unnamed: 0,id,comment_text,gender_bias,sexual_bias,religion_bias,race_bias,disability_bias,tokenized
1834937,660215,We have no obligation to foist our idea of wha...,0,1,0,0,0,"[input_ids, token_type_ids, attention_mask]"
1993611,696994,Putting aside the exaggerated and unrealistic ...,0,0,0,1,0,"[input_ids, token_type_ids, attention_mask]"
1971119,5963512,So when the NEXT plan was released and the pub...,1,0,0,0,0,"[input_ids, token_type_ids, attention_mask]"
1927678,6333555,Which you should normally get from your mother...,0,0,0,0,0,"[input_ids, token_type_ids, attention_mask]"
1355238,5635048,"Sigh. You're such an absolutist, Prog. Unable ...",0,0,1,1,0,"[input_ids, token_type_ids, attention_mask]"
...,...,...,...,...,...,...,...,...
1643400,6325215,I would be surprised if Francis acted on somet...,0,0,0,0,0,"[input_ids, token_type_ids, attention_mask]"
1856435,6076350,I'm not so sure about that. Yusra Khogali has ...,1,0,0,1,0,"[input_ids, token_type_ids, attention_mask]"
1834711,6113346,This is nothing more than a staged distraction...,0,0,1,0,0,"[input_ids, token_type_ids, attention_mask]"
1834537,7044877,"Story does not specify what ""ill""\nToo ill to ...",0,0,0,0,0,"[input_ids, token_type_ids, attention_mask]"


In [None]:
bias_df["tokenized"].iloc[0]

{'input_ids': [101, 2057, 2031, 2053, 14987, 2000, 1042, 10054, 2102, 2256, 2801, 1997, 2054, 2057, 5136, 2000, 2022, 1996, 4602, 2204, 2006, 2619, 2842, 1012, 10262, 2057, 2024, 7149, 2007, 6001, 1010, 2002, 2030, 2016, 2097, 2471, 5121, 24501, 4765, 2115, 19960, 14423, 1010, 2004, 2092, 2002, 2030, 2016, 2323, 1012, 2009, 1005, 1055, 3141, 2000, 2054, 6031, 1998, 4435, 17580, 1010, 1999, 2037, 3720, 1010, 1000, 1996, 2157, 2000, 9394, 1000, 2170, 1000, 1996, 2157, 2000, 2022, 2292, 2894, 1000, 1012, 2017, 2031, 4445, 1996, 2157, 4496, 1996, 4611, 2000, 2022, 1037, 5697, 23684, 1012, 2061, 1010, 2292, 2033, 9377, 2026, 3160, 1024, 2054, 2079, 2017, 2156, 2024, 1996, 5704, 1997, 5637, 2015, 1029, 4919, 1010, 2054, 5704, 2079, 2027, 2031, 2008, 1045, 1010, 2004, 1037, 28229, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## Fine Tune BERT Model

In [None]:

from torch.utils.data import Dataset

class BiasDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe.reset_index(drop=True)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        
        # Use the precomputed tokenized data
        # We assume that `row['tokenized']` is a dictionary-like object containing:
        #   - "input_ids"
        #   - "token_type_ids" (if applicable)
        #   - "attention_mask"
        tokenized = row['tokenized']
        
        # Convert each element to a torch tensor
        input_ids = torch.tensor(tokenized['input_ids'])
        attention_mask = torch.tensor(tokenized['attention_mask'])
        
        # Some tokenizers might not include token_type_ids if not needed.
        if 'token_type_ids' in tokenized:
            token_type_ids = torch.tensor(tokenized['token_type_ids'])
        else:
            token_type_ids = None
        
        # Create a tensor for the 5 bias labels (using float for BCEWithLogitsLoss)
        labels = torch.tensor([
            row['gender_bias'],
            row['sexual_bias'],
            row['religion_bias'],
            row['race_bias'],
            row['disability_bias']
        ], dtype=torch.float)
        
        # Return a dictionary that will be used as model inputs.
        if token_type_ids is not None:
            return {
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'token_type_ids': token_type_ids,
                'labels': labels
            }
        else:
            return {
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'labels': labels
            }

In [None]:
train_dataset = BiasDataset(bias_df)

In [18]:
from transformers import BertForSequenceClassification, BertConfig


model_name = "bert-base-uncased"
num_labels = 5

# Create a configuration tailored for multi-label classification
config = BertConfig.from_pretrained(
    model_name, 
    num_labels=num_labels, 
    problem_type="multi_label_classification"
)

# Load the pre-trained model with the custom configuration
model = BertForSequenceClassification.from_pretrained(model_name, config=config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


True


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
    print("Device count:", torch.cuda.device_count())
    print("Current device:", torch.cuda.current_device())
    print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
    
print("PyTorch version:", torch.__version__)
print("CUDA version reported by PyTorch:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())


CUDA available: True
CUDA version: 11.8
Device count: 1
Current device: 0
Device name: NVIDIA GeForce MX450


In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="/content/AI_Policy_Thesis/results",            # Directory to save model checkpoints
    run_name = "BERT_FineTune",
    num_train_epochs=3,              
    per_device_train_batch_size=8,    # Batch size per device (GPU/CPU)
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=100,
    save_steps=500,
    evaluation_strategy="no"           # Set to "steps" or "epoch" if you add an eval dataset
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=bias_eval,
)

print(next(model.parameters()).device)



In [23]:
trainer.train()

Step,Training Loss


KeyboardInterrupt: 