In [None]:
import shutil

# Directory that the repository is cloned into further down in this notebook.
folder_path = "/content/AI_Policy_Thesis"

# Remove any existing copy; report the outcome instead of raising so the
# notebook keeps running whether or not the directory was there.
try:
    shutil.rmtree(folder_path)
except FileNotFoundError:
    print(f"The directory {folder_path} does not exist.")
except Exception as err:
    print(f"An error occurred: {err}")
else:
    print(f"The directory {folder_path} and all its contents have been deleted.")

The directory /content/AI_Policy_Thesis and all its contents have been deleted.


In [19]:
import os

# Set to a valid working directory: move to /content so that relative paths and
# shell commands run from later cells (e.g. `git clone`) resolve there.
os.chdir("/content")  # Default directory in Colab
# Echo the directory that is actually in effect as a sanity check.
print(f"Current working directory: {os.getcwd()}")

Current working directory: /content


In [20]:
# Colab shell command: clone the thesis repository into the current working
# directory. This creates the AI_Policy_Thesis/ folder that the CSV is read from.
!git clone https://github.com/JochemBus/AI_Policy_Thesis.git

Cloning into 'AI_Policy_Thesis'...
remote: Enumerating objects: 38, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 38 (delta 17), reused 24 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (38/38), 1.78 MiB | 4.24 MiB/s, done.
Resolving deltas: 100% (17/17), done.


In [21]:
import pandas as pd
from transformers import AutoTokenizer
import json
import torch
from sklearn.model_selection import train_test_split



# Optional: uncomment to show the full content of each cell instead of a truncated preview.
#pd.set_option('display.max_colwidth', None)  # Show full column content

# Show every column when a DataFrame is rendered (no column elision).
pd.set_option('display.max_columns', None)

In [22]:
# Load the bias dataset from the cloned repository (absolute Colab path).
bias_df = pd.read_csv("/content/AI_Policy_Thesis/filtered_bias_data.csv")
# Alternative when the CSV sits in the current working directory:
#bias_df = pd.read_csv("filtered_bias_data.csv")

# Last expression of the cell: render the loaded frame.
bias_df

Unnamed: 0,id,comment_text,gender_bias,sexual_bias,religion_bias,race_bias,disability_bias
0,1047401,Yo this guy is LAME! I would be so frustrated ...,0,0,0,0,0
1,6129125,As Jehovah's Witnesses do not believe in blood...,0,0,0,0,0
2,929666,You can read stuff like that in the Old Testam...,0,0,0,0,0
3,5114785,It's_Harry_Mudd_in_reverse.__A._Everything_the...,0,0,0,0,0
4,5255907,Hmmm but in the Gospel Jesus tells us to eat H...,0,0,1,0,0
...,...,...,...,...,...,...,...
6347,4986948,"In the RCC ALL authority, pronouncements, cano...",1,0,1,0,0
6348,524509,Does anyone else see the pattern of promises b...,0,1,0,0,0
6349,972988,Dismantling of all government including those ...,0,0,1,0,0
6350,5450450,We got Girls Gone Wild and Georgia got Hog Wild,1,0,0,0,0


In [23]:
# Tokenizer matching the checkpoint that is fine-tuned later on.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def _count_tokens(text):
    """Return how many token ids ``text`` encodes to, special tokens included."""
    # `encode` adds special tokens (e.g., [CLS], [SEP]); requested explicitly here.
    return len(tokenizer.encode(text, add_special_tokens=True))


# One token count per comment, indexed like bias_df.
token_distribution_df = pd.DataFrame(
    {'token_length': bias_df['comment_text'].apply(_count_tokens)}
)

# Summary statistics of the token counts.
print("Token Length Distribution:")
print(token_distribution_df['token_length'].describe())

Token Length Distribution:
count    6352.000000
mean      108.192538
std        67.768221
min         4.000000
25%        49.000000
50%        94.000000
75%       166.000000
max       365.000000
Name: token_length, dtype: float64


In [24]:
# Encode every comment to exactly 128 tokens: shorter comments are padded,
# longer ones are truncated.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def _encode_comment(text):
    """Tokenize one comment into fixed-length model inputs."""
    return tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=128,
        add_special_tokens=True,
    )


# Store the tokenizer output for each row in a new "tokenized" column.
bias_df["tokenized"] = bias_df["comment_text"].apply(_encode_comment)

In [25]:
# Render the frame again to inspect the newly added "tokenized" column.
bias_df

Unnamed: 0,id,comment_text,gender_bias,sexual_bias,religion_bias,race_bias,disability_bias,tokenized
0,1047401,Yo this guy is LAME! I would be so frustrated ...,0,0,0,0,0,"[input_ids, token_type_ids, attention_mask]"
1,6129125,As Jehovah's Witnesses do not believe in blood...,0,0,0,0,0,"[input_ids, token_type_ids, attention_mask]"
2,929666,You can read stuff like that in the Old Testam...,0,0,0,0,0,"[input_ids, token_type_ids, attention_mask]"
3,5114785,It's_Harry_Mudd_in_reverse.__A._Everything_the...,0,0,0,0,0,"[input_ids, token_type_ids, attention_mask]"
4,5255907,Hmmm but in the Gospel Jesus tells us to eat H...,0,0,1,0,0,"[input_ids, token_type_ids, attention_mask]"
...,...,...,...,...,...,...,...,...
6347,4986948,"In the RCC ALL authority, pronouncements, cano...",1,0,1,0,0,"[input_ids, token_type_ids, attention_mask]"
6348,524509,Does anyone else see the pattern of promises b...,0,1,0,0,0,"[input_ids, token_type_ids, attention_mask]"
6349,972988,Dismantling of all government including those ...,0,0,1,0,0,"[input_ids, token_type_ids, attention_mask]"
6350,5450450,We got Girls Gone Wild and Georgia got Hog Wild,1,0,0,0,0,"[input_ids, token_type_ids, attention_mask]"


In [26]:
# Inspect the encoding of the first row to check the padded ids and masks.
bias_df["tokenized"].iloc[0]

{'input_ids': [101, 10930, 2023, 3124, 2003, 20342, 999, 1045, 2052, 2022, 2061, 10206, 1998, 17733, 2941, 1012, 1001, 2879, 3762, 2063, 1012, 2025, 2000, 2360, 2009, 1005, 1055, 2025, 4276, 20888, 2055, 2021, 2272, 2006, 999, 2008, 2052, 2425, 2033, 2035, 1045, 2428, 2734, 2000, 2113, 2055, 1996, 3124, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [27]:
# Hold out 30% of the rows for evaluation; random_state fixes the shuffle.
# NOTE(review): bias_df is rebound to the 70% training split, so the full frame is no
# longer reachable under this name and re-running this cell splits the split again.
bias_df, bias_eval = train_test_split(bias_df, test_size=0.3, random_state=42)

## Fine-Tune the BERT Model

In [28]:

from torch.utils.data import Dataset

class BiasDataset(Dataset):
    """Dataset over a DataFrame of pre-tokenized comments with five bias labels.

    Every row is expected to provide a ``tokenized`` mapping containing
    ``input_ids`` and ``attention_mask`` (``token_type_ids`` is optional), plus
    the label columns named in ``_LABEL_COLUMNS``.
    """

    # Column order of the values inside the returned ``labels`` tensor.
    _LABEL_COLUMNS = (
        'gender_bias',
        'sexual_bias',
        'religion_bias',
        'race_bias',
        'disability_bias',
    )

    def __init__(self, dataframe):
        # Re-index to 0..n-1 so positional lookups do not depend on the source index.
        self.data = dataframe.reset_index(drop=True)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the model inputs and the float label vector for row ``idx``."""
        record = self.data.iloc[idx]
        encoding = record['tokenized']

        sample = {
            'input_ids': torch.tensor(encoding['input_ids']),
            'attention_mask': torch.tensor(encoding['attention_mask']),
        }

        # Some tokenizers do not emit token_type_ids; include them only when present.
        if 'token_type_ids' in encoding:
            sample['token_type_ids'] = torch.tensor(encoding['token_type_ids'])

        # Labels are floats (the original author targets BCEWithLogitsLoss).
        sample['labels'] = torch.tensor(
            [record[column] for column in self._LABEL_COLUMNS],
            dtype=torch.float,
        )
        return sample

In [29]:
# Wrap the training split (bias_df) and the held-out split (bias_eval) as datasets.
train_dataset, eval_dataset = BiasDataset(bias_df), BiasDataset(bias_eval)

In [31]:
from transformers import BertForSequenceClassification, BertConfig


# Checkpoint to fine-tune and the number of bias labels predicted per comment.
model_name = "bert-base-uncased"
num_labels = 5

# Configuration for the checkpoint with a 5-output head, flagged as a
# multi-label classification problem.
config = BertConfig.from_pretrained(
    model_name, num_labels=num_labels, problem_type="multi_label_classification"
)

# Pre-trained weights loaded under the custom configuration above.
model = BertForSequenceClassification.from_pretrained(model_name, config=config)

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


True


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [32]:
# Environment report: each fact is printed exactly once.
print("PyTorch version:", torch.__version__)
# Printed unconditionally so the value is visible even when no GPU is attached.
print("CUDA version reported by PyTorch:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())

# Device details can only be queried when CUDA is usable.
if torch.cuda.is_available():
    print("Device count:", torch.cuda.device_count())
    print("Current device:", torch.cuda.current_device())
    print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))


CUDA available: True
CUDA version: 12.4
Device count: 1
Current device: 0
Device name: Tesla T4
PyTorch version: 2.5.1+cu124
CUDA version reported by PyTorch: 12.4
CUDA available: True


In [33]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    # No output_dir is passed, so checkpoints are written to the library's default
    # location. Uncomment to store them inside the cloned repository instead:
    #output_dir="/content/AI_Policy_Thesis/results",
    run_name="BERT_FineTune",
    num_train_epochs=3,
    per_device_train_batch_size=8,    # Batch size per device (GPU/CPU)
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=100,                # Log the training loss every 100 steps
    save_steps=500,                   # Checkpoint interval in steps
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`
    # argument; "epoch" evaluates on eval_dataset after every epoch.
    eval_strategy="epoch",
)

# The Trainer ties the model, the arguments and the train/eval datasets together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Confirm which device the model parameters currently live on.
print(next(model.parameters()).device)

cuda:0




In [34]:
# Run fine-tuning with the Trainer built earlier; the returned TrainOutput is the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2171,0.19148
2,0.1657,0.167774
3,0.1218,0.166964


TrainOutput(global_step=1668, training_loss=0.20187629841500335, metrics={'train_runtime': 438.5996, 'train_samples_per_second': 30.41, 'train_steps_per_second': 3.803, 'total_flos': 877367446046208.0, 'train_loss': 0.20187629841500335, 'epoch': 3.0})

In [22]:
# Mount Google Drive at /content/drive so results can be written to persistent storage.
# NOTE(review): this cell's execution count (22) is lower than that of the training cells
# before it, i.e. it was run out of order; on a fresh run it must execute before the
# save step that writes under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [35]:
# Save the fine-tuned model to Google Drive (the path lives under the /content/drive mount).
# NOTE(review): the Trainer was built without a tokenizer, so presumably only the model
# files are written here — confirm whether the tokenizer should be saved alongside.
trainer.save_model("/content/drive/MyDrive/JADS/Thesis/Models/Second_BERT")