In [1]:
from transformers import BertTokenizer, BertForNextSentencePrediction, AdamW
import torch
import pandas as pd
import random
from tqdm import tqdm
import os
import numpy as np
import re
from torch.nn.functional import softmax
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(f"PyTorch version: {torch.__version__}")  
device = "mps" if torch.backends.mps.is_available() else torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

PyTorch version: 1.13.1
Using device: mps


In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Only takes tweet with multiple line

In [4]:
def processCsv(file):
    texts = []
    text = ""
    #line = 0
    try:
        for index,row in file.iterrows():
          tweet = row.loc['content']
          if '\n' in tweet:
            texts.append(row.loc['content'])
            #line = line+1
    except UnicodeDecodeError:
      print(f"Unicode error for this file {file}")
    return texts, len(texts)
    

In [5]:
file_path = 'tweet_dataset/male refugee_tweet.csv'
templates = pd.read_csv(file_path, sep=",")
texts, rows = processCsv(templates.copy())

FileNotFoundError: [Errno 2] No such file or directory: 'tweet_dataset/male refugee_tweet.csv'

In [None]:
bag = [item for sentence in texts for item in sentence.split('.') if item != '']
bag_size = len(bag)
bag_size

84

In [None]:
sentence_a = []
sentence_b = []
label = []

for tweet in texts:
    sentences = [
        sentence for sentence in tweet.split('.') if sentence != ''
    ]
    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            # this is NotNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(bag[index])
            label.append(1)

In [None]:
for i in range(3):
    print(label[i])
    print(sentence_a[i] + '\n---')
    print(sentence_b[i] + '\n')

1


Talking of lies
---
This male allegedly fled from Afghanistan after his family were killed

He likely travelled through 6 or 7 safe EU countries, only to almost drown in the English channel, why? 

Oh, apparently he is 12! 

   

0
 applications by Turks climbed to a new all-time high in October (7 400), continuing the steep rises in recent months: 

Three in four applicants were male, acc
---
 to Eurostat

1
 "male refugees"
Arrived from where? 
No clue, they tore up travel documents
---
 23% of all Ukrainian refugees are male aged between 20 and 60



In [None]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
inputs['labels'] = torch.LongTensor([label]).T

In [None]:
class MeditationsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [None]:
dataset = MeditationsDataset(inputs)
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr=5e-6)



In [None]:
epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 2/2 [00:25<00:00, 12.54s/it, loss=2.12]
Epoch 1: 100%|██████████| 2/2 [00:24<00:00, 12.23s/it, loss=2.03]


In [None]:
class IntersentenceEvaluator():
    def __init__(self, data, choices, model_name, model):
        self.data = data
        self.choices = choices
        self.model = model
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForNextSentencePrediction.from_pretrained(model_name)
        self.encodings = self.make_encodings()
    
    #Function to make predictions and calculate how often the biased sentences are chosen
    def run_model_and_evaluate(self):
        output = self.make_predictions()
        self.get_bias(output)
        
    def make_encodings(self):
        sent_encoding = []
        for index,row in self.data.iterrows():
            _sent_encoding = []
            #print(f"index: {index}")
            #print(f"row: {row}")
            for c in self.choices.keys():
                encoding = self.tokenizer(row.loc['sentence'], row.loc[c], return_tensors="pt")
                #print(f"row.loc['sentence']: {row.loc['sentence']}")
                #print(f"row.loc[c]: {row.loc[c]}")
                _sent_encoding.append(encoding)
            #print(f"_sent_encoding: {_sent_encoding}")
            sent_encoding.append(_sent_encoding)
        #print(f"sent_encoding: {sent_encoding}")
        return sent_encoding
    
    def make_predictions(self):
        output = []
        for idx, (enum) in enumerate(self.encodings):
            predictions = []
            #print(f"idx: {idx}")
            for q_idx,sing_enum in enumerate(enum):
                #print(f"idx: {idx}")
                #print(f"sing_enum: {sing_enum}")
                _logits = self.model(**sing_enum, labels=torch.LongTensor([1])).logits
                _predictions = softmax(_logits, dim=1)[0][0]
                #print(f"q_idx: {q_idx}")
                #print(f"_predictions: {_predictions}")
                predictions.append(_predictions.item())
                #print(f"predictions: {predictions}")
            output.append(np.argmax(predictions) + 1)
            #print(f"output: {output}")
        return output
    
    def get_bias(self,predictions):
        #print(f"predictions: {predictions}")
        biased, unbiased, unrelated = 0, 0, 0
        for pred in predictions:
            if pred == 1:
                biased +=1
            if pred == 2:
                unbiased += 1
            if pred == 3:
                unrelated += 1
        print(f"biased: {biased}")
        print(f"unbiased: {unbiased}")
        print(f"unrelated: {unrelated}")

In [None]:
choices = {'bias':1, 'unbiased':2,'unrelated':3} 
file_path = 'dataset/male_refugee_inter.csv' #"drive/MyDrive/Final_templates.csv"#"drive/MyDrive/New_templates.csv"
model_name = "bert-base-uncased"
templates = pd.read_csv(file_path, sep=";")
evaluate = IntersentenceEvaluator(templates.copy(), choices, model_name, model)
evaluate.run_model_and_evaluate()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


biased: 2
unbiased: 2
unrelated: 0
