In [42]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score

In [43]:
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
#load dataset from csv
import pandas as pd
dataset = "cities_augm"
data = pd.read_csv("../data/own_data/"+dataset + ".csv")
data.head()

Unnamed: 0,statement,label,city,country,correct_country
0,The metropolis Krasnodar is located in Russia.,1,Krasnodar,Russia,Russia
1,The Krasnodar be in South Africa.,0,Krasnodar,South Africa,Russia
2,The city of Lodz is located in Poland.,1,Lodz,Poland,Poland
3,The city of Lodz is located in the Dominican R...,0,Lodz,the Dominican Republic,Poland
4,City Maracay is in Venezuela.,1,Maracay,Venezuela,Venezuela


In [59]:
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

class CitiesDatasets(Dataset):
    def __init__(self, df):
        self.data = df.reset_index(drop=True)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        statement = self.data.loc[index, 'statement']
        label = self.data.loc[index, 'label']
        #tokenize the statement
        inputs = tokenizer(statement, return_tensors="pt", truncation=True, padding=True)
        statement = inputs['input_ids'].squeeze(0)
        return statement, inputs['attention_mask'].squeeze(0), label
    
def collate_batch(batch):
    input_ids, attention_mask, labels = zip(*batch)
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    return input_ids,attention_mask, torch.tensor(labels) 

train_data , test_data = train_test_split(data, test_size=0.2)

train = CitiesDatasets(train_data)
test = CitiesDatasets(test_data)

train_loader = DataLoader(train, batch_size=32, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test, batch_size=32, shuffle=False, collate_fn=collate_batch)

In [60]:
from torch.optim import AdamW
import tqdm

optimizer = AdamW(model.parameters(), lr=1e-5)
model.train()

#use tqdm for progress bar with loss and accuracy
best_acc =0
for epoch in range(10):
    total_loss = 0
    total_accuracy = 0
    n_samples = 0
    with tqdm.tqdm(train_loader, unit="batch") as t:
        for i, (statements, attention_mask, labels) in enumerate(t):
            t.set_description(f"Epoch {epoch+1}")
            optimizer.zero_grad()
            outputs = model(statements, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            accuracy = accuracy_score(labels, outputs.logits.argmax(1))
            total_loss += loss.item()
            total_accuracy += accuracy
            n_samples += 1
            t.set_postfix(loss=total_loss/n_samples, accuracy=total_accuracy/n_samples)
        #compute test accuracy
        model.eval()
        with torch.no_grad():
            all_labels = []
            all_preds = []
            all_loss = 0
            for statements, attention_mask, labels in test_loader:
                outputs = model(statements, attention_mask=attention_mask, labels=labels)
                all_labels.extend(labels)
                all_preds.extend(outputs.logits.argmax(1))
                all_loss += outputs.loss.item()
            test_acc = accuracy_score(all_labels, all_preds)
            print(f"Test loss: {all_loss/len(test_loader)}")
            print(f"Test accuracy: {test_acc}")
        if best_acc < test_acc :
            best_acc = test_acc
            model.save_pretrained("../model/"+"Bert_augm_cities") 
        model.train()

Epoch 1: 100%|██████████| 38/38 [00:32<00:00,  1.18batch/s, accuracy=0.916, loss=0.225]


Test loss: 0.13894415106624364
Test accuracy: 0.9433333333333334


Epoch 2: 100%|██████████| 38/38 [00:35<00:00,  1.08batch/s, accuracy=0.947, loss=0.122]


Test loss: 0.12405064860358835
Test accuracy: 0.9433333333333334


Epoch 3: 100%|██████████| 38/38 [00:35<00:00,  1.07batch/s, accuracy=0.969, loss=0.0792]


Test loss: 0.12414170932024718
Test accuracy: 0.9466666666666667


Epoch 4: 100%|██████████| 38/38 [00:36<00:00,  1.05batch/s, accuracy=0.984, loss=0.054] 


Test loss: 0.126890460960567
Test accuracy: 0.9533333333333334


Epoch 5: 100%|██████████| 38/38 [00:36<00:00,  1.04batch/s, accuracy=0.978, loss=0.0484]


Test loss: 0.17755506802350282
Test accuracy: 0.94


Epoch 6: 100%|██████████| 38/38 [00:37<00:00,  1.02batch/s, accuracy=0.989, loss=0.0326]


Test loss: 0.15939298036973923
Test accuracy: 0.9533333333333334


Epoch 7: 100%|██████████| 38/38 [00:36<00:00,  1.03batch/s, accuracy=0.989, loss=0.0373]


Test loss: 0.16519492778461425
Test accuracy: 0.95


Epoch 8: 100%|██████████| 38/38 [00:37<00:00,  1.02batch/s, accuracy=0.99, loss=0.038]  


Test loss: 0.14114328047726304
Test accuracy: 0.9533333333333334


Epoch 9: 100%|██████████| 38/38 [00:36<00:00,  1.05batch/s, accuracy=0.992, loss=0.0364]


Test loss: 0.15087447564583273
Test accuracy: 0.9466666666666667


Epoch 10: 100%|██████████| 38/38 [00:35<00:00,  1.07batch/s, accuracy=0.998, loss=0.0134]


Test loss: 0.14546704717213288
Test accuracy: 0.9466666666666667


In [96]:
# Load saved model
model = BertForSequenceClassification.from_pretrained("../model/"+"Bert_counterfact_cities")
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [103]:
import pandas as pd
dataset = "cities_augm"
data = pd.read_csv("../data/own_data/"+dataset + ".csv")
data.head()

Unnamed: 0,statement,label,city,country,correct_country
0,The metropolis Krasnodar is located in Russia.,1,Krasnodar,Russia,Russia
1,The Krasnodar be in South Africa.,0,Krasnodar,South Africa,Russia
2,The city of Lodz is located in Poland.,1,Lodz,Poland,Poland
3,The city of Lodz is located in the Dominican R...,0,Lodz,the Dominican Republic,Poland
4,City Maracay is in Venezuela.,1,Maracay,Venezuela,Venezuela


In [104]:
test = CitiesDatasets(data)
test_loader = DataLoader(test, batch_size=32, shuffle=False, collate_fn=collate_batch)

In [105]:
with torch.no_grad():
    correct = 0
    total = 0
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print("Accuracy:", accuracy)

Accuracy: 0.9151069518716578
