## Toponym Interlinking using BERT

Load the dataset files. The first file is split 80/20 into train and validation sets; the second file is used as the test set.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

def _read_pairs(path):
    """Read a '|'-delimited, header-less pair file and drop incomplete rows.

    The three columns are named s1, s2 and label; the index is renumbered
    after rows with missing values have been removed.
    """
    frame = pd.read_csv(path, delimiter='|', names=['s1', 's2', 'label'])
    return frame.dropna().reset_index(drop=True)


train_csv = 'data/geonames_1.csv'
test_csv = 'data/geonames_2.csv'

# 80/20 train/validation split of the first file, with a fixed random state
# so the same rows land in each split on every run.
train_df, val_df = train_test_split(_read_pairs(train_csv), test_size=0.2, random_state=2020)

# The second file is kept whole and serves as the test set.
test_df = _read_pairs(test_csv)

print('Number of train instances:', len(train_df))
print('Number of val instances:', len(val_df))
print('Number of test instances:', len(test_df))

Number of train instances: 1999994
Number of val instances: 499999
Number of test instances: 2499991


Save the resulting train, validation and test sets so that the other approaches can use exactly the same data.

In [2]:
# Write each split to its own CSV file, without the pandas index column.
for split_df, out_path in (
    (train_df, 'data/train.csv'),
    (val_df, 'data/val.csv'),
    (test_df, 'data/test.csv'),
):
    split_df.to_csv(out_path, index=False)

Extract pair instances as well as their labels.

In [3]:
# Column names of the two toponyms that make up a pair.
pair_columns = ['s1', 's2']

# For every split: a 2-column array of toponym pairs and the matching labels.
train_texts, train_labels = train_df[pair_columns].values, train_df['label'].values
val_texts, val_labels = val_df[pair_columns].values, val_df['label'].values
test_texts, test_labels = test_df[pair_columns].values, test_df['label'].values

Encode the toponym pairs with BertTokenizer. This produces the inputs the BERT model expects (input ids, token type ids and attention masks), padded to 128 tokens per pair.

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


def _encode_pairs(pairs):
    """Encode every (first, second) toponym pair with the shared tokenizer.

    Returns one encoding per pair, in input order, each produced with a
    maximum length of 128 and padding up to that length.
    """
    return [
        tokenizer.encode_plus(text=first, text_pair=second, max_length=128, pad_to_max_length=True)
        for first, second in pairs
    ]


train_encoded_dict = _encode_pairs(train_texts)
val_encoded_dict = _encode_pairs(val_texts)
test_encoded_dict = _encode_pairs(test_texts)

In [None]:
# Fields taken from every encoding, in the order they are unpacked below.
encoded_fields = ('input_ids', 'token_type_ids', 'attention_mask')

# For each split, turn the list of per-pair encodings into one list per field.
train_input_ids, train_token_type_ids, train_attention_masks = (
    [encoding[field] for encoding in train_encoded_dict] for field in encoded_fields
)

val_input_ids, val_token_type_ids, val_attention_masks = (
    [encoding[field] for encoding in val_encoded_dict] for field in encoded_fields
)

test_input_ids, test_token_type_ids, test_attention_masks = (
    [encoding[field] for encoding in test_encoded_dict] for field in encoded_fields
)

Convert everything to tensors.

In [None]:
import torch

# Each name is rebound to its tensor straight away, so the Python-side
# original can be released before the next conversion starts.

# Labels.
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)
test_labels = torch.tensor(test_labels)

# Token ids.
train_input_ids = torch.tensor(train_input_ids)
val_input_ids = torch.tensor(val_input_ids)
test_input_ids = torch.tensor(test_input_ids)

# Segment (token type) ids.
train_token_type_ids = torch.tensor(train_token_type_ids)
val_token_type_ids = torch.tensor(val_token_type_ids)
test_token_type_ids = torch.tensor(test_token_type_ids)

# Attention masks.
train_attention_masks = torch.tensor(train_attention_masks)
val_attention_masks = torch.tensor(val_attention_masks)
test_attention_masks = torch.tensor(test_attention_masks)

Prepare the dataloaders.

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

def _build_loader(sampler_cls, *tensors):
    """Wrap tensors in a TensorDataset and pair it with a sampler and loader.

    Returns the (dataset, sampler, dataloader) triple; the loader yields
    batches of 32 in the order chosen by a `sampler_cls` instance.
    """
    dataset = TensorDataset(*tensors)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=32)
    return dataset, sampler, loader


# Training batches are drawn in random order.
train_data, train_sampler, train_dataloader = _build_loader(
    RandomSampler, train_input_ids, train_token_type_ids, train_attention_masks, train_labels
)

# Validation and test batches keep the dataset order.
val_data, val_sampler, val_dataloader = _build_loader(
    SequentialSampler, val_input_ids, val_token_type_ids, val_attention_masks, val_labels
)

test_data, test_sampler, test_dataloader = _build_loader(
    SequentialSampler, test_input_ids, test_token_type_ids, test_attention_masks, test_labels
)

Model definition. We will use BertForSequenceClassification to carry out the binary classification task.

In [None]:
from transformers import BertForSequenceClassification

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained BERT with a two-class classification head; attention weights
# and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

model.to(device)

Definition of loss function and optimizer.

In [None]:
from torch import nn
from transformers import AdamW

# The model produces two logits per pair and the labels are class indices
# (predictions are later obtained with argmax over the logits), so the
# matching criterion is cross-entropy. BCEWithLogitsLoss expects targets with
# the same shape as the logits and would fail on these inputs.
# The training loop takes the loss returned by the model itself, so
# `loss_func` is only needed when the loss is recomputed from logits by hand.
loss_func = nn.CrossEntropyLoss()
loss_func.to(device)

# AdamW over all model parameters.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

In [None]:
from transformers import get_linear_schedule_with_warmup

epochs = 4

# The scheduler is stepped once per batch, so the schedule length is the
# number of batches per epoch times the number of epochs.
total_steps = epochs * len(train_dataloader)

# Linear schedule with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

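Use a fixed seed to make the experiment reproducible. Note that this cell runs after the model has been created, so the random initialisation of the classification head is not covered by the seed.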

In [None]:
import numpy as np
import random

seed_val = 2020

# Seed every random number generator in play with the same value: Python's
# `random`, NumPy, and PyTorch on the CPU and on all CUDA devices.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(seed_val)

Training code.

In [None]:
%%time

# Fine-tune the model for `epochs` passes over the training data.
for epoch_i in range(epochs):
    model.train()
    total_loss = 0.0

    for batch in train_dataloader:
        # Batch layout follows the TensorDataset: ids, segment ids, masks, labels.
        b_input_ids, b_token_type_ids, b_attention_masks, b_labels = (
            t.to(device) for t in batch
        )

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # With labels supplied, the first element of the output is the loss.
        outputs = model(b_input_ids, token_type_ids=b_token_type_ids,
                        attention_mask=b_attention_masks, labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()

        loss.backward()
        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

    # Report the epoch's mean loss; previously it was computed and then
    # overwritten without ever being shown.
    avg_train_loss = total_loss / len(train_dataloader)
    print('Epoch %d/%d - average training loss: %.4f' % (epoch_i + 1, epochs, avg_train_loss))

Run the validation data through the trained model to extract predictions.

In [None]:
# Switch to evaluation mode before scoring the validation set.
model.eval()
val_preds = []
val_true_labels = []

for input_ids, token_type_ids, attention_masks, labels in val_dataloader:
    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(
            input_ids.to(device),
            token_type_ids=token_type_ids.to(device),
            attention_mask=attention_masks.to(device),
        )

    # Collect one array of logits and one array of labels per batch, on the CPU.
    val_preds.append(outputs[0].detach().cpu().numpy())
    val_true_labels.append(labels.cpu().numpy())

Save val predictions in order to use them in the ensemble methods. Calculate val accuracy score.

In [None]:
from sklearn.metrics import accuracy_score

import os

# Each list entry holds one batch; join the batches along the batch axis with
# a single NumPy call instead of flattening them row by row in Python.
val_logits = np.concatenate(val_preds, axis=0)
val_true_labels = np.concatenate(val_true_labels, axis=0)

# np.save does not create missing directories, so make sure the target
# folder exists before writing the raw logits for the ensemble methods.
os.makedirs('preds', exist_ok=True)
np.save('preds/bert_val_preds.npy', val_logits)

# The predicted class is the index of the larger logit.
val_preds = np.argmax(val_logits, axis=1)

val_acc = accuracy_score(val_true_labels, val_preds)

print('Validation Accuracy: %.3f' % val_acc)

Run test data through the trained model to extract predictions.

In [None]:
# Switch to evaluation mode before scoring the test set.
model.eval()
test_preds = []
test_true_labels = []

for input_ids, token_type_ids, attention_masks, labels in test_dataloader:
    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(
            input_ids.to(device),
            token_type_ids=token_type_ids.to(device),
            attention_mask=attention_masks.to(device),
        )

    # Collect one array of logits and one array of labels per batch, on the CPU.
    test_preds.append(outputs[0].detach().cpu().numpy())
    test_true_labels.append(labels.cpu().numpy())

Save test predictions in order to use them in the ensemble methods. Calculate test accuracy score.

In [None]:
import os

# Each list entry holds one batch; join the batches along the batch axis with
# a single NumPy call instead of flattening them row by row in Python.
test_logits = np.concatenate(test_preds, axis=0)
test_true_labels = np.concatenate(test_true_labels, axis=0)

# np.save does not create missing directories, so make sure the target
# folder exists before writing the raw logits for the ensemble methods.
os.makedirs('preds', exist_ok=True)
np.save('preds/bert_test_preds.npy', test_logits)

# The predicted class is the index of the larger logit.
test_preds = np.argmax(test_logits, axis=1)

test_acc = accuracy_score(test_true_labels, test_preds)

print('Test Accuracy: %.3f' % test_acc)