In [1]:
#BERT
from transformers import AutoModel, AutoTokenizer, BertForMaskedLM, BertTokenizerFast
import torch
import os
import xml.etree.ElementTree as ET
import re
import random
import tqdm


from torch import nn
from torch.nn import functional as F
from torch import optim
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from sklearn.metrics import f1_score, accuracy_score

In [2]:
# Name of the pretrained checkpoint. It is used to load the tokenizer and is the
# default `model_name` argument of BERTForSequenceClassification.
model_name = "bert-base-uncased"

In [3]:
# Module-level tokenizer: dataset.__getitem__ reads this global to encode each sample.
tokenizer = BertTokenizerFast.from_pretrained(model_name)

In [4]:
# Run configuration shared by the cells below.
_config = {
    # Directories holding the per-author XML files and a truth.txt index.
    "path_train":"../../Data/train/en",
    "path_test":"../../Data/test/en",
    # Optimizer learning rate and number of passes over the training set.
    "lr":2e-5,
    "epochs":6,
    # DataLoader batch sizes.
    "train_batch_size":32,
    "test_batch_size":32,
    # Number of prediction passes that eval_model averages over.
    "n":20
}

In [5]:
class dataset(Dataset):
    """Dataset with one XML file of tweets per item.

    Every access re-reads the item's XML file, draws a random subset of its
    tweets, joins them into a single string and encodes that string with the
    module-level ``tokenizer``. Because of the random draw, the same index can
    produce a different sample on every access.

    Args:
        root_path: Directory containing the XML files.
        X: Sequence of XML file names, relative to ``root_path``.
        y: Optional sequence of raw labels aligned with ``X``. When omitted,
            items are returned without a ``"label"`` entry.
        context: Maximum number of tweets joined into one sample.
    """

    def __init__(self, root_path, X, y=None, context=30):
        self.root_path = root_path
        self.X = X
        self.y = y
        self.context = context

        self.get_classes()

    def __len__(self):
        """Return the number of XML files in the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Build one randomly sampled, encoded example.

        Returns:
            dict with ``"text"`` (token ids padded to 512), ``"attention"``
            (float mask, 1.0 where the token id is non-zero) and, when labels
            were supplied, ``"label"`` (class index tensor).
        """
        new_path = os.path.join(self.root_path, self.X[idx])
        tweets = self.get_text(new_path)
        # random.sample raises ValueError when asked for more items than
        # exist, so cap the draw at the number of tweets actually in the file.
        n_tweets = min(self.context, len(tweets))
        sample = " ".join(random.sample(tweets, n_tweets))
        # NOTE(review): `pad_to_max_length` is kept as in the original call;
        # newer transformers releases spell this `padding="max_length"` — confirm
        # against the installed version before changing it.
        encoded_msg = torch.tensor(tokenizer.encode(sample, max_length=512, pad_to_max_length=True))

        # The mask treats token id 0 as padding (relies on the tokenizer's pad id being 0).
        item = {"text": encoded_msg, "attention": (encoded_msg != 0).float()}
        if self.y is not None:
            item["label"] = torch.tensor(self.cl[self.y[idx]])
        return item

    def normalize(self, txt):
        """Strip URLs, '#' characters and @mentions, lowercase, and spell out '&amp;'.

        Empty or missing text (``None``, as produced by an empty XML element)
        normalizes to the empty string.
        """
        if not txt:
            return ""
        # `https?` also removes plain http:// links, which `https\S+` left in place.
        txt = re.sub(r'https?\S+', '', txt).lower()
        txt = re.sub("&amp;", "and", txt)
        txt = re.sub("#", "", txt)
        # Raw string: "\S" in a normal string literal is an invalid escape sequence.
        txt = re.sub(r"@\S+", "", txt)
        return txt

    def get_text(self, path):
        """Return the normalized text of every child of the XML root's first element."""
        return [self.normalize(r.text) for r in ET.parse(path).getroot()[0]]

    def get_classes(self):
        """Build ``self.cl``: raw label -> class index, in sorted label order.

        The mapping is derived from this instance's own labels; without labels
        it is empty.
        """
        cl = sorted(set(self.y)) if self.y is not None else []
        self.cl = {label: index for index, label in enumerate(cl)}

In [6]:
def get_raw_metadata(path, task = 0):
    """Read ``truth.txt`` from ``path`` and return file names and labels.

    Each non-blank line of ``truth.txt`` holds fields separated by ``:::``;
    the first field is an identifier and the following fields are labels.

    Args:
        path: Directory containing ``truth.txt``.
        task: Zero-based index of the label field to return (field ``task + 1``
            of each line).

    Returns:
        Tuple ``(files, labels)``: ``files`` is each identifier with ``.xml``
        appended, ``labels`` is the selected label field of each line.
    """
    # The context manager closes the handle. Reading line by line keeps the
    # last record even when the file has no trailing newline, and drops any
    # trailing '\r' so labels from CRLF files stay clean.
    with open(os.path.join(path, 'truth.txt')) as truth_file:
        records = [
            line.rstrip('\r\n').split(':::')
            for line in truth_file
            if line.strip()
        ]
    return [r[0] + '.xml' for r in records], [r[task + 1] for r in records]

In [7]:
class BERTForSequenceClassification(nn.Module):
    """Pretrained encoder followed by a single linear classification layer.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        num_classes: Number of output classes (default 2, as before).
    """

    def __init__(self, model_name=model_name, num_classes=2):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)

        # Kept so the module structure is unchanged; forward() does not apply it.
        self.drop = nn.Dropout(0.1)
        # Size the head from the encoder's config instead of hard-coding 768,
        # so checkpoints with a different hidden size work as well.
        self.clf = nn.Linear(self.bert.config.hidden_size, num_classes, bias=True)

    def forward(self, x, att):
        """Return unnormalized class scores.

        Args:
            x: Token-id tensor passed to the encoder.
            att: Attention mask passed to the encoder as ``attention_mask``.
        """
        # Index 1 of the encoder output feeds the classifier (for BERT models
        # this is the pooled output — see the transformers BertModel docs).
        pooled = self.bert(x, attention_mask=att)[1]
        # Dropout on the pooled vector is deliberately left disabled, as in
        # the original implementation.
        return self.clf(pooled)

In [8]:
def train_model(name, path, _config, train_set=None):
    """Fine-tune a fresh BERTForSequenceClassification and save its weights.

    Relies on the notebook-level ``device`` and, when ``train_set`` is not
    given, on the notebook-level ``train`` dataset.

    Args:
        name: File stem of the saved weights (``<name>.pt``).
        path: Directory the weights are written to; created if missing.
        _config: Mapping providing ``"train_batch_size"``, ``"lr"`` and ``"epochs"``.
        train_set: Optional dataset to train on; defaults to the global ``train``.
    """
    if train_set is None:
        train_set = train

    model = BERTForSequenceClassification().to(device)
    train_batches = DataLoader(train_set, batch_size=_config["train_batch_size"], shuffle=True, num_workers=10)

    lr = _config["lr"]
    max_grad_norm = 1.0

    optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)
    epochs = _config["epochs"]
    # The linear schedule decays the learning rate over every optimizer step.
    total_steps = len(train_batches) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

    criterio = nn.CrossEntropyLoss()

    # from_pretrained returns the encoder in eval mode (dropout disabled);
    # switch the whole model to training mode explicitly before optimizing.
    model.train()
    for epoch in tqdm.tqdm(range(epochs)):
        for sample in train_batches:
            optimizer.zero_grad()
            x, y, att = sample['text'].to(device), sample['label'].to(device), sample['attention'].to(device)
            y_pred = model(x, att)
            loss = criterio(y_pred, y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()

    # Make sure the target directory exists so a long run is not lost at save time.
    if path:
        os.makedirs(path, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(path, name + '.pt'))

In [9]:
def test_model(model, test_batches):
    """Evaluate ``model`` on labelled batches.

    Args:
        model: Module called as ``model(text, attention)`` returning class scores.
        test_batches: Iterable of dicts with ``"text"``, ``"attention"`` and
            ``"label"`` tensors.

    Returns:
        Tuple ``(accuracy, loss)``: accuracy from ``accuracy_score`` and the
        cross-entropy loss (a tensor) over all batches.
    """
    was_training = model.training
    model.eval()
    # Run on whatever device the model lives on (CPU if it has no parameters).
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    y_t = []
    scores = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for sample in tqdm.tqdm(test_batches):
            x, att = sample['text'].to(run_device), sample['attention'].to(run_device)
            scores.append(model(x, att).cpu())
            y_t.append(sample['label'].cpu())
    scores = torch.cat(scores)
    y_t = torch.cat(y_t)

    # Softmax is monotonic, so argmax of the raw scores equals argmax of the probabilities.
    y_p = scores.argmax(1)
    # CrossEntropyLoss applies log-softmax itself, so it must be fed the raw
    # scores; feeding softmax probabilities normalizes twice and distorts the loss.
    loss = nn.CrossEntropyLoss()(scores, y_t)

    # Restore the mode the caller had instead of forcing training mode.
    model.train(was_training)
    return accuracy_score(y_t, y_p), loss

def pred(model, test_batches):
    """Return softmax class probabilities and true labels for all batches.

    Args:
        model: Module called as ``model(text, attention)`` returning class scores.
        test_batches: Iterable of dicts with ``"text"``, ``"attention"`` and
            ``"label"`` tensors.

    Returns:
        Tuple ``(probabilities, labels)`` of CPU tensors, concatenated over
        the batches in iteration order.
    """
    was_training = model.training
    model.eval()
    # Run on whatever device the model lives on (CPU if it has no parameters).
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    y_t = []
    probs = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for sample in test_batches:
            x, att = sample['text'].to(run_device), sample['attention'].to(run_device)
            probs.append(F.softmax(model(x, att).cpu(), 1))
            y_t.append(sample['label'].cpu())
    probs = torch.cat(probs)
    y_t = torch.cat(y_t)

    # Restore the mode the caller had instead of forcing training mode.
    model.train(was_training)
    return probs, y_t

def get_truth(test_batches):
    """Concatenate the ``"label"`` tensor of every batch into one CPU tensor."""
    labels = [batch["label"].cpu() for batch in test_batches]
    return torch.cat(labels)

def get_logits(model, test_batches):
    """Return softmax class probabilities for all batches.

    Despite the name, the returned values are softmax-normalized, not raw
    scores.

    Args:
        model: Module called as ``model(text, attention)`` returning class scores.
        test_batches: Iterable of dicts with ``"text"`` and ``"attention"``
            tensors; a ``"label"`` entry is not required.

    Returns:
        CPU tensor of probabilities, concatenated over the batches in
        iteration order.
    """
    was_training = model.training
    model.eval()
    # Run on whatever device the model lives on (CPU if it has no parameters).
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    probs = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for sample in test_batches:
            x, att = sample['text'].to(run_device), sample['attention'].to(run_device)
            probs.append(F.softmax(model(x, att).cpu(), 1))
    probs = torch.cat(probs)

    # Restore the mode the caller had instead of forcing training mode.
    model.train(was_training)
    return probs

In [10]:
def eval_model(model, test_batches, n=5):
    """Average the predicted probabilities of ``n`` passes over ``test_batches``.

    After each pass the accuracy of the running sum of probabilities is
    printed.

    Args:
        model: Model forwarded to ``pred``.
        test_batches: Batches forwarded to ``pred``.
        n: Number of prediction passes to average; must be at least 1.

    Returns:
        Tuple ``(mean_probabilities, labels, accuracy)``. ``labels`` are those
        of the last pass, so ``test_batches`` is expected to yield examples in
        the same order on every pass.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    y_p = None
    y_t = None
    for _ in range(n):
        y, y_t = pred(model, test_batches)
        # Identity check: `==` on a tensor is an element-wise comparison, not a None test.
        y_p = y if y_p is None else y_p + y
        print(accuracy_score(y_t, y_p.argmax(1)))
    y_p = y_p / n
    return y_p, y_t, accuracy_score(y_t, y_p.argmax(1))

In [11]:
# Prefer the second GPU as before, but fall back to the first GPU or the CPU
# so the notebook still runs on hosts without a "cuda:1" device.
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# get_raw_metadata returns (file names, labels); both go into the dataset.
train_paths = get_raw_metadata(_config["path_train"])
train = dataset(_config["path_train"], train_paths[0], train_paths[1])
train_batches = DataLoader(train, batch_size=_config["train_batch_size"], shuffle=True, num_workers=10)

test_paths = get_raw_metadata(_config["path_test"])
test = dataset(_config["path_test"], test_paths[0], test_paths[1])
# Not shuffled, so labels come out in the same order on every pass.
test_batches = DataLoader(test, batch_size=_config["test_batch_size"], shuffle=False, num_workers=10)

In [12]:
# Train with the settings in _config; the weights are written to Models/model.pt.
train_model("model", "Models", _config)

100%|██████████| 6/6 [11:34<00:00, 115.70s/it]


In [13]:
# Load Model
model = BERTForSequenceClassification().to(device)
# map_location remaps the stored tensors onto `device`, so weights saved on a
# GPU that is not present on this host can still be loaded.
model.load_state_dict(torch.load("Models/model.pt", map_location=device))

<All keys matched successfully>

In [None]:
# Average predictions over _config["n"] passes; index [2] selects the accuracy
# of the averaged predictions.
eval_model(model, test_batches, _config["n"])[2]

0.8020833333333334
0.8195833333333333
