In [None]:
!pip install -q datasets sentencepiece # to convert slow tokenizer to a fast one

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pytz, datetime
import os
import re
import random
import pandas as pd
import numpy as np
from sklearn import metrics
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download("book")
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from datasets import load_dataset
import transformers
from transformers import get_linear_schedule_with_warmup #, AdamW

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to /root/nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to /root/nltk_data...
[nltk_data]    |   Package conll2002 is already up-to-date!
[nltk_data]    | Downloading package dependency_treebank to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package dependency_treebank is already up-to-date!
[nltk_data]    | Downloadi

In [None]:
seed = 42

# https://wandb.ai/sauravmaheshkar/RSNA-MICCAI/reports/How-to-Set-Random-Seeds-in-PyTorch-and-Tensorflow--VmlldzoxMDA2MDQy
def set_seed(seed: int = seed) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

set_seed()

In [None]:
timezone = pytz.timezone("Europe/Rome")
utc_now = pytz.utc.localize(datetime.datetime.utcnow())
pst_now = utc_now.astimezone(pytz.timezone("Europe/Rome"))

###Setting Configuration

In [None]:
mapping = {
    0: 'true',
    1: 'false'
}

config = {
    'learning_rate': 5e-4 ,
    'batch_size': 16,
    'epochs': 5,
    'dropout': 0.45,
    'tokenizer_max_len': 128
}

device = torch.device(0 if torch.cuda.is_available() else 'cpu')


#model_name = "bert-base-uncased"
#model_name = "microsoft/deberta-v2-xlarge-mnli" # Doesn't fit in memory
model_name = "roberta-base"
#model_name = "distilbert-base-uncased"


GOLD_LABEL = 'label'
TEXT = 'statement'
path = "/content/drive/MyDrive/Challenge NLP/models/" + model_name.replace("/", "-")
n_labels = len(mapping)

###Pre-processing

### Count tokens in statements

In [None]:
dataset = load_dataset("liar")

#0 - False
#1 - Half-true
#2 - Mostly-true
#3 - True
#4 - Barely-true
#5 - Pants-fire


def preprocessing(df):

    # Remove half-true
    #df = df[df['label'].isin([0,2,3,4,5])]

    # Remove half-true, mostly-true and barely-true
    df = df[df['label'].isin([0,3,5])]


    # Labels mapping:
    # barely-true -> false
    # pants-fire -> true
    # mostly-true -> true
    # half-true -> true
    df['label'] = df['label'].map({5: 3, 4: 0, 2: 3, 1: 3})


    # Text cleaning
    stop_words = set(stopwords.words('english'))

    for index, row in df.iterrows():
        # To lower case
        df.loc[index, "statement"] = df.loc[index, "statement"].lower()

        # Remove punctuation
        df.loc[index, "statement"] = re.sub(r'[^\w\s]', '', df.loc[index, "statement"])

        # Remove stopwords
        word_tokens = word_tokenize(df.loc[index, "statement"])
        filtered_sentence = [w for w in word_tokens if w not in stop_words]
        df.loc[index, "statement"] = ' '.join(filtered_sentence)

        # Remove number assuming a fake or true news is not usually determined by the numbers presented
        df.loc[index, "statement"] = re.sub("\d+", " ", df.loc[index, "statement"])

        # Remove extra spaces
        df.loc[index, "statement"] = re.sub("\s+", " ", df.loc[index, "statement"])



    df['label'] = ['true' if ele == 3 else 'false' for ele in df['label']]
    return df.dropna().reset_index()


train = preprocessing(pd.DataFrame(data=dataset['train']))
validation = preprocessing(pd.DataFrame(data=dataset['validation']))
test = eval = preprocessing(pd.DataFrame(data=dataset['test']))

print(len(train),len(eval),len(test)) #3681 461 461

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].map({5: 3, 4: 0, 2: 3, 1: 3})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = ['true' if ele == 3 else 'false' for ele in df['label']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].map({5: 3, 4: 0, 2: 3, 1: 3})
A value is trying to b

4523 553 553


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = ['true' if ele == 3 else 'false' for ele in df['label']]


In [None]:
train["statement"].head(2)

NameError: name 'train' is not defined

In [None]:
print("Number of false: " + str(train[train['label'] == "false"]['label'].count()))
print("Number of true: " + str(train[train['label'] == "true"]['label'].count()))
print("Number of barely-true: " + str(train[train['label'] == "barely-true"]['label'].count()))
print("Number of half-true: " + str(train[train['label'] == "half-true"]['label'].count()))
print("Number of mostly-true: " + str(train[train['label'] == "mostly-true"]['label'].count()))
print("Number of pants-fire: " + str(train[train['label'] == "pants-fire"]['label'].count()))

Number of false: 3681
Number of true: 842
Number of barely-true: 0
Number of half-true: 0
Number of mostly-true: 0
Number of pants-fire: 0


In [None]:
train.sample(1)

Unnamed: 0,index,id,label,statement,subject,speaker,job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
2619,5918,2621.json,True,rick scott doesnt ties lobbyist,candidates-biography,jennifer-carroll,Lieutenant governor,Florida,republican,0.0,1.0,0.0,0.0,1.0,a speech


In [None]:
def mapping_dataset(dataset, mapping):
    for index,row in dataset.iterrows():
        number_label = [k for label in row[GOLD_LABEL].split(',') for k,v in mapping.items() if label.strip() == v]
        dataset.loc[index, GOLD_LABEL] = str(number_label)

    return dataset

def one_hot_encoder(df):
    one_hot_encoding = []
    for i in tqdm(range(len(df)), desc='Loading:',disable=True):
        temp = [0]*n_labels
        label_indices = list(df.iloc[i][GOLD_LABEL][1:-1].split(', '))
        for index in label_indices:
            temp[int(index)] = 1
        one_hot_encoding.append(temp)

    return pd.DataFrame(one_hot_encoding)

map_train = mapping_dataset(train, mapping)
map_validation = mapping_dataset(validation, mapping)
train = pd.concat([map_train, one_hot_encoder(map_train)], axis=1)
valid = pd.concat([map_validation, one_hot_encoder(map_validation)], axis=1)


In [None]:
train.sample(1)

Unnamed: 0,index,id,label,statement,subject,speaker,job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,0,1
569,1272,10087.json,[0],federal tax refunds delayed october,taxes,chain-email,,,none,11.0,43.0,8.0,5.0,105.0,a chain email,1,0


###Model Loading and Configuration

####Tokenizer

In [None]:
class LiarDataset:
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels

        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        inputs = self.tokenizer.__call__(text,
                                         None,
                                         add_special_tokens=True,
                                         max_length=self.max_len,
                                         padding="max_length",
                                         truncation=True,
                                         )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "labels": torch.tensor(label, dtype=torch.long)
        }

####Model Architecture

In [None]:
class Classifier(nn.Module):
    def __init__(self, n_classes, do_prob, bert_model):
        super(Classifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(do_prob)
        self.out = nn.Linear(768, n_classes) # bert-base-uncased, roberta, distilbert
        #self.out = nn.Linear(1536, config["batch_size"]*config["tokenizer_max_len"]) # microsoft/deberta-v2-xlarge-mnli


    def forward(self, ids, mask):
        output_1 = self.bert(ids, attention_mask=mask)["pooler_output"] # bert-base-uncased, roberta
        #output_1 = self.bert(ids, attention_mask=mask)["last_hidden_state"] # microsoft/deberta-v2-xlarge-mnli, distilbert
        output_2 = self.dropout(output_1)
        output = self.out(output_2)
        return output

In [None]:
#bert_model = transformers.AutoModel.from_pretrained(model_name)
#model = Classifier(n_labels, config['dropout'], bert_model=bert_model)

####Training

In [None]:
def build_dataset(tokenizer_max_len, train, valid):
    train_dataset = LiarDataset(list(train[TEXT]), train[range(n_labels)].values.tolist(), tokenizer, tokenizer_max_len)
    valid_dataset = LiarDataset(list(valid[TEXT]), valid[range(n_labels)].values.tolist(), tokenizer, tokenizer_max_len)

    return train_dataset, valid_dataset

def build_dataloader(train_dataset, valid_dataset, batch_size):
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True, num_workers=1)

    return train_data_loader, valid_data_loader

def loss_function (outputs, labels):
    #return nn.BCELoss()(outputs, labels.float()) # https://pytorch.org/docs/stable/generated/torch.nn.BCELoss.html
    return nn.MultiLabelSoftMarginLoss()(outputs, labels.float()) # https://pytorch.org/docs/stable/generated/torch.nn.MultiLabelSoftMarginLoss.html

def log_metrics(preds, labels):
    preds = torch.stack(preds)
    preds = preds.cpu().detach().numpy()
    labels = torch.stack(labels)
    labels = labels.cpu().detach().numpy()

    labels = labels.argmax(axis=1)
    pred_f1 = preds.argmax(axis = 1)

    return {"precision": metrics.precision_score(labels, pred_f1),
            "recall": metrics.recall_score(labels, pred_f1),
            "f1": metrics.f1_score(labels, pred_f1)}

def train_fn(data_loader, model, optimizer, device):
    train_loss = 0.0
    model.train()
    for d in tqdm(data_loader):
        ids = d["ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        targets = d["labels"].to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask)

        loss = loss_function(outputs, targets)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    return train_loss

def eval_fn(data_loader, model, device):
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader)):
            ids = d["ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            targets = d["labels"].to(device, dtype=torch.long)

            outputs = model(ids=ids, mask=mask)
            fin_targets.extend(targets)
            fin_outputs.extend(torch.sigmoid(outputs))
    return fin_outputs, fin_targets


In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
bert_model = transformers.AutoModel.from_pretrained(model_name)

def trainer(config):
        train_dataset, valid_dataset = build_dataset(config['tokenizer_max_len'],train,valid)
        train_data_loader, valid_data_loader = build_dataloader(train_dataset, valid_dataset, config['batch_size'])

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = Classifier(n_labels, config['dropout'], bert_model=bert_model)
        model.to(device)

        optimizer = AdamW(model.parameters(), lr = config['learning_rate'])
        n_epochs = config['epochs']

        for epoch in tqdm(range(n_epochs),desc='Loading:',disable=True):

            train_loss = train_fn(train_data_loader, model, optimizer, device) #, scheduler)
            preds, labels = eval_fn(valid_data_loader, model, device)

            res_metrics = log_metrics(preds, labels)
            precision = res_metrics["precision"]
            recall = res_metrics["recall"]
            f1 = res_metrics["f1"]
            avg_train_loss = train_loss / len(train_data_loader)

            print("\nF1-score: ", f1, "Average Train loss: ", avg_train_loss)

            print("\n--------- Epoch {} finished ---------\n".format(epoch))


            if epoch == config["epochs"]-1:
                torch.save(model.state_dict(),
                           f'{path}-epoch_{epoch}-lr_{config["learning_rate"]}-batch_{config["batch_size"]}-drop_{config["dropout"]}-maxlen_{config["tokenizer_max_len"]}-f1_{round(f1, 5)}-{pst_now.strftime("%Y-%m-%d_%H-%M-%S")}.pt' )



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer(config)

100%|██████████| 134/134 [01:30<00:00,  1.48it/s]
17it [00:03,  4.72it/s]



F1-score:  0.8816326530612245 Average Train loss:  0.4967743122755592

--------- Epoch 0 finished ---------



100%|██████████| 134/134 [01:29<00:00,  1.49it/s]
17it [00:03,  4.69it/s]



F1-score:  0.8816326530612245 Average Train loss:  0.4877442367263694

--------- Epoch 1 finished ---------



100%|██████████| 134/134 [01:30<00:00,  1.49it/s]
17it [00:03,  4.52it/s]



F1-score:  0.8816326530612245 Average Train loss:  0.48588270838580916

--------- Epoch 2 finished ---------



100%|██████████| 134/134 [01:29<00:00,  1.49it/s]
17it [00:03,  4.67it/s]



F1-score:  0.8816326530612245 Average Train loss:  0.4928605260688867

--------- Epoch 3 finished ---------



100%|██████████| 134/134 [01:29<00:00,  1.49it/s]
17it [00:03,  4.69it/s]



F1-score:  0.8816326530612245 Average Train loss:  0.4881835111708783

--------- Epoch 4 finished ---------

