# NLP fake news challenge
## Giovanni Spadaro

In [2]:
# Mount Google Drive at /content/drive; the data and checkpoint paths used
# later in this notebook all live under that mount point (Colab-only step).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pytz, datetime
import os
import re
import random
import pandas as pd
import numpy as np
from sklearn import metrics
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download("book")
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from datasets import load_dataset
import transformers
from transformers import get_linear_schedule_with_warmup #, AdamW

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to /root/nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to /root/nltk_data...
[nltk_data]    |   Package conll2002 is already up-to-date!
[nltk_data]    | Downloading package dependency_treebank to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package dependency_treebank is already up-to-date!
[nltk_data]    | Downloadi

In [4]:
seed = 42


# Recipe adapted from:
# https://wandb.ai/sauravmaheshkar/RSNA-MICCAI/reports/How-to-Set-Random-Seeds-in-PyTorch-and-Tensorflow--VmlldzoxMDA2MDQy
def set_seed(seed: int = seed) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds NumPy, Python's ``random`` module and PyTorch (CPU and CUDA),
    switches cuDNN to deterministic mode with benchmarking disabled, stores
    the seed in the ``PYTHONHASHSEED`` environment variable and prints a
    confirmation line.

    Args:
        seed: Value handed to each generator; defaults to the module-level
            ``seed`` defined above.
    """
    # Same order as always: NumPy, stdlib random, torch CPU, torch CUDA.
    for seeder in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # cuDNN needs both switches set for repeatable results.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Record the hash seed in the environment.
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")


set_seed()

Random seed set as 42


In [20]:
# Class index -> class name used by the label helpers below.
mapping = {0: 'true', 1: 'false'}

# Hyper-parameters shared by dataset construction, the model and the loaders.
config = dict(
    learning_rate=5e-4,
    batch_size=16,
    epochs=5,
    dropout=0.45,
    tokenizer_max_len=128,
)

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device(0 if torch.cuda.is_available() else 'cpu')
print(f"Running on {device}")

# Backbone checkpoint. Alternatives that were tried:
#   "bert-base-uncased"
#   "microsoft/deberta-v2-xlarge-mnli"  (doesn't fit in memory)
#   "distilbert-base-uncased"
model_name = "roberta-base"

# Column names: gold label and input text.
GOLD_LABEL = 'label'
TEXT = 'News'

# Model directory on Drive; slashes in hub names are replaced so the name stays a single path component.
path = f"/content/drive/MyDrive/Challenge NLP/models/{model_name.replace('/', '-')}"
n_labels = len(mapping)

Running on cuda:0


In [21]:
# Load the pretrained tokenizer for `model_name`.
# NOTE(review): `do_lower_case=True` is forwarded as-is; whether the tokenizer
# of the selected checkpoint honours it is not visible here — confirm.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
# Load the pretrained encoder; the Classifier defined below wraps it with
# dropout and a linear output layer.
bert_model = transformers.AutoModel.from_pretrained(model_name)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Testing

In [22]:
def preprocessing(df):
    """Filter, relabel and clean a dataframe of labelled statements.

    Steps:
      1. Keep only rows whose numeric ``label`` is 0, 3 or 5.
      2. Fold the remaining label codes together (5 -> 3, 4 -> 0, 2 -> 3,
         1 -> 3); codes not listed keep their value.
      3. Clean ``statement``: lower-case, strip punctuation, drop English
         stop words, replace digit runs with a space, collapse whitespace.
      4. Convert the numeric label to the strings ``'true'`` (code 3) or
         ``'false'`` (anything else).

    Args:
        df: DataFrame with a numeric ``label`` column and a string
            ``statement`` column. The caller's frame is not modified.

    Returns:
        A new DataFrame with rows containing NaN dropped and the index reset
        (the previous index is kept as an ``index`` column).
    """
    # Remove half-true, mostly-true and barely-true.
    # `.copy()` so the column assignments below write to an independent frame
    # rather than a slice of the caller's data (avoids SettingWithCopyWarning).
    df = df[df['label'].isin([0, 3, 5])].copy()

    # Labels mapping:
    # barely-true -> false
    # pants-fire -> true
    # mostly-true -> true
    # half-true -> true
    # NOTE(review): folding code 5 into 3 ('true') follows the comment above;
    # confirm this is really the intended direction for that class.
    #
    # `replace` is used instead of `map`: `Series.map` with a dict turns every
    # value that is not a key (here 0 and 3) into NaN, which made all original
    # label-3 rows come out as 'false' in the final step.
    df['label'] = df['label'].replace({5: 3, 4: 0, 2: 3, 1: 3})

    # Text cleaning
    stop_words = set(stopwords.words('english'))

    def _clean(statement):
        """Normalise a single statement string."""
        # To lower case
        text = statement.lower()
        # Remove punctuation
        text = re.sub(r'[^\w\s]', '', text)
        # Remove stopwords
        text = ' '.join(w for w in word_tokenize(text) if w not in stop_words)
        # Remove numbers, assuming fake or true news is not usually
        # determined by the numbers presented
        text = re.sub(r"\d+", " ", text)
        # Remove extra spaces
        return re.sub(r"\s+", " ", text)

    # One pass over the column instead of repeated per-row `.loc` reads/writes.
    df['statement'] = df['statement'].map(_clean)

    df['label'] = ['true' if ele == 3 else 'false' for ele in df['label']]
    return df.dropna().reset_index()


In [38]:
def mapping_dataset(dataset, mapping):
    """Replace textual labels with the stringified list of their numeric keys.

    For every row, the GOLD_LABEL cell is split on commas; each piece is
    stripped and matched against the values of ``mapping``. The keys of all
    matches are collected in order and written back, in place, as
    ``str(list_of_keys)`` (for example ``"[0]"``).

    Args:
        dataset: DataFrame whose GOLD_LABEL column holds comma-separated
            label names. It is modified in place.
        mapping: Dict from numeric key to label name.

    Returns:
        The same ``dataset`` object.
    """
    for row_label, record in dataset.iterrows():
        names = [piece.strip() for piece in record[GOLD_LABEL].split(',')]
        codes = [key for name in names for key, value in mapping.items() if name == value]
        dataset.loc[row_label, GOLD_LABEL] = str(codes)

    return dataset

def one_hot_encoder(df):
    """Turn stringified index lists in GOLD_LABEL into multi-hot rows.

    Each GOLD_LABEL cell is expected to look like ``"[0]"`` or ``"[0, 1]"``:
    the surrounding brackets are sliced off, the remainder is split on
    ``', '`` and every resulting integer marks a 1 in a vector of length
    ``n_labels``.

    Args:
        df: DataFrame with a GOLD_LABEL column of such strings.

    Returns:
        A new DataFrame with one row per input row and ``n_labels`` columns.
    """
    vectors = []
    # The progress bar is disabled; the wrapper only drives the iteration.
    for position in tqdm(range(len(df)), desc='Loading:', disable=True):
        serialized = df.iloc[position][GOLD_LABEL]
        vector = [0] * n_labels
        for token in serialized[1:-1].split(', '):
            vector[int(token)] = 1
        vectors.append(vector)

    return pd.DataFrame(vectors)

In [55]:
class LiarDataset:
    """Map-style dataset that tokenizes one text per item (no labels).

    Args:
        texts: Indexable collection of input strings.
        tokenizer: Callable invoked as ``tokenizer(text, None, **options)``
            that returns a mapping with ``"input_ids"`` and
            ``"attention_mask"`` entries.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, tokenizer, max_len):
        self.texts, self.tokenizer, self.max_len = texts, tokenizer, max_len

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize the text at ``index``.

        Returns:
            Dict with ``"ids"`` and ``"mask"`` as ``torch.long`` tensors.
        """
        encoding = self.tokenizer(
            self.texts[index],
            None,  # no second sequence
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        fields = (("ids", "input_ids"), ("mask", "attention_mask"))
        return {
            out_key: torch.tensor(encoding[src_key], dtype=torch.long)
            for out_key, src_key in fields
        }

In [64]:
class Classifier(nn.Module):
    """Encoder backbone followed by dropout and a linear output layer.

    Args:
        n_classes: Number of output units of the final linear layer.
        do_prob: Dropout probability applied to the pooled encoder output.
        bert_model: Encoder module called as ``bert_model(ids,
            attention_mask=mask)`` whose result exposes ``"pooler_output"``
            (bert-base-uncased, roberta).
    """

    def __init__(self, n_classes, do_prob, bert_model):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(do_prob)
        # Take the input width from the backbone's config when it exposes one,
        # so wider or narrower encoders work without editing this class.
        # Fall back to 768 (bert-base-uncased, roberta, distilbert) otherwise.
        hidden_size = getattr(getattr(bert_model, "config", None), "hidden_size", 768)
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, ids, mask):
        """Return the linear layer's output for a batch of token ids.

        Args:
            ids: Token-id tensor passed to the encoder.
            mask: Attention mask passed to the encoder as ``attention_mask``.

        Returns:
            Tensor produced by the output layer; no activation is applied here.
        """
        # Backbones without a pooler (e.g. distilbert, deberta-v2) expose only
        # "last_hidden_state" and would need a different pooling step here.
        pooled = self.bert(ids, attention_mask=mask)["pooler_output"]
        return self.out(self.dropout(pooled))

In [65]:
def build_test_dataset(tokenizer_max_len, test):
    """Create a LiarDataset over the TEXT column of ``test``.

    Args:
        tokenizer_max_len: Sequence length passed to the dataset as ``max_len``.
        test: DataFrame containing the TEXT column.

    Returns:
        A LiarDataset built with the module-level ``tokenizer``.
    """
    texts = list(test[TEXT])
    return LiarDataset(texts=texts, tokenizer=tokenizer, max_len=tokenizer_max_len)

def build_test_dataloader(test_dataset, batch_size):
    """Wrap the test dataset in a DataLoader that preserves sample order.

    Args:
        test_dataset: Map-style dataset to iterate over.
        batch_size: Number of samples per batch.

    Returns:
        A DataLoader using two worker processes and no shuffling.
    """
    # Inference output is matched to the input rows by position only, so the
    # loader must not shuffle: with shuffling on, line i of the predictions
    # no longer corresponds to row i of the test set.
    return DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

def log_metrics(preds, labels):
    """Compute precision, recall and F1 from per-sample score tensors.

    Both inputs are sequences of tensors; each is stacked into one array and
    reduced to class indices with ``argmax`` along axis 1 before scoring.

    Args:
        preds: Sequence of per-sample prediction tensors.
        labels: Sequence of per-sample label tensors in the same layout.

    Returns:
        Dict with the keys ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    def _as_array(tensors):
        # Stack, then move to host memory and drop any autograd history.
        return torch.stack(tensors).cpu().detach().numpy()

    score_matrix = _as_array(preds)
    gold = _as_array(labels).argmax(axis=1)
    predicted = score_matrix.argmax(axis=1)

    precision = metrics.precision_score(gold, predicted)
    recall = metrics.recall_score(gold, predicted)
    f1 = metrics.f1_score(gold, predicted)
    return {"precision": precision, "recall": recall, "f1": f1}

In [70]:
def eval_fn(data_loader, model, device):
    """Run the model over a loader and collect sigmoid-activated outputs.

    The model is put in eval mode and gradients are disabled for the whole
    pass. No targets are read from the batches.

    Args:
        data_loader: Iterable of batches, each a dict with ``"ids"`` and
            ``"mask"`` tensors.
        model: Module called as ``model(ids=..., mask=...)``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A flat list with one tensor per sample (the rows of each batch output
        after ``torch.sigmoid``), in loader iteration order.
    """
    model.eval()
    scores = []
    with torch.no_grad():
        for _, batch in tqdm(enumerate(data_loader)):
            input_ids = batch["ids"].to(device, dtype=torch.long)
            attention_mask = batch["mask"].to(device, dtype=torch.long)

            logits = model(ids=input_ids, mask=attention_mask)
            # `extend` iterates over the first dimension, i.e. one row per sample.
            scores.extend(torch.sigmoid(logits))
    return scores

## Data loading

In [71]:
# Earlier variant, kept for reference: take the "test" split of the "liar"
# dataset and run it through preprocessing().
#dataset = load_dataset("liar")
#test_data = dataset["test"]
#test = preprocessing(pd.DataFrame(data=test_data))

# Tab-separated challenge test set stored on the mounted Drive.
data_path = "/content/drive/MyDrive/Challenge NLP/data/news test set.tsv"

# Downstream cells read the input text from the column named by TEXT.
test = pd.read_csv(data_path, sep="\t")


In [72]:
# Tokenizing dataset over the test frame, then a batched loader on top of it;
# sequence length and batch size both come from `config`.
test_dataset = build_test_dataset(config['tokenizer_max_len'], test)
test_dataloader = build_test_dataloader(test_dataset, config['batch_size'])


## Model load and prediction

In [78]:
model_path = "/content/drive/MyDrive/Challenge NLP/models/roberta-base-epoch_4-lr_0.0005-batch_34-drop_0.45-maxlen_128-f1_0.88163-2024-01-15_21-56-05.pt"

print(device)
# The checkpoint holds a state dict, so the architecture is always rebuilt
# first and the weights restored into it. The previous `device == "cpu"`
# branch compared a torch.device with a string and would have bound the raw
# checkpoint object to `model` instead of a module.
# `map_location=device` lets a checkpoint saved from a GPU run be restored on
# a CPU-only runtime as well.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
model = Classifier(n_labels, config['dropout'], bert_model=bert_model)
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)


# One score tensor per sample, converted to NumPy arrays on the host.
preds = eval_fn(test_dataloader, model, device)
preds = [el.cpu().detach().numpy() for el in preds]

preds_path = "/content/drive/MyDrive/Challenge NLP/data/final_preds.txt"

# One line per prediction, in dataloader iteration order. Index 0 is the
# 'true' class in `mapping`; ties fall through to "Fake".
with open(preds_path, "w+") as f:
    for el in preds:
        if el[0] > el[1]:
            f.write("Non-fake")
        else:
            f.write("Fake")

        f.write("\n")

print(preds)

cuda:0


1it [00:00,  6.68it/s]

[array([0.17982668, 0.81665355], dtype=float32), array([0.17982663, 0.81665355], dtype=float32), array([0.17982666, 0.81665355], dtype=float32), array([0.17982663, 0.81665355], dtype=float32), array([0.17982666, 0.81665355], dtype=float32), array([0.17982666, 0.81665355], dtype=float32), array([0.17982666, 0.81665355], dtype=float32), array([0.17982666, 0.81665355], dtype=float32), array([0.17982666, 0.81665355], dtype=float32), array([0.17982666, 0.81665355], dtype=float32)]



