In [1]:
import torch
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
from IPython.display import display
from transformers import BertTokenizer, BertModel
import os
from torch import nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.optim as optim
import nltk
from nltk.corpus import stopwords
import pickle
import scipy
os.environ['KMP_DUPLICATE_LIB_OK']='True'  # Required to make some modules work together with MacOS

In [None]:
train_body = pd.read_csv('tr_b.csv') # Loads datasets stored locally (these are just the datasets from the fake news challenge website)
train_stances = pd.read_csv('tr_s.csv')
test_body = pd.read_csv('te_b.csv')
test_stances = pd.read_csv('te_s.csv')

punctuation = ['"', "'", '.', '-', '!', '?', '#', '/', ':', ';', '(', ')', '*', '&', '@', '_', ',', '’', '”', '“', '—', '–', '[', ']']
stop_words = list(stopwords.words('english')) # List of punctuation & stop words to be removed

for i in range(len(train_body)):  # Removing punctuation & stop words from headlines & article bodies
    train_body['articleBody'][i] = ''.join(filter(lambda a: a not in punctuation, train_body['articleBody'][i]))
    train_body['articleBody'][i] = ' '.join(filter(lambda a: a.lower() not in stop_words, train_body['articleBody'][i].split()))
for i in range(len(test_body)):
    test_body['articleBody'][i] = ''.join(filter(lambda a: a not in punctuation, test_body['articleBody'][i]))
    test_body['articleBody'][i] = ' '.join(filter(lambda a: a.lower() not in stop_words, test_body['articleBody'][i].split()))
for i in range(len(train_body)):
    train_stances['Headline'][i] = ''.join(filter(lambda a: a not in punctuation, train_stances['Headline'][i]))
    train_stances['Headline'][i] = ' '.join(filter(lambda a: a.lower() not in stop_words, train_stances['Headline'][i].split()))
for i in range(len(train_body)):
    test_stances['Headline'][i] = ''.join(filter(lambda a: a not in punctuation, test_stances['Headline'][i]))
    test_stances['Headline'][i] = ' '.join(filter(lambda a: a.lower() not in stop_words, test_stances['Headline'][i].split()))

In [None]:
train_corpus = list(train_body['articleBody']) + list(train_stances['Headline']) # Train corpus to train TF-IDF vectoriser on. We don't use words from test set for obvious reasons.

tfIdfVectorizer = TfidfVectorizer()
tfIdfVectorizer.fit(train_corpus)

train_stances['Headline_TFIDF'] = train_stances.apply(lambda row: tfIdfVectorizer.transform([row.Headline]), axis=1) # Applying TF-IDF vectoriser to all our text data
train_body['Body_TFIDF'] = train_body.apply(lambda row: tfIdfVectorizer.transform([row.articleBody]), axis=1) # This returns our vectors in sparse matrix form, so they take up way less space
test_stances['Headline_TFIDF'] = test_stances.apply(lambda row: tfIdfVectorizer.transform([row.Headline]), axis=1)
test_body['Body_TFIDF'] = test_body.apply(lambda row: tfIdfVectorizer.transform([row.articleBody]), axis=1)

In [None]:
tokenizer = BertTokenizer.from_pretrained('./bert-base-uncased') # Installing bert tokenizer stored locally. Identical to model found at https://huggingface.co/bert-base-uncased

train_stances['Headline_Tokenized'] = train_stances.apply(lambda row: tokenizer.tokenize('[CLS] ' + row.Headline + ' [SEP]'), axis=1) # Tokenizing headlines. We don't limit to 512 tokens here, as no headlines are anywhere enar that long.
train_stances['Headline_Indexed'] = train_stances.apply(lambda row: tokenizer.convert_tokens_to_ids(row.Headline_Tokenized), axis=1) # Converting tokens to IDs.
train_body['Body_Tokenized'] = train_body.apply(lambda row: tokenizer.tokenize('[CLS] ' + row.articleBody)[:511] + ['[SEP]'], axis=1) # When tokenizing bodies, we tokenize full text before cutting off first 511 tokens and adding final [SEP] token.
train_body['Body_Indexed'] = train_body.apply(lambda row: tokenizer.convert_tokens_to_ids(row.Body_Tokenized), axis=1)
test_stances['Headline_Tokenized'] = test_stances.apply(lambda row: tokenizer.tokenize('[CLS] ' + row.Headline + ' [SEP]'), axis=1)
test_stances['Headline_Indexed'] = test_stances.apply(lambda row: tokenizer.convert_tokens_to_ids(row.Headline_Tokenized), axis=1)
test_body['Body_Tokenized'] = test_body.apply(lambda row: tokenizer.tokenize('[CLS] ' + row.articleBody)[:511] + ['[SEP]'], axis=1)
test_body['Body_Indexed'] = test_body.apply(lambda row: tokenizer.convert_tokens_to_ids(row.Body_Tokenized), axis=1)

In [None]:
train_body.to_pickle(os.getcwd() + '/train_body.pkl') # Pickling all our tokens so we don't have to do preprocessing steps every time while testing.
test_body.to_pickle(os.getcwd() + '/test_body.pkl') # Running this will produce 4 files totalling around 50MB in size.
train_stances.to_pickle(os.getcwd() + '/train_stances.pkl')
test_stances.to_pickle(os.getcwd() + '/test_stances.pkl')

In [2]:
train_body = pd.read_pickle('train_body.pkl') # Load all tokens from pickles. For resuming testing from this point without bothering with lengthy preprocessing.
train_stances = pd.read_pickle('train_stances.pkl')
test_body = pd.read_pickle('test_body.pkl')
test_stances = pd.read_pickle('test_stances.pkl')

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertModel.from_pretrained('./bert-base-uncased',
                                  output_hidden_states = True,
                                  ) # Again using BERT model from https://huggingface.co/bert-base-uncased for embeddings.

model.to(device)
model.eval()

def get_BERT_embedding(model, indexed_tokens):
    segments_ids = [1] * len(indexed_tokens) # Segment IDs denote which sentence each token is from, as this model can take 2 sentences as input. we treat entire input as 1 sentence for ease, however.
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    outputs = model(tokens_tensor, segments_tensors)
    return outputs[0] # First element of output is final layer from BERT model for each timestep. We will use this as our embedding. Some research shows using concatenation of last 4 layers can give better performance, but would rather keep final model simpler & faster to train and spend more time tuning hyperparameters.

# WARNING: Following piece of code saves all BERT embeddings as tensor files. This creates around 7GB of files and needs folders named train_stances, test_stances, test_body and train_body.

with torch.no_grad():
    i=0
    path = os.getcwd() + '/train_stances/'
    for i in range(len(train_stances)):
        get_BERT_embedding(model, train_stances['Headline_Indexed'][i], device, path + str(i) + '.pt') # We save each headline and body as a seperate tensor. This way we can avoid keeping entire dataset in memory when training model.
    i=0
    path = os.getcwd() + '/test_stances/'
    for i in range(len(test_stances)):
        get_BERT_embedding(model, test_stances['Headline_Indexed'][i], device, path + str(i) + '.pt')
    i=0
    path = os.getcwd() + '/test_body/'
    for i in range(len(test_body)):
        get_BERT_embedding(model, test_body['Body_Indexed'][i], device, path + str(i) + '.pt')
    i=0
    path = os.getcwd() + '/train_body/'
    for i in range(len(train_body)):
        get_BERT_embedding(model, train_body['Body_Indexed'][i], device, path + str(i) + '.pt')

In [None]:
tfidf_train_set = pd.DataFrame(columns=['Vector', 'Label']) # Creating dataframe to hold TF-IDF dataset in.
tfidf_test_set = pd.DataFrame(columns=['Vector', 'Label'])
label_dict = {'unrelated': 0, 'agree': 1, 'disagree': 1, 'discuss': 1}


# We will create the train & test sets by concatenating the vectors of the headline and body. We will store these as scipy sparse matrices to save on space.
tfidf_train_set['Vector'] = train_stances.apply(lambda row: scipy.sparse.csr_matrix(np.concatenate((row['Headline_TFIDF'].todense(), train_body['Body_TFIDF'][train_body.index[train_body['Body ID'] == row['Body ID']][0]].todense()), axis=1).A1), axis=1)
tfidf_test_set['Vector'] = test_stances.apply(lambda row: scipy.sparse.csr_matrix(np.concatenate((row['Headline_TFIDF'].todense(), test_body['Body_TFIDF'][test_body.index[test_body['Body ID'] == row['Body ID']][0]].todense()), axis=1).A1), axis=1)

# Storing labels of the headlines on each row.
tfidf_train_set['Label'] = train_stances.apply(lambda row: label_dict[row['Stance']], axis=1)
tfidf_test_set['Label'] = test_stances.apply(lambda row: label_dict[row['Stance']], axis=1)

# Converting to lists for use with logistic regression model.
tfidf_train_inputs = tfidf_train_set['Vector'].to_list()
train_labels = tfidf_train_set['Label'].to_list()
tfidf_test_inputs = tfidf_test_set['Vector'].to_list()
test_labels = tfidf_test_set['Label'].to_list()

In [None]:
train_body_id_list = train_stances['Body ID'].to_list() # This creates a list of the Body IDs associated with eachheadline, in order.
test_body_id_list = test_stances['Body ID'].to_list()

for i in range(len(train_body_id_list)): # We then use said list to find the index of the relevant bodies. rather than finding them by their IDs each time, we can just access the relevant row in the body dataframe which is faster.
    train_body_id_list[i] = train_body.index[train_body['Body ID'] == train_body_id_list[i]][0]
for i in range(len(test_body_id_list)):
    test_body_id_list[i] = test_body.index[test_body['Body ID'] == test_body_id_list[i]][0]

In [None]:
bert_train_inputs = []
bert_test_inputs = []
for i in range(len(os.listdir(os.getcwd() + '/train_stances'))):
    try: # We will concatenate the BERT headline & body embeddings fro logistic regression model.
        head_file = os.getcwd() + '/train_stances/' + str(i) + '.pt'
        body_file = os.getcwd() + '/train_body/' + str(train_body_id_list[i]) + '.pt' # Our body index list is useful here as the body files are saved by index rather than body ID.
        head = torch.load(head_file)[0][0] # We take only the embedding of the [CLS] token at the start of the input sequence. This token tends to store context information for the entire input.
        body = torch.load(body_file)[0][0] # If we used the entire concatenated input, our embeddings would all be of different lengths which is incompatible with the logistic regression model.
        bert_train_inputs.append(np.array(torch.cat((head, body))))
    except:
        continue
for i in range(len(os.listdir(os.getcwd() + '/test_stances'))):
    try:
        head_file = os.getcwd() + '/test_stances/' + str(i) + '.pt'
        body_file = os.getcwd() + '/test_body/' + str(test_body_id_list[i]) + '.pt'
        head = torch.load(head_file)[0][0]
        body = torch.load(body_file)[0][0]
        bert_test_inputs.append(np.array(torch.cat((head, body))))
    except:
        continue

In [None]:
pickle.dump(tfidf_train_inputs, open('./train_stuff/tfidf_train_inputs.pkl', 'wb')) # Saving inputs again to avoid preprocessing steps. Creates about 600MB of files. 
pickle.dump(tfidf_test_inputs, =open('./train_stuff/tfidf_test_inputs.pkl', 'wb'))
pickle.dump(train_labels, open('./train_stuff/train_labels.pkl', 'wb'))
pickle.dump(test_labels, open('./train_stuff/test_labels.pkl', 'wb'))
pickle.dump(train_body_id_list, open('./train_stuff/train_body_id_list.pkl', 'wb'))
pickle.dump(test_body_id_list, open('./train_stuff/test_body_id_list.pkl', 'wb'))
pickle.dump(bert_train_inputs, open('./train_stuff/bert_train_inputs.pkl', 'wb'))
pickle.dump(bert_test_inputs, open('./train_stuff/bert_test_inputs.pkl', 'wb'))

In [3]:
tfidf_train_inputs = pickle.load(open('./train_stuff/tfidf_train_inputs.pkl', 'rb'))
tfidf_test_inputs = pickle.load(open('./train_stuff/tfidf_test_inputs.pkl', 'rb'))
train_labels = pickle.load(open('./train_stuff/train_labels.pkl', 'rb'))
test_labels = pickle.load(open('./train_stuff/test_labels.pkl', 'rb'))
train_body_id_list = pickle.load(open('./train_stuff/train_body_id_list.pkl','rb'))
test_body_id_list = pickle.load(open('./train_stuff/test_body_id_list.pkl', 'rb'))
bert_train_inputs = pickle.load(open('./train_stuff/bert_train_inputs.pkl', 'rb'))
bert_test_inputs = pickle.load(open('./train_stuff/bert_test_inputs.pkl', 'rb'))
for i in range(len(tfidf_train_inputs)): # TF-IDF inputs need to be unpacked from sparse matrices to numpy arrays for logistic regression model. This requires >=8GB of memory.
    tfidf_train_inputs[i] = np.squeeze(np.asarray(tfidf_train_inputs[i].todense()))
for i in range(len(tfidf_test_inputs)):
    tfidf_test_inputs[i] = np.squeeze(np.asarray(tfidf_test_inputs[i].todense()))

In [5]:
# Using logistic regression model from sklearn. We tried scaling the inputs to a normal distribution first, but it didn't do much to improve performance and increased training time.
tfidf_classifier = LogisticRegression(solver='liblinear')
tfidf_classifier.fit(tfidf_train_inputs, train_labels)
tfidf_preds = tfidf_classifier.predict_proba(tfidf_test_inputs) # We return the probabilities rather than classes to run BCEloss later.

bert_classifier = LogisticRegression(solver='liblinear', max_iter=10000) # max_iter has to be higher here else we reach recursion limit.
bert_classifier.fit(bert_train_inputs, train_labels)
bert_preds = bert_classifier.predict_proba(bert_test_inputs)

In [6]:
tfidf_TP = 0 # To evaluate models, we will compare accuracy, precision, recall and Binary Cross-Entropy loss.
tfidf_FP = 0
tfidf_TN = 0
tfidf_FN = 0
bert_TP = 0
bert_FP = 0
bert_TN = 0
bert_FN = 0
for i in range(len(tfidf_preds)):
    if test_labels[i] == 1:
        if round(tfidf_preds[i][1]) == 1:
            tfidf_TP += 1
        else:
            tfidf_FN += 1
        if round(bert_preds[i][1]) == 1:
            bert_TP += 1
        else:
            bert_FN += 1
    else:
        if round(tfidf_preds[i][0]) == 1:
            tfidf_TN += 1
        else:
            tfidf_FP += 1
        if round(bert_preds[i][0]) == 1:
            bert_TN += 1
        else:
            bert_FP += 1
print('TF-IDF Accuracy: ' + str((tfidf_TP + tfidf_TN)/len(tfidf_preds)))
print('BERT Accuracy: ' + str((bert_TP + bert_TN)/len(tfidf_preds)))
print('TF-IDF Precision: ' + str(tfidf_TP/(tfidf_TP + tfidf_FP)))
print('BERT Precision: ' + str(bert_TP/(bert_TP + bert_FP)))
print('TF-IDF Recall: ' + str(tfidf_TP/(tfidf_TP + tfidf_FN)))
print('BERT Recall: ' + str(bert_TP/(bert_TP + bert_FN)))
tfidf_bce_loss = nn.BCELoss(reduction='mean')(torch.tensor(tfidf_preds)[:, 1], torch.tensor(test_labels, dtype=torch.double)).item()
bert_bce_loss = nn.BCELoss(reduction='mean')(torch.tensor(bert_preds)[:, 1], torch.tensor(test_labels, dtype=torch.double)).item()
print('TF-IDF BCE Loss: ' + str(tfidf_bce_loss))
print('BERT BCE Loss: ' + str(bert_bce_loss))

TF-IDF Accuracy: 0.67953409672215
BERT Accuracy: 0.6159445952858773
TF-IDF Precision: 0.3193979933110368
BERT Precision: 0.29674306393244876
TF-IDF Recall: 0.1351925254813137
BERT Recall: 0.2785956964892412
TF-IDF BCE Loss: 0.6420563919172942
BERT BCE Loss: 0.8968060107975179


In [7]:
class GRU(nn.Module):
    def __init__(self, bert, relevance, hidden_size=256, num_layers=2, dropout=0.1, tfidf_length=0):
        super(GRU, self).__init__()
        self.bert = bert # indicates whether we are using BERT or  TFIDF embeddings.
        self.relevance = relevance # indicates whether we are classifying related/unrelated (True) or agree/disagree/discuss (False).
        
        if bert: # Model for BERT embeddings trains two seperated 2-layered GRUs, one for headlines and one for bodies.
            self.headline_gru = nn.GRU(input_size=768, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout)
            self.body_gru = nn.GRU(input_size=768, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout)
            if relevance: # If we are tesing relevance, use a sigmoid function and output one probability of how likely headline is to be relevant.
                self.fc = nn.Sequential(nn.Linear(2*hidden_size, 2*hidden_size), # We concatenate final output from both GRUs, so need input size of 2*hidden_size.
                                        nn.ReLU(),
                                        nn.Linear(2*hidden_size, 2*hidden_size),
                                        nn.ReLU(),
                                        nn.Linear(2*hidden_size, 1),
                                        nn.Sigmoid())
            else: # If we are tesing agree/disagree/discuss, we use a softmax function to output a probability distribution over all 3 options.
                self.fc = nn.Sequential(nn.Linear(2*hidden_size, 2*hidden_size),
                                        nn.ReLU(),
                                        nn.Linear(2*hidden_size, 2*hidden_size),
                                        nn.ReLU(),
                                        nn.Linear(2*hidden_size, 3),
                                        nn.Softmax())
        else: # Only 1 GRU  for TF-IDF embeddings since we input headline then body in sequence.
            self.gru = nn.GRU(input_size=tfidf_length, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout)
            if relevance:
                self.fc = nn.Sequential(nn.Linear(hidden_size, hidden_size),
                                        nn.ReLU(),
                                        nn.Linear(hidden_size, hidden_size),
                                        nn.ReLU(),
                                        nn.Linear(hidden_size, 1),
                                        nn.Sigmoid())
            else:
                self.fc = nn.Sequential(nn.Linear(hidden_size, hidden_size),
                                        nn.ReLU(),
                                        nn.Linear(hidden_size, hidden_size),
                                        nn.ReLU(),
                                        nn.Linear(hidden_size, 3),
                                        nn.Softmax(dim=0))

    def forward(self, head, body):
        if self.bert:
            _, head = self.headline_gru(head) # We only care about final output, ignore tensor with first t-1 hidden states.
            _, body = self.body_gru(body)
            pred = self.fc(torch.cat((head[-1], body[-1]), dim=0)) # We concatenate final hidden states from the final GRU layer, hence head[-1] & body[-1]
        else:
            _, x = self.gru(head)
            pred = self.fc(x[-1])
        return pred

class Optimisation: # We use an optimisation class to handle aspects of training as it keeps things neater and more generalisable.
    def __init__(self, model, optimiser):
        self.model = model
        self.optimiser = optimiser
        self.bce = nn.BCELoss(reduction='sum') # Our loss function is just BCE loss as it is very effective for binary classification problems such as this.
    
    def loss_fn(self, x_pred, x_target):
        loss = self.bce(x_pred, x_target)
        return loss
    
    def train_step(self, head, body, target): # Training involves optimiser step
        self.optimiser.zero_grad()
        self.model.train()
        pred = self.model(head, body)
        loss = self.loss_fn(pred, target)
        loss.backward()
        self.optimiser.step()
        return loss.item(), pred
    
    def test_step(self, head, body, target): # Testing does not!
        self.model.eval()
        pred = self.model(head, body)
        loss = self.loss_fn(pred, target)
        return loss.item(), pred

class Dataset(torch.utils.data.Dataset): # Our dataset class is used to handle BERT embeddings. TF-IDF embeddings are just stored in a list, as they take up little memory in scipy.sparse format. 
    def __init__(self, train, stance_ids, labels, body_ids):
        if train: # Separate paths for training & testing mode.
            self.stance_path = os.getcwd() + '/train_stances/'
            self.body_path = os.getcwd() + '/train_body/'
        else:
            self.stance_path = os.getcwd() + '/test_stances/'
            self.body_path = os.getcwd() + '/test_body/'
        self.body_ids = body_ids
        self.stance_ids = stance_ids
        self.labels = labels
        
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index): # Return tuple of headline embedding, body embedding and label.
        s = torch.load(self.stance_path + str(self.stance_ids[index]) + '.pt')
        b = torch.load(self.body_path + str(self.body_ids[index]) + '.pt')
        l = self.labels[index]
        return s, b, l

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GRU(True, True, hidden_size=256, num_layers=2, dropout=0.02).to(device) # BERT model for relevance classifying.
optimiser = Optimisation(model, optim.Adam(model.parameters(), lr=0.00001)) # learning rate of 0.00001 and ADAM optimiser worked best after some brief testing.

# Train & test dataloaders.
train_dataset = Dataset(True, list(range(len(train_labels))), train_labels, train_body_id_list)
test_dataset = Dataset(False, list(range(len(test_labels))), test_labels, test_body_id_list)
train_dataloader = DataLoader(train_dataset, shuffle=True)
test_dataloader = DataLoader(test_dataset, shuffle=True)

In [None]:
for epoch in range(3):
    loss = 0
    i = 0
    for batch in train_dataloader: # Move data to device & feed into optimiser for train step.
        temp, _ = optimiser.train_step(batch[0].squeeze().to(device), batch[1].squeeze().to(device), batch[2].type(torch.float32).to(device))
        loss += temp
        i += 1
        if i % 100 == 0: # Print loss every 100 iterations.
            print('avg loss for iteration ' + str(i + epoch * len(train_dataloader)) + ': ' + str(loss/100))
            loss = 0
    test_loss = 0
    correct = 0
    for batch in test_dataloader: # Evaluate after every epoch
        temp, pred = optimiser.test_step(batch[0].squeeze().to(device), batch[1].squeeze().to(device), batch[2].type(torch.float32).to(device))
        test_loss += temp
        if int(batch[2].squeeze()) == round(pred.item()): # Compare label to predicted label.
            correct += 1
    print('BERT avg loss for epoch ' + str(epoch) + ': ' + str(test_loss/len(test_dataloader)))
    print('BERT correct prediction proportion for epoch ' + str(epoch) + ': ' + str(correct/len(test_dataloader)))
        

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tfidf_length = tfidf_train_inputs[0].shape[0]//2 # Finds tf-idf vector length for model parameters.
model = GRU(False, True, hidden_size=256, num_layers=2, dropout=0.02, tfidf_length=tfidf_length).to(device)
optimiser = Optimisation(model, optim.Adam(model.parameters(), lr=0.00001))

In [None]:
for epoch in range(1): # Works very similar to BERT training loop, except inputs are just taken from tfidf inputs.
    loss = 0
    for i in range(len(tfidf_train_inputs)): # Reshape inputs as they are still in concatenated head-body form from logistic regression inputs.
        temp, pred = optimiser.train_step(torch.reshape(torch.tensor(tfidf_train_inputs[i], dtype=torch.float32, device=device), (2, tfidf_length)), 0, torch.tensor([train_labels[i]], dtype=torch.float32, device=device))
        loss += temp
        if i % 100 == 0:
            print('avg loss for iteration ' + str(i + epoch * len(tfidf_train_inputs)) + ': ' + str(loss/100))
            loss = 0
    test_loss = 0
    correct = 0
    for i in range(len(tfidf_test_inputs)):
        temp, pred = optimiser.test_step(torch.reshape(torch.tensor(tfidf_test_inputs[i], dtype=torch.float32, device=device), (2, tfidf_length)), 0, torch.tensor([test_labels[i]], dtype=torch.float32, device=device))
        test_loss += temp
        if int(test_labels[i]) == round(pred.item()):
            correct += 1
    print('TFIDF avg loss for epoch ' + str(epoch) + ': ' + str(test_loss/len(tfidf_test_inputs)))
    print('TFIDF correct prediction proportion for epoch ' + str(epoch) + ': ' + str(correct/len(tfidf_test_inputs)))

In [None]:
label_dict = {'agree': 0, 'disagree': 1, 'discuss': 2} # Construction of datasets for aggree/disagree/discuss classification
classifier_train_stance_ids = []
classifier_train_labels = []
classifier_train_body_id_list = []
classifier_tfidf_train_inputs = []
classifier_test_stance_ids = []
classifier_test_labels = []
classifier_test_body_id_list = []
classifier_tfidf_test_inputs = []
for i in range(len(train_stances)): # Loop through all headlines looking for only relevant ones
    if train_stances['Stance'][i] in ['agree', 'disagree', 'discuss']:
        classifier_train_stance_ids.append(i) # Save the 'id' (this is just the index as this is how the pytorch files were named)
        classifier_train_labels.append([0, 0, 0]) # Prepared list for label
        classifier_train_labels[-1][label_dict[train_stances['Stance'][i]]] = 1 # Put 1 in correct list index
        classifier_train_body_id_list.append(train_body.index[train_body['Body ID'] == train_stances['Body ID'][i]][0]) # save 'id' of body (again index in dataframe)
        classifier_tfidf_train_inputs.append(tfidf_train_inputs[i]) # Append the tf-idf input, too
for i in range(len(test_stances)):
    if test_stances['Stance'][i] in ['agree', 'disagree', 'discuss']:
        classifier_test_stance_ids.append(i)
        classifier_test_labels.append([0, 0, 0])
        classifier_test_labels[-1][label_dict[test_stances['Stance'][i]]] = 1
        classifier_test_body_id_list.append(test_body.index[test_body['Body ID'] == test_stances['Body ID'][i]][0])
        classifier_tfidf_test_inputs.append(tfidf_test_inputs[i])
        

In [None]:
pickle.dump(classifier_train_stance_ids, open('./train_stuff/classifier_train_stance_ids.pkl', 'wb')) # Once again, saving them to save on preprocessing time.
pickle.dump(classifier_train_labels, open('./train_stuff/classifier_train_labels.pkl', 'wb')) # Creates ~50MB of files.
pickle.dump(classifier_train_body_id_list, open('./train_stuff/classifier_train_body_id_list.pkl', 'wb'))
pickle.dump(classifier_test_stance_ids, open('./train_stuff/classifier_test_stance_ids.pkl', 'wb'))
pickle.dump(classifier_test_labels, open('./train_stuff/classifier_test_labels.pkl', 'wb'))
pickle.dump(classifier_test_body_id_list, open('./train_stuff/classifier_test_body_id_list.pkl', 'wb'))
pickle.dump(classifier_tfidf_test_inputs, open('./train_stuff/classifier_tfidf_test_inputs.pkl', 'wb'))
pickle.dump(classifier_tfidf_train_inputs, open('./train_stuff/classifier_tfidf_train_inputs.pkl', 'wb'))

In [8]:
classifier_train_stance_ids = pickle.load(open('./train_stuff/classifier_train_stance_ids.pkl', 'rb'))
classifier_train_labels = pickle.load(open('./train_stuff/classifier_train_labels.pkl', 'rb'))
classifier_train_body_id_list = pickle.load(open('./train_stuff/classifier_train_body_id_list.pkl', 'rb'))
classifier_test_stance_ids = pickle.load(open('./train_stuff/classifier_test_stance_ids.pkl', 'rb'))
classifier_test_labels = pickle.load(open('./train_stuff/classifier_test_labels.pkl', 'rb'))
classifier_test_body_id_list = pickle.load(open('./train_stuff/classifier_test_body_id_list.pkl', 'rb'))
classifier_tfidf_test_inputs = pickle.load(open('./train_stuff/classifier_tfidf_test_inputs.pkl', 'rb'))
classifier_tfidf_train_inputs = pickle.load(open('./train_stuff/classifier_tfidf_train_inputs.pkl', 'rb'))

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GRU(True, False).to(device) # GRU for agree/disagree/discuss with BERT inputs
optimiser = Optimisation(model, optim.Adam(model.parameters(), lr=0.00001))

train_dataset = Dataset(True, classifier_train_stance_ids, classifier_train_labels, classifier_train_body_id_list)
test_dataset = Dataset(False, classifier_test_stance_ids, classifier_test_labels, classifier_test_body_id_list)
train_dataloader = DataLoader(train_dataset, shuffle=True)
test_dataloader = DataLoader(test_dataset, shuffle=True)

In [None]:
for epoch in range(3): # Mainly same as before.
    loss = 0
    i = 0
    for batch in train_dataloader:
        temp, pred = optimiser.train_step(batch[0].squeeze().to(device), batch[1].squeeze().to(device), torch.tensor(batch[2], device=device, dtype=torch.float32))
        loss += temp
        i += 1
        if i % 100 == 0:
            print('avg loss for iteration ' + str(i + epoch * len(train_dataloader)) + ': ' + str(loss/100))
            loss = 0
    test_loss = 0
    correct = 0
    for batch in test_dataloader:
        temp, pred = optimiser.test_step(batch[0].squeeze().to(device), batch[1].squeeze().to(device), torch.tensor(batch[2], device=device, dtype=torch.float32))
        test_loss += temp # To check if correct, we find indexes of highest values in label and prediction vectors. These will be equal if prediction is correct.
        if batch[2].index(max(batch[2])) == pred.tolist().index(max(pred.tolist())):
            correct += 1
    print('BERT avg loss for epoch ' + str(epoch) + ': ' + str(test_loss/len(test_dataloader)))
    print('BERT correct prediction proportion for epoch ' + str(epoch) + ': ' + str(correct/len(test_dataloader)))

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

classifier_tfidf_length = classifier_tfidf_train_inputs[0].shape[0]//2
model = GRU(False, False, tfidf_length=classifier_tfidf_length).to(device) # GRU for agree/disagree/discuss classification on TF-IDF inputs.
optimiser = Optimisation(model, optim.Adam(model.parameters(), lr=0.00001))

In [None]:
for epoch in range(3):
    loss = 0
    for i in range(len(classifier_tfidf_train_inputs)):
        temp, pred = optimiser.train_step(torch.reshape(torch.tensor(classifier_tfidf_train_inputs[i], dtype=torch.float32, device=device), (2, classifier_tfidf_length)), 0, torch.tensor(classifier_train_labels[i], dtype=torch.float32, device=device))
        loss += temp
        if i % 100 == 0:
            print('avg loss for iteration ' + str(i + epoch * len(classifier_tfidf_train_inputs)) + ': ' + str(loss/100))
            loss = 0
    test_loss = 0
    correct = 0
    for i in range(len(classifier_tfidf_test_inputs)):
        temp, pred = optimiser.test_step(torch.reshape(torch.tensor(classifier_tfidf_test_inputs[i], dtype=torch.float32, device=device), (2, classifier_tfidf_length)), 0, torch.tensor(classifier_test_labels[i], dtype=torch.float32, device=device))
        test_loss += temp
        if classifier_test_labels[i].index(max(classifier_test_labels[i])) == pred.tolist().index(max(pred.tolist())):
            correct += 1
    print('TFIDF avg loss for epoch ' + str(epoch) + ': ' + str(test_loss/len(classifier_tfidf_test_inputs)))
    print('TFIDF correct prediction proportion for epoch ' + str(epoch) + ': ' + str(correct/len(classifier_tfidf_test_inputs)))

In [15]:
# Measuring accuracy, precision, recall & F1-score of best TF-IDF model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tfidf_length = tfidf_train_inputs[0].shape[0]//2
model = GRU(False, True, hidden_size=256, num_layers=2, dropout=0.1, tfidf_length=tfidf_length).to(device)
model.load_state_dict(torch.load('./Model_Checkpoints/0.60 0.726 TFIDF.pt', map_location=device)['model_state_dict'])
model.eval()

true_pos = 0
true_neg = 0
false_pos = 0
false_neg = 0
for i in range(len(tfidf_test_inputs)):
    pred = model(torch.reshape(torch.tensor(tfidf_test_inputs[i], dtype=torch.float32, device=device), (2, tfidf_length)), 0)
    if test_labels[i] == 1:
        if round(pred.item()) == 1:
            true_pos += 1
        else:
            false_neg += 1
    else:
        if round(pred.item()) == 1:
            false_pos += 1
        else:
            true_neg += 1
print('Accuracy: ' + str(((true_pos + true_neg)/i)))
precision = true_pos/(true_pos + false_pos)
recall = true_pos/(true_pos+false_neg)
print('Precision: ' + str(precision))
print('Recall: ' + str(recall))
print('F1-Score: ' + str(2*precision*recall/(precision+recall)))

Accuracy: 0.7260349441208878
Precision: 0.5458673932788374
Recall: 0.08507927519818799
F1-Score: 0.1472137170851194


In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Evaluating performance of best models & doing start-to-finish evaluation.

model = GRU(True, True, hidden_size=256, num_layers=2, dropout=0.1).to(device)
model.load_state_dict(torch.load('./Model_Checkpoints/0.58 0.734 BERT.pt', map_location=device)['model_state_dict'])
model.eval()
test_dataset = Dataset(False, list(range(len(test_labels))), test_labels, test_body_id_list)
test_dataloader = DataLoader(test_dataset, shuffle=False)

In [18]:
relevant_ids = [] # Measuring accuracy, precision, recall & f1 score and producing list of predicted relevant IDs for stance classification
true_pos = 0
false_pos = 0
true_neg = 0
false_neg = 0
correct = 0
i = 0
true = []
predicted = []
for batch in test_dataloader:
    pred = model(batch[0].squeeze().to(device), batch[1].squeeze().to(device))
    true.append(3 if batch[2].item() == 0 else -1)
    predicted.append(3 if round(pred.item()) == 0 else -1)
    if round(pred.item()) == 1:
        relevant_ids.append(i)
    if batch[2].item() == 1:
        if round(pred.item()) == 1:
            true_pos += 1
        else:
            false_neg += 1
    else:
        if round(pred.item()) == 1:
            false_pos += 1
        else:
            true_neg += 1
            correct += 1 # This is only correct one here, the 'relevant' ones aren't done with classification yet
    i += 1
print('Accuracy: ' + str(((true_pos + true_neg)/i)))
precision = true_pos/(true_pos + false_pos)
recall = true_pos/(true_pos+false_neg)
print('Precision: ' + str(precision))
print('Recall: ' + str(recall))
print('F1-Score: ' + str(2*precision*recall/(precision+recall)))

Accuracy: 0.7349388108448431
Precision: 0.5243323442136498
Recall: 0.5002831257078143
F1-Score: 0.5120254998551145


In [19]:
label_dict = {'agree': 0, 'disagree': 1, 'discuss': 2, 'unrelated': 3}
relevant_body_ids = []
relevant_labels = []
for i in relevant_ids:
    relevant_labels.append([0, 0, 0, 0])
    relevant_labels[-1][label_dict[test_stances['Stance'][i]]] = 1
    relevant_body_ids.append(test_body.index[test_body['Body ID'] == test_stances['Body ID'][i]][0])
for i in range(len(true)):
    if true[i] == -1:
        true[i] = label_dict[test_stances['Stance'][i]]

In [20]:
model = GRU(True, False, hidden_size=128, num_layers=2, dropout=0.1).to(device)
model.load_state_dict(torch.load('./Model_Checkpoints/1.41 0.655 CLASSIFIER_BERT.pt')['model_state_dict'])
model.eval()
test_dataset = Dataset(False, relevant_ids, relevant_labels, relevant_body_ids)
test_dataloader = DataLoader(test_dataset, shuffle=False)

In [21]:
from sklearn.metrics import classification_report # Much easier than doing 4 precision/recall calculations by hand...
i=0
for batch in test_dataloader:
        pred = model(batch[0].squeeze().to(device), batch[1].squeeze().to(device))
        true[relevant_ids[i]] = relevant_labels[i].index(max(relevant_labels[i]))
        predicted[relevant_ids[i]] = pred.tolist().index(max(pred.tolist()))
        if relevant_labels[i].index(max(relevant_labels[i])) == pred.tolist().index(max(pred.tolist())):
            correct += 1
        i += 1
print(correct/len(test_stances))
report = classification_report(true, predicted, target_names=['agree', 'disagree', 'discuss', 'unrelated'])
print(report)

  input = module(input)


0.6937787746428993
              precision    recall  f1-score   support

       agree       0.21      0.26      0.23      1903
    disagree       0.00      0.00      0.00       697
     discuss       0.45      0.45      0.45      4464
   unrelated       0.81      0.83      0.82     18349

    accuracy                           0.69     25413
   macro avg       0.37      0.38      0.37     25413
weighted avg       0.68      0.69      0.69     25413



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
