In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, BertModel

In [3]:
# Load the `*_stat_feat_df.csv` tables for the train and dev splits in one
# statement, then print the dev table's schema as a sanity check.
train_stat, dev_stat = (
    pd.read_csv('./data/train_stat_feat_df.csv'),
    pd.read_csv('./data/dev_stat_feat_df.csv'),
)
dev_stat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   tweet_id                  522 non-null    int64  
 1   reply_reply_count         522 non-null    float64
 2   reply_like_count          522 non-null    float64
 3   reply_retweet_count       522 non-null    float64
 4   reply_quote_count         522 non-null    float64
 5   reply_possibly_sensitive  522 non-null    float64
 6   reply_has_url             522 non-null    float64
 7   reply_mentioned_url_num   522 non-null    float64
 8   reply_id_num              522 non-null    float64
 9   reply_isweekday           522 non-null    float64
 10  reply_senti_score         522 non-null    float64
 11  reply_count               522 non-null    float64
 12  like_count                522 non-null    float64
 13  retweet_count             522 non-null    float64
 14  quote_coun

In [4]:
# Load the `*_tweet_df.csv` tables for the train and dev splits in one
# statement, then print the dev table's schema as a sanity check.
train_tweet, dev_tweet = (
    pd.read_csv('./data/train_tweet_df.csv'),
    pd.read_csv('./data/dev_tweet_df.csv'),
)
dev_tweet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_id    522 non-null    int64 
 1   text        522 non-null    object
 2   created_at  522 non-null    object
 3   user_id     522 non-null    int64 
 4   tweet_id.1  522 non-null    int64 
 5   label       522 non-null    int64 
 6   reply       522 non-null    object
 7   reply_text  518 non-null    object
dtypes: int64(4), object(4)
memory usage: 32.8+ KB


In [7]:
class TweetDataset(Dataset):
    """Tweet/reply pairs encoded as fixed-length BERT inputs.

    Every item is the token sequence ``[CLS] text [SEP] reply_text [SEP]``,
    truncated or padded with ``[PAD]`` to exactly ``maxlen`` tokens.

    Args:
        data_type: Split prefix used to build the CSV paths
            ``./data/{data_type}_tweet_df.csv`` and
            ``./data/{data_type}_stat_feat_df.csv``.
        maxlen: Number of tokens in every returned sequence.
        tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained``.
    """

    def __init__(self, data_type, maxlen, tokenizer_name='bert-base-cased'):
        self.maxlen = maxlen
        # read pre-processed data
        self.tweet_df = pd.read_csv(f'./data/{data_type}_tweet_df.csv', usecols=['text', 'reply_text', 'label'])
        # NOTE(review): loaded but not returned by __getitem__, so the loaders
        # never yield these features. Wiring them in requires confirming that
        # the rows line up with tweet_df -- TODO confirm.
        self.statistic_df = pd.read_csv(f'./data/{data_type}_stat_feat_df.csv')
        # Missing texts become empty strings so they can be tokenized.
        self.tweet_df['text'] = self.tweet_df['text'].fillna('')
        self.tweet_df['reply_text'] = self.tweet_df['reply_text'].fillna('')
        # define tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def __len__(self):
        """Return the number of rows in the tweet table."""
        return len(self.tweet_df)

    def __getitem__(self, index):
        """Encode the row at position ``index``.

        Returns:
            A tuple ``(token_ids, attn_mask, label)`` where ``token_ids`` and
            ``attn_mask`` are 1-D ``torch.long`` tensors of length ``maxlen``
            (mask is 1 for real tokens, 0 for padding) and ``label`` is an int.
        """
        # Positional lookup for both the texts and the label.
        row = self.tweet_df.iloc[index]
        label = int(row['label'])

        # Tokenize each segment separately and assemble the pair explicitly;
        # concatenating lists with raw strings raised a TypeError before.
        tokens = (
            ['[CLS]']
            + self.tokenizer.tokenize(str(row['text']))
            + ['[SEP]']
            + self.tokenizer.tokenize(str(row['reply_text']))
            + ['[SEP]']
        )

        # Pruning: keep the sequence terminated by [SEP] when it is too long.
        if len(tokens) > self.maxlen:
            tokens = tokens[:self.maxlen - 1] + ['[SEP]']

        # Padding: remember how many real tokens there are so the mask does
        # not depend on comparing token text against '[PAD]'.
        n_real = len(tokens)
        n_pad = self.maxlen - n_real
        tokens = tokens + ['[PAD]'] * n_pad

        # Indices of the tokens in the tokenizer vocabulary, as a tensor.
        tokens_ids_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))

        # Return the mask as a tensor: a plain list would be collated into a
        # list of per-position tensors instead of one [B, T] tensor.
        attn_mask = torch.tensor([1] * n_real + [0] * n_pad, dtype=torch.long)
        return tokens_ids_tensor, attn_mask, label

In [8]:
# Shared loader settings.
MAX_LEN = 200     # tokens per encoded tweet/reply pair
BATCH_SIZE = 20

# Training batches are reshuffled every epoch; the incomplete final batch is dropped.
train_loader = DataLoader(TweetDataset('train', MAX_LEN), shuffle=True, batch_size=BATCH_SIZE, drop_last=True)
# Evaluate on the whole dev split in a fixed order. With shuffle=True and
# drop_last=True a random 2 of the 522 dev rows (522 % 20) were skipped on
# every evaluation pass, so dev metrics were not comparable between epochs.
dev_loader = DataLoader(TweetDataset('dev', MAX_LEN), shuffle=False, batch_size=BATCH_SIZE, drop_last=False)

Downloading: 100%|██████████| 208k/208k [00:02<00:00, 85.7kB/s] 
Downloading: 100%|██████████| 426k/426k [00:03<00:00, 109kB/s]  


In [None]:
class RumorClassifier(nn.Module):
    """BERT encoder plus a feed-forward head over ``[CLS]`` and statistical features.

    Args:
        bert_name: Checkpoint passed to ``BertModel.from_pretrained``. Defaults
            to ``'bert-base-cased'`` so the embedding vocabulary matches the
            ``'bert-base-cased'`` tokenizer used by ``TweetDataset``; the
            previous ``'bert-base-uncased'`` checkpoint has a different vocabulary.
        n_stat_feats: Width of the ``stats`` vector concatenated to the
            ``[CLS]`` representation. The default 23 reproduces the original
            input width of 791 for a 768-dimensional encoder.
    """

    def __init__(self, bert_name='bert-base-cased', n_stat_feats=23):
        super().__init__()
        # Instantiating BERT model object
        self.bert_layer = BertModel.from_pretrained(bert_name)
        self.n_stat_feats = n_stat_feats

        # Derive the head's input width from the encoder instead of hard-coding 791.
        in_features = self.bert_layer.config.hidden_size + n_stat_feats
        self.ffnn = nn.Sequential(
            nn.Linear(in_features, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1),
            nn.Sigmoid(),
        )

    def forward(self, seq, attn_masks, seg=None, stats=None):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contribution of PAD tokens
            -seg : Optional segment ids, forwarded to BERT as ``token_type_ids``
                   (previously accepted but ignored); None lets BERT use its default
            -stats : Optional tensor of shape [B, n_stat_feats]; when None a
                     zero vector is used, so ``net(seq, attn_masks)`` runs on text alone

        Returns:
            Tensor of shape [B, 1]. The head ends in a Sigmoid, so despite the
            variable name these are probabilities, not raw logits.
        '''

        # Feeding the input to BERT model to obtain contextualized representations
        outputs = self.bert_layer(seq, attention_mask=attn_masks, token_type_ids=seg, return_dict=True)

        # Obtaining the representation of [CLS] head (the first token)
        cls_rep = outputs.last_hidden_state[:, 0]

        if stats is None:
            # Text-only fallback: same dtype/device as cls_rep, all zeros.
            stats = cls_rep.new_zeros(cls_rep.size(0), self.n_stat_feats)
        else:
            # Features read from a DataFrame are typically float64; cast so the
            # concatenation is not promoted past the Linear layers' dtype.
            stats = stats.to(cls_rep.dtype)

        # Feeding cls_rep together with the statistical features to the classifier head
        logits = self.ffnn(torch.cat((cls_rep, stats), dim=1))

        return logits

In [None]:
def get_accuracy_from_logits(logits, labels):
    """Return the fraction of predictions that match ``labels``.

    The values are thresholded at 0.5 directly (no sigmoid is applied here),
    so ``logits`` is expected to already hold probabilities.

    Both inputs are flattened before comparison. The earlier
    ``unsqueeze``/``squeeze`` round trip left labels untouched, so labels of
    shape [B, 1] were broadcast against [B] predictions into a [B, B]
    comparison and produced a wrong accuracy.

    Args:
        logits: Tensor with one score per example, e.g. shape [B, 1] or [B].
        labels: Tensor with one 0/1 label per example, e.g. shape [B] or [B, 1].

    Returns:
        0-dim float tensor with the mean accuracy over the batch.
    """
    preds = (logits.reshape(-1) > 0.5).long()
    acc = (preds == labels.reshape(-1)).float().mean()
    return acc

def evaluate(net, criterion, dataloader, device):
    """Compute mean per-batch accuracy and loss of ``net`` over ``dataloader``.

    Puts ``net`` in eval mode and runs without gradient tracking.

    Args:
        net: Model called as ``net(seq, attn_masks)``.
        criterion: Loss called as ``criterion(logits.squeeze(-1), labels.float())``.
        dataloader: Iterable of ``(seq, attn_masks, labels)`` batches. A plain
            ``DataLoader`` is wrapped in a ParallelLoader for ``device``; any
            other iterable is consumed as-is.
        device: Device handed to the ParallelLoader.

    Returns:
        ``(mean_acc, mean_loss)``, each the average of the per-batch values.
        ``mean_acc`` has whatever type ``get_accuracy_from_logits`` returns
        divided by the batch count; ``mean_loss`` is a Python float.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    net.eval()

    # [FOR TPU] Using ParallelLoader. Only wrap plain DataLoaders: train()
    # may pass a loader that is already per-device, and wrapping it again
    # would stack a second ParallelLoader on top of it.
    # NOTE(review): `pl` is presumably torch_xla.distributed.parallel_loader,
    # imported in a cell not shown here -- confirm.
    if isinstance(dataloader, DataLoader):
        dataloader = pl.ParallelLoader(dataloader, [device]).per_device_loader(device)

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            #[FOR GPU] Converting these to cuda tensors
            #seq, attn_masks, labels = seq.cuda(device), attn_masks.cuda(device), labels.cuda(device)
            logits = net(seq, attn_masks)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            mean_acc += get_accuracy_from_logits(logits, labels)
            count += 1

    # Fail with a clear message instead of a bare ZeroDivisionError.
    if count == 0:
        raise ValueError("evaluate() received a dataloader that produced no batches")

    return mean_acc / count, mean_loss / count

In [None]:
import time

def train(net, criterion, opti, train_loader, dev_loader, max_eps, device):
    """Train ``net`` for ``max_eps`` epochs, checkpointing on dev-accuracy gains.

    Each epoch iterates ``train_loader`` through a fresh ParallelLoader,
    evaluates on ``dev_loader`` and, when the development accuracy beats the
    best seen so far, saves ``net.state_dict()`` to ``sstcls_{epoch}.dat``.

    Args:
        net: Model called as ``net(seq, attn_masks)``.
        criterion: Loss called as ``criterion(logits.squeeze(-1), labels.float())``.
        opti: Optimizer stepped through ``xm.optimizer_step``.
        train_loader: Loader yielding ``(seq, attn_masks, labels)`` batches.
        dev_loader: Loader passed unwrapped to ``evaluate``.
        max_eps: Number of epochs to run.
        device: Device used for the ParallelLoader and for evaluation.

    NOTE(review): `pl` and `xm` are presumably
    torch_xla.distributed.parallel_loader and torch_xla.core.xla_model,
    imported in a cell not shown here -- confirm.
    """

    best_acc = 0
    st = time.time()
    for ep in range(max_eps):

        net.train()
        # [FOR TPU] Using ParallelLoader on the loader that was passed in.
        # The earlier code rebuilt a DataLoader/TweetDataset here every epoch,
        # which ignored the `train_loader` argument, re-read the CSVs and
        # reloaded the tokenizer each time, and never wrapped the batches for
        # the device.
        para_train_loader = pl.ParallelLoader(train_loader, [device]).per_device_loader(device)

        for it, (seq, attn_masks, labels) in enumerate(para_train_loader):
            #Clear gradients
            opti.zero_grad()

            #[FOR GPU] Converting these to cuda tensors
            #seq, attn_masks, labels = seq.cuda(device), attn_masks.cuda(device), labels.cuda(device)

            #Obtaining the logits from the model
            logits = net(seq, attn_masks)

            #Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #[FOR GPU] Optimization step
            #opti.step()

            #[FOR TPU] Optimization step
            xm.optimizer_step(opti)

            if it % 10 == 0:
                #Please remove [xla:{}] and xm.get_ordinal() if you want to run with GPU
                acc = get_accuracy_from_logits(logits, labels)
                print("[xla:{}] Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(xm.get_ordinal(), it, ep, loss.item(), acc, (time.time()-st)))
                st = time.time()

        # evaluate() wraps the loader itself, so pass the raw dev loader;
        # wrapping it here as well stacked two ParallelLoaders.
        dev_acc, dev_loss = evaluate(net, criterion, dev_loader, device)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep, dev_acc, dev_loss))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format( best_acc, dev_acc))
            best_acc = dev_acc
            torch.save(net.state_dict(), 'sstcls_{}.dat'.format(ep))