<a href="https://colab.research.google.com/github/HandanYU/Rumour-detection/blob/handan/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizerFast, DistilBertTokenizerFast

In [195]:
class TweetDataset(Dataset):
    """Dataset of source tweets, their replies, labels and statistical features.

    Two pre-processed CSV files are read for the given split:

    * ``{data_type}_tweet_df.csv`` -- columns ``text``, ``reply_text``, ``label``
    * ``{data_type}_stat_feat_df.csv`` -- one row of numeric features per tweet,
      aligned by row position with the tweet file.

    Args:
        data_type: File-name prefix of the split to load (e.g. ``'train'``).
        max_seq_len: Length every tokenised sequence is padded/truncated to.

    Raises:
        ValueError: If the two CSV files do not have the same number of rows.
    """

    def __init__(self, data_type, max_seq_len):
        self.max_seq_len = max_seq_len
        # read pre-processed data
        self.tweet_df = pd.read_csv(f'{data_type}_tweet_df.csv', usecols=['text', 'reply_text', 'label'])
        self.statistic_df = pd.read_csv(f'{data_type}_stat_feat_df.csv')
        if len(self.tweet_df) != len(self.statistic_df):
            # Rows are paired purely by position in __getitem__, so a length
            # mismatch would silently pair tweets with the wrong features.
            raise ValueError(
                f'{data_type}_tweet_df.csv has {len(self.tweet_df)} rows but '
                f'{data_type}_stat_feat_df.csv has {len(self.statistic_df)}'
            )
        # Missing text is tokenised as an empty string rather than as NaN.
        self.tweet_df['text'] = self.tweet_df['text'].replace(np.nan, '')
        self.tweet_df['reply_text'] = self.tweet_df['reply_text'].replace(np.nan, '')
        # Use the tokenizer class that belongs to the 'bert-base-uncased'
        # checkpoint (and to the BertModel encoder used in this file) instead
        # of the DistilBERT tokenizer class.
        self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

    def __len__(self):
        """Return the number of tweets in the split."""
        return len(self.tweet_df)

    def __getitem__(self, idx):
        """Return the model inputs for the tweet at row position ``idx``.

        Returns:
            A 6-tuple ``(source_ids, source_mask, pair_ids, pair_mask, label,
            stat_features)``. The first four are 1-D tensors built from the
            tokenizer's ``input_ids`` / ``attention_mask``; ``label`` is the raw
            value of the ``label`` column; ``stat_features`` is a float32 tensor
            holding the matching row of the statistics file.
        """
        row = self.tweet_df.iloc[idx]
        text, reply_text = row['text'], row['reply_text']

        # Source tweet on its own.
        source_enc = self.tokenizer(text, truncation=True, padding='max_length',
                                    max_length=self.max_seq_len)
        source_token = torch.tensor(source_enc['input_ids'])
        source_mask = torch.tensor(source_enc['attention_mask'])

        # Source tweet paired with its replies; only the replies are truncated.
        # NOTE(review): 'only_second' cannot shorten the source text -- confirm
        # that no source tweet alone exceeds max_seq_len tokens.
        pair_enc = self.tokenizer(text, reply_text, truncation='only_second',
                                  padding='max_length', max_length=self.max_seq_len)
        pair_tokens_tensor = torch.tensor(pair_enc['input_ids'])
        pair_mask_tensor = torch.tensor(pair_enc['attention_mask'])

        # Convert explicitly through NumPy to float32 so the features can be
        # concatenated with float32 network outputs, rather than handing a
        # label-indexed pandas Series straight to torch.tensor.
        stat_feat = torch.tensor(self.statistic_df.iloc[idx].to_numpy(dtype=np.float32))

        return source_token, source_mask, pair_tokens_tensor, pair_mask_tensor, row['label'], stat_feat

In [None]:
# Settings shared by both splits, so the sequence length and batch size
# cannot drift apart between training and evaluation.
MAX_SEQ_LEN = 200
BATCH_SIZE = 20

# Training batches are reshuffled every epoch and the trailing partial batch
# is dropped so every optimisation step sees a full batch.
train_loader = DataLoader(TweetDataset('train', MAX_SEQ_LEN), shuffle=True, batch_size=BATCH_SIZE, drop_last=True)
# Evaluation must score every dev example: no shuffling is needed, and
# dropping the last partial batch would silently exclude up to
# BATCH_SIZE - 1 examples from the reported metrics.
dev_loader = DataLoader(TweetDataset('dev', MAX_SEQ_LEN), shuffle=False, batch_size=BATCH_SIZE, drop_last=False)

In [194]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer
import time
class TweetEmbedding(nn.Module):
    """Encode a (source, source+replies) pair into a 64-dimensional vector.

    Both inputs go through the same pretrained BERT encoder; the hidden state
    at position 0 of each is concatenated per sample and passed through a
    small feed-forward network ending in a softmax.

    Args:
        max_seq_len: Padded sequence length of the inputs. Kept for interface
            compatibility; the layer sizes depend on the encoder's hidden
            size, not on the sequence length.
    """

    def __init__(self, max_seq_len):
        super().__init__()
        self.max_seq_len = max_seq_len
        self.embedding_layer = BertModel.from_pretrained('bert-base-uncased')
        # The feed-forward input is two position-0 hidden states side by side,
        # so its width is 2 * hidden_size (not 2 * max_seq_len).
        hidden_size = self.embedding_layer.config.hidden_size
        self.ffnn = nn.Sequential(nn.Linear(2 * hidden_size, 128),
                                  nn.Linear(128, 64),
                                  # Explicit dim: normalise over the feature axis.
                                  nn.Softmax(dim=-1))

    def forward(self, pair_seq_tokens, pair_attn_mask, source_tokens, source_attn_mask):
        """Return the fused embedding for a batch.

        Args:
            pair_seq_tokens: Token ids of the source+reply pair.
            pair_attn_mask: Attention mask for ``pair_seq_tokens``.
            source_tokens: Token ids of the source tweet alone.
            source_attn_mask: Attention mask for ``source_tokens``.

        Returns:
            Tensor with one 64-dimensional row per sample in the batch.
        """
        pair_out = self.embedding_layer(
            input_ids=pair_seq_tokens, attention_mask=pair_attn_mask
        ).last_hidden_state[:, 0]
        source_out = self.embedding_layer(
            input_ids=source_tokens, attention_mask=source_attn_mask
        ).last_hidden_state[:, 0]
        # Concatenate along the feature axis so each sample keeps its own row;
        # the default dim=0 would stack the two batches on top of each other.
        fused = torch.cat([pair_out, source_out], dim=1)
        return self.ffnn(fused)



In [None]:
def get_accuracy_from_logits(logits, labels):
    """Return the binary classification accuracy of ``logits`` against ``labels``.

    Both tensors are flattened before comparison, so ``(B,)`` and ``(B, 1)``
    shapes may be mixed freely. Without flattening, comparing ``(B,)``
    predictions with ``(B, 1)`` labels broadcasts to a ``(B, B)`` matrix and
    yields a wrong accuracy.

    Args:
        logits: Raw (pre-sigmoid) scores, one per sample.
        labels: Ground-truth 0/1 labels, one per sample.

    Returns:
        0-dim float tensor holding the fraction of correct predictions.
    """
    probs = torch.sigmoid(logits.reshape(-1))
    # A sample is predicted positive when its probability exceeds 0.5.
    hard_preds = (probs > 0.5).long()
    return (hard_preds == labels.reshape(-1)).float().mean()

def evaluate(net, classifier, criterion, dataloader, gpu):
    """Compute mean per-batch accuracy and loss over ``dataloader``.

    Args:
        net: Text encoder called as ``net(pair_tokens, pair_mask,
            source_tokens, source_mask)`` and returning one feature row per
            sample (the parameter order of ``TweetEmbedding.forward``).
        classifier: Callable mapping the concatenated ``[text features,
            statistical features]`` matrix to one logit per sample. Labels are
            not passed to it: prediction must not see the targets.
        criterion: Loss called as ``criterion(logits, labels.float())``.
        dataloader: Yields batches in the order produced by
            ``TweetDataset.__getitem__``: source tokens, source mask, pair
            tokens, pair mask, labels, statistical features.
        gpu: CUDA device passed to ``Tensor.cuda``.

    Returns:
        Tuple ``(mean_accuracy, mean_loss)`` averaged over batches; the
        accuracy is a 0-dim tensor, the loss a Python float.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    net.eval()
    if isinstance(classifier, nn.Module):
        classifier.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        for batch in dataloader:
            source_token, source_attn_masks, pair_token, pair_mask, labels, stat_feat = (
                tensor.cuda(gpu) for tensor in batch
            )
            # The encoder expects the pair inputs first, then the source inputs.
            embedding = net(pair_token, pair_mask, source_token, source_attn_masks)
            # torch.cat takes a sequence of tensors; join along the feature
            # axis and match the embedding's floating-point dtype.
            out = torch.cat([embedding, stat_feat.to(embedding.dtype)], dim=1)
            pred = classifier(out).reshape(-1)
            mean_loss += criterion(pred, labels.float()).item()
            mean_acc += get_accuracy_from_logits(pred, labels)
            count += 1

    if count == 0:
        raise ValueError('dataloader produced no batches to evaluate')
    return mean_acc / count, mean_loss / count

In [184]:

from sklearn.naive_bayes import GaussianNB
def train(classifier, criterion, optimizer, train_loader, dev_loader, max_eps, gpu, head=None):
    """Train the text encoder together with a logit head; checkpoint on dev gains.

    The previous implementation fitted a fresh scikit-learn ``GaussianNB`` on
    every batch and used the returned estimator as the prediction.
    ``GaussianNB.fit`` returns the estimator itself, cannot consume CUDA
    tensors and is not differentiable, so the loss could never be computed or
    back-propagated. A linear layer producing one logit per sample takes its
    place.

    Args:
        classifier: Text encoder called as ``classifier(pair_tokens, pair_mask,
            source_tokens, source_mask)`` (the ``TweetEmbedding.forward`` order).
        criterion: Loss called as ``criterion(logits, labels.float())``.
            NOTE(review): presumably ``nn.BCEWithLogitsLoss``, since accuracy
            is computed by applying a sigmoid to the predictions -- confirm.
        optimizer: Optimiser holding the encoder parameters.
        train_loader: Yields batches in the order produced by
            ``TweetDataset.__getitem__``: source tokens, source mask, pair
            tokens, pair mask, labels, statistical features.
        dev_loader: Loader handed to ``evaluate`` after every epoch.
        max_eps: Number of epochs to run.
        gpu: CUDA device passed to ``Tensor.cuda``.
        head: Optional module mapping the concatenated features to one logit
            per sample. When given, its parameters must already be registered
            with ``optimizer``. When omitted, an ``nn.Linear`` sized from the
            first batch is created and added to ``optimizer``.

    Returns:
        The logit head that was trained (``None`` if ``max_eps`` is 0 and no
        head was supplied).

    Raises:
        ValueError: If ``train_loader`` yields no batches and no head exists.
    """
    best_acc = 0
    st = time.time()

    for ep in range(max_eps):
        classifier.train()
        if head is not None:
            head.train()
        for step, batch in enumerate(train_loader):
            # Clear gradients
            optimizer.zero_grad()
            # Move the batch to the GPU, unpacking in the dataset's order
            # (source inputs first, then pair inputs).
            (source_tokens, source_attn_mask,
             pair_seq_tokens, pair_attn_mask,
             label, stat_df) = (tensor.cuda(gpu) for tensor in batch)
            # Text features, then the statistical features appended per sample.
            txt_feat = classifier(pair_seq_tokens, pair_attn_mask, source_tokens, source_attn_mask)
            feat = torch.cat([txt_feat, stat_df.to(txt_feat.dtype)], dim=1)
            if head is None:
                # The feature width is only known once the first batch is seen.
                head = nn.Linear(feat.shape[1], 1).to(feat.device)
                optimizer.add_param_group({'params': list(head.parameters())})
            pred = head(feat).reshape(-1)
            # Get loss
            loss = criterion(pred, label.float())
            # Backpropagating the gradients
            loss.backward()
            optimizer.step()

            if step % 100 == 0:
                acc = get_accuracy_from_logits(pred.detach(), label)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(step, ep, loss.item(), acc, (time.time()-st)))
                st = time.time()
        if head is None:
            raise ValueError('train_loader produced no batches to train on')
        # evaluate on dev-data
        dev_acc, dev_loss = evaluate(classifier, head, criterion, dev_loader, gpu)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep, dev_acc, dev_loss))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc
            torch.save(classifier.state_dict(), 'sstcls_{}.dat'.format(ep))
            # The head is needed alongside the encoder to reproduce predictions.
            torch.save(head.state_dict(), 'sstcls_head_{}.dat'.format(ep))

    return head
