In [None]:
import os
import time
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, classification_report

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reproducibility switch.
# Leave SEED as None to keep runs unseeded (the notebook's previous behaviour);
# set it to an integer to seed every random number generator used below and to
# force deterministic cuDNN kernels.
SEED = None
if SEED is not None:
  random.seed(SEED)
  np.random.seed(SEED)
  torch.manual_seed(SEED)
  torch.cuda.manual_seed_all(SEED)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

# Dataloader

In [None]:
class TwitterDataset(Dataset):
  """Per-claim embedding matrices and their labels, keyed by an integer index.

  Every file found in `folder` is read with `np.load` and treated as one claim:
  a 2-D array whose rows are kept only while their last column is
  `<= 60 * hours`. Columns from 300 onwards are then standardised with
  statistics computed over all kept rows of all claims.
  (Presumably the first 300 columns are a text embedding and the last column
  is a time offset in minutes - confirm against the code that wrote the files.)

  Args:
    folder: directory holding one embedding file per claim, named `<eid>.<ext>`.
    hours: time window; rows whose last column exceeds `60 * hours` are dropped.
    label_file: path of the text file with one `label:eid` entry per line.

  Note:
    Indices come from the position of each file in `os.listdir(folder)`, and
    empty files are skipped, so the keys may have gaps and depend on the
    listing order. Index the dataset with the ids stored alongside it rather
    than with `range(len(dataset))`.
  """

  def __init__(self, folder, hours, label_file='label.txt'):
    self.folder = folder
    self.hours = hours
    self.label_file = label_file
    self.embeddings = {}   # idx -> float tensor (rows, features)
    self.ids = {}          # idx -> eid (file name up to the first '.')
    self.labels = {}       # idx -> tensor([class_int])
    self._labels = {}      # eid -> tensor([class_int]), as read from label_file
    self.eid_to_ids = {}   # eid -> idx
    self.label_to_int = {'true':0, 'false':1, 'non-rumor':2, 'unverified':3}

    self._load_labels()
    self._load_embeddings()
    self._get_norms()

    # sanity check
    assert len(self.embeddings) == len(self.labels)

  def __len__(self):
    """Number of claims that were actually loaded (empty files excluded)."""
    return len(self.embeddings)

  def __getitem__(self, idx):
    """Return `(embeddings, label)` for the claim stored under `idx`."""
    return self.embeddings[idx], self.labels[idx]

  def _load_embeddings(self):
    """Read embeddings from files and register ids/labels for each claim.

    Raises:
      KeyError: if a file's eid has no entry in the label file.
    """
    files = os.listdir(self.folder)

    for idx, file in enumerate(files):
      # os.path.join works whether or not `folder` ends with a separator
      emb = np.load(os.path.join(self.folder, file))
      emb = torch.from_numpy(emb).float()
      if len(emb) == 0:
        print('{} vacio'.format(file))
        continue

      # filter by date: keep rows whose last column is inside the time window
      emb = emb[emb[:, -1] <= (60.0 * self.hours)]
      self.embeddings[idx] = emb

      eid = file.split('.')[0]
      self.ids[idx] = eid
      self.labels[idx] = self._labels[eid]
      self.eid_to_ids[eid] = idx

  def _load_labels(self):
    """Parse `label:eid` lines from the label file into `self._labels`.

    Raises:
      KeyError: if a label is not one of the keys of `self.label_to_int`.
    """
    with open(self.label_file) as file:
      for line in file:
        line = line.strip()
        if not line:
          # tolerate blank / trailing empty lines
          continue
        fields = line.split(':')
        self._labels[fields[1]] = torch.tensor([self.label_to_int[fields[0]]])

  def _get_norms(self):
    """Standardise columns 300+ of every claim in place (zero mean, unit std).

    The statistics are computed once over the rows of all claims. Columns
    whose standard deviation is zero or undefined are only centred, so the
    division can never introduce NaN/inf values.
    """
    if not self.embeddings:
      return

    # single concatenation over all claims (no per-claim row dropping)
    features = torch.cat(
      [claim[:, 300:] for claim in self.embeddings.values()], dim=0
    )
    mean = torch.mean(features, dim=0)
    std = torch.std(features, dim=0)
    # std is 0 for constant columns and NaN when there is a single row
    std = torch.where(std > 0, std, torch.ones_like(std))

    for claim in self.embeddings.values():
      claim[:, 300:] = (claim[:, 300:] - mean) / std

In [None]:
# build the dataset (claims restricted to a 24-hour window)
dataset = TwitterDataset('embeddings/', 24.0)
print("Data loaded")

# load the train/dev/test split ids
import json
# json.load needs an open file object, not a path string
with open('data_ids.json') as ids_file:
  ids = json.load(ids_file)
train_ids, dev_ids, test_ids = ids['train'], ids['dev'], ids['test']

# Model

In [None]:
class Attention(nn.Module):
  """Attention nn module that is responsible for computing the alignment scores.

  Supported scoring methods:
    'dot':     score = hidden . out
    'general': score = hidden . W(out)
    'concat':  score = v . tanh(W([hidden; out]))

  Args:
    method: one of 'dot', 'general', 'concat'.
    hidden_size: the learned layers operate on vectors of size
      `2 * hidden_size`.

  Raises:
    ValueError: if `method` is not a supported scoring method.
  """
  _METHODS = ('dot', 'general', 'concat')

  def __init__(self, method, hidden_size):
    super(Attention, self).__init__()
    if method not in self._METHODS:
      raise ValueError(
        "unknown attention method {!r}; expected one of {}".format(method, self._METHODS)
      )
    self.method = method
    self.hidden_size = hidden_size

    # Define attention layer
    if self.method == 'general':
      self.attention = nn.Linear(self.hidden_size * 2, self.hidden_size * 2)
    elif self.method == 'concat':
      self.attention = nn.Linear(self.hidden_size * 4, self.hidden_size * 2)
      # explicit initialisation: an uninitialised tensor may hold arbitrary
      # (even non-finite) values
      self.v = nn.Parameter(torch.empty(self.hidden_size * 2))
      bound = 1.0 / (self.hidden_size * 2) ** 0.5
      nn.init.uniform_(self.v, -bound, bound)

  def forward(self, hidden, encoder_out):
    '''
      input:
        hidden: [dim] query vector
        encoder_out: [n, dim] vectors to attend over
        ('general' and 'concat' require dim == 2 * hidden_size)
      output:
        weights: [1, n], softmax-normalised over the n vectors
    '''
    scores = self._scores(hidden, encoder_out)
    return F.softmax(scores, dim=0).unsqueeze(0)

  def _scores(self, hidden, encoder_out):
    """Score every row of `encoder_out` against `hidden` at once; returns [n]."""
    if self.method == 'dot':
      return encoder_out.matmul(hidden)
    if self.method == 'general':
      return self.attention(encoder_out).matmul(hidden)
    # 'concat': pair the query with every row before the learned projection
    query = hidden.unsqueeze(0).expand(encoder_out.size(0), -1)
    paired = torch.cat((query, encoder_out), dim=1)
    return torch.tanh(self.attention(paired)).matmul(self.v)

  def _score(self, hidden, encoder_output):
    """Calculate the relevance of a particular rnn output in respect to the prediction."""
    return self._scores(hidden, encoder_output.unsqueeze(0))[0]

class Encoder(nn.Module):
  """Thin wrapper around `nn.GRU` (bidirectional by default).

  Args:
    input_size: number of features of each input step.
    hidden_size: GRU hidden size per direction.
    bi_d: build a bidirectional GRU when True.
    dropout: dropout between stacked GRU layers (forwarded to `nn.GRU`).
    num_layers: number of stacked GRU layers.
  """
  def __init__(self, input_size, hidden_size, bi_d=True, dropout=0, num_layers=1):
    super(Encoder, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.bi_d = bi_d

    self.bi_gru = nn.GRU(
      input_size,
      hidden_size,
      bidirectional=bi_d,
      dropout=dropout,
      num_layers=num_layers
    )

  def forward(self, x, hidden=None):
    """Run the GRU over `x`.

    Args:
      x: tensor of shape (seq_len, batch, input_size); the GRU is built
        without `batch_first`.
      hidden: optional initial state of shape
        (num_directions * num_layers, batch, hidden_size). When None, a zero
        state is created on the same device and dtype as `x`.

    Returns:
      `(output, hidden)` exactly as returned by `nn.GRU`.
    """
    if hidden is None:
      directions = 2 if self.bi_d else 1
      hidden = x.new_zeros(directions * self.num_layers, x.size(1), self.hidden_size)
    output, hidden = self.bi_gru(x, hidden)
    return output, hidden

class RumorModel(nn.Module):
  """Claim classifier: user-feature MLP -> GRU encoder -> attention -> MLP head.

  `forward` takes the 2-D feature matrix of one claim. Columns 0-299 are used
  as they are, and columns 300+ (12 values, given `nn.Linear(12, 32)`) are
  projected to 64 dimensions, so the encoder always receives 300 + 64 = 364
  features per row.

  Args:
    input_size: feature size expected by the encoder. Only 364 matches what
      `forward` produces.
      NOTE(review): the default of 300 cannot work with `forward` as written -
      confirm and consider changing the default.
    hidden_size: GRU hidden size per direction.
    bi_d: stored on the module only.
      NOTE(review): it is not forwarded to `Encoder`, which is always built
      bidirectional; the `hidden_size * 4` input of `f1` relies on that.
    dropout: dropout probability used in the user MLP and the head.
    method: attention scoring method forwarded to `Attention`.
    num_classes: number of output logits (default 2).
  """
  def __init__(self, input_size=300, hidden_size=32, bi_d=True, dropout=0.0, method='dot', num_classes=2):
    super(RumorModel, self).__init__()

    self.input_size = input_size
    self.hidden_size = hidden_size
    self.bi_d = bi_d
    self.num_classes = num_classes

    self.encoder = Encoder(
      input_size,
      hidden_size,
      num_layers=1
    )
    self.attention = Attention(
      method,
      hidden_size
    )

    self.dropout = nn.Dropout(p=dropout)
    # input is [context ; h], each of size 2 * hidden_size
    self.f1 = nn.Linear(hidden_size * 4, 32)
    self.f2 = nn.Linear(32, num_classes)
    self.activation = F.relu

    # projection of the non-text columns (300+) to 64 dimensions
    self.user = nn.Linear(12, 32)
    self.user2 = nn.Linear(32, 64)

  def forward(self, x=None):
    """Return logits of shape (1, num_classes) for one claim.

    Args:
      x: 2-D tensor (rows, 312); row 0 is used as the attention query and the
        remaining rows as the attended set.
    """
    u = self.dropout(self.activation(self.user(x[:, 300:])))
    u = self.activation(self.user2(u))
    x = torch.cat((x[:, :300], u), 1)

    # encoder
    # NOTE(review): x.unsqueeze(0) has shape (1, rows, features). Encoder builds
    # its GRU without batch_first, so dim 0 is the time axis: every row is
    # encoded as an independent length-1 sequence rather than as one sequence
    # of rows. Confirm this is intended before changing it.
    hidden = None
    out, _ = self.encoder(x.unsqueeze(0), hidden)
    h = out[0, 0, :]
    hs = out[0, 1: ,:]

    # attention module: weighted sum of the remaining rows, queried by row 0
    align = self.attention(h, hs)
    context = torch.bmm(align.unsqueeze(0), hs.unsqueeze(0))
    out = torch.cat((context[0], h.unsqueeze(0)), 1)

    # linear layer
    out = self.activation(self.f1(out))
    out = self.dropout(out)
    out = self.f2(out)
    return out

In [None]:
def train(net, dataset, train_ids, criterion, optimizer):
  """Run one optimisation pass over `train_ids`, one sample per step.

  Returns the mean of the per-sample losses.
  """
  net.train()
  step_losses = []

  for sample_id in train_ids:
    features, target = dataset[sample_id]
    # move the sample to the active device
    features = features.to(device=device)
    target = target.to(device=device)

    # one gradient step per sample
    optimizer.zero_grad()
    step_loss = criterion(net(features), target)
    step_loss.backward()
    optimizer.step()

    step_losses.append(step_loss.item())

  return np.mean(step_losses)

def dev(net, data_dev, dev_ids, criterion):
  """Compute the mean per-sample loss over `dev_ids` without updating `net`."""
  net.eval()
  sample_losses = []

  with torch.no_grad():
    for sample_id in dev_ids:
      features, target = data_dev[sample_id]
      # move the sample to the active device
      features = features.to(device=device)
      target = target.to(device=device)

      sample_loss = criterion(net(features), target)
      sample_losses.append(sample_loss.item())

  return np.mean(sample_losses)


def test(net, data_test, test_ids, epoch, average='binary'):
  """Predict every sample in `test_ids` and score the predictions.

  Args:
    net: model returning logits of shape (1, n_classes) for one sample.
    data_test: dataset indexable by the ids in `test_ids`.
    test_ids: ids of the samples to evaluate.
    epoch: unused; kept so existing callers keep working.
    average: averaging mode forwarded to recall/precision/F1. The default
      'binary' matches scikit-learn's default; use e.g. 'macro' when more
      than two classes are present.

  Returns:
    Tuple `(accuracy, recall, precision, f1)`.
  """
  y_pred = []
  y_true = []
  net.eval()
  with torch.no_grad():
    for idx in test_ids:
      x, y = data_test[idx]
      x = x.to(device=device)

      out = net(x)
      # index of the largest logit is the predicted class
      _, y_hat = torch.max(out, 1)
      y_true.append(y.item())
      y_pred.append(y_hat.item())

  # parentheses are required for a return value that spans several lines
  return (
    accuracy_score(y_true, y_pred),
    recall_score(y_true, y_pred, average=average),
    precision_score(y_true, y_pred, average=average),
    f1_score(y_true, y_pred, average=average),
  )

# Main routine

In [None]:
def main(args):
  """Train a RumorModel and evaluate it after every epoch.

  Args:
    args: dict with the keys 'epoch', 'lr', 'input_size', 'hidden_size',
      'att_method', 'dropout' and 'weight_decay'.

  Returns:
    Six per-epoch lists:
    `(train_losses, dev_losses, all_f1, all_acc, all_rec, all_pre)`.

  Side effects:
    Prints progress, may switch the default tensor type to CUDA, and pickles
    the whole trained module to 'net.pt'.
  """

  # parse args
  epochs = args['epoch']
  learning_rate = args['lr']
  input_size = args['input_size']
  hidden_size = args['hidden_size']
  att_method = args['att_method']
  dropout = args['dropout']
  weight_decay = args['weight_decay']

  # NOTE(review): set_default_tensor_type is deprecated in recent PyTorch
  # releases; kept because cells outside this function may rely on it.
  if device.type == 'cuda':
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

  # define model (the attention method is forwarded so 'att_method' takes effect)
  net = RumorModel(
    input_size,
    hidden_size,
    dropout=dropout,
    method=att_method
  ).to(device)
  print("-Model created")

  # loss and optimizer
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(
    net.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay
  )

  train_losses = []
  dev_losses = []
  all_f1 = []
  all_acc = []
  all_rec = []
  all_pre = []

  # shuffle a private copy so the notebook-level train_ids list is left untouched
  epoch_ids = list(train_ids)

  for epoch in range(epochs):
    print("----------Epochs {}----------".format(epoch))
    random.shuffle(epoch_ids)
    start = time.time()

    print('-Training')
    loss_t = train(net, dataset, epoch_ids, criterion, optimizer)
    loss_d = dev(net, dataset, dev_ids, criterion)
    print('train loss:{:.4f}, dev loss:{:.4f}\n-Validation'.format(loss_t, loss_d))

    acc, rec, pre, f1 = test(net, dataset, test_ids, epoch)
    print('acc:{:.4f}, rec:{:.4f}, pre:{:.4f}, f1:{:.4f}'.format(acc, rec, pre, f1))
    print('Time epochs:{:.4f}'.format(time.time()-start))

    train_losses.append(loss_t)
    dev_losses.append(loss_d)
    all_f1.append(f1)
    all_acc.append(acc)
    all_rec.append(rec)
    all_pre.append(pre)
  torch.save(net, 'net.pt')

  return train_losses, dev_losses, all_f1, all_acc, all_rec, all_pre


if __name__ == '__main__':
  # hyper-parameters of a single training run
  args = {
    'epoch': 100,
    'lr': 0.0001,
    'input_size': 364,
    'hidden_size': 32,
    'att_method': 'dot',
    'dropout': 0.2,
    'weight_decay': 0,
  }

  trainloss, devloss, allf1, allacc, allrec, allpre = main(args)