In [None]:
import torch

# Choose the compute device once; later cells move the model and every batch onto it.
if not torch.cuda.is_available():
    # No CUDA device is visible: run on the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # CUDA is usable: select it, then report how many GPUs exist and the name of GPU 0.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 27.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 37.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 54.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninsta

In [None]:
import random
import os
import argparse
import time
import datetime
import re
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, f1_score
import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup, AutoTokenizer, AutoModelForSequenceClassification
from sklearn import model_selection, naive_bayes, svm
from tqdm import tqdm
from collections import defaultdict
import tensorflow_hub as hub

In [None]:


def encode_label(label):
    """
    Map a raw annotation code to the integer class id used for training.

    'T' and 'U' are merged into class 0; 'F' becomes class 1.
    NOTE(review): the original docstring described class 0 as "Rumor" — confirm
    the meaning of the T/U/F codes against the dataset's annotation guide.

    Args:
        label: the raw label value, expected to be one of 'T', 'U' or 'F'.

    Returns:
        int: 0 or 1.

    Raises:
        ValueError: if `label` is not one of the three known codes. Previously
            such values silently became ``None`` and only failed later, when the
            label column was converted to a tensor.
    """
    if label in ('T', 'U'):
        return 0
    if label == 'F':
        return 1
    raise ValueError(f"Unknown label {label!r}; expected 'T', 'U' or 'F'")



In [None]:
# Seed every random number generator this notebook draws from (Python, NumPy, PyTorch)
# with the same value so that a top-to-bottom rerun is repeatable.
seed = 103
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7fe38f4fff90>

In [None]:
def loadFile(X, y, max_len=128, model_name='digitalepidemiologylab/covid-twitter-bert'):
    """
    Tokenize raw sentences and package them as tensors for a BERT classifier.

    Args:
        X: iterable of text strings; each one is lower-cased before tokenization.
        y: labels, in any form accepted by ``torch.tensor``.
        max_len: number of token ids per sentence. Longer sentences are truncated
            by the tokenizer and shorter ones are right-padded with 0.
            Defaults to 128, the value that was previously hard-coded twice.
        model_name: checkpoint whose tokenizer is loaded. Defaults to the
            checkpoint that was previously hard-coded.

    Returns:
        tuple: ``(input_ids, labels, attention_masks)``. ``input_ids`` and
        ``attention_masks`` have shape ``(len(X), max_len)``; the mask is 1
        wherever the token id is greater than 0 (i.e. not padding) and 0 elsewhere.
    """
    # Define tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Encode sentences to ids (special tokens included, truncated to max_len)
    input_ids = [
        tokenizer.encode(sent.lower(),
                         add_special_tokens=True,
                         truncation=True,
                         max_length=max_len)
        for sent in tqdm(X)
    ]

    # Right-pad every sequence with 0 up to max_len
    input_ids = tf.keras.preprocessing.sequence.pad_sequences(input_ids,
                                                                maxlen=max_len,
                                                                dtype='long',
                                                                value=0,
                                                                truncating='post',
                                                                padding='post')

    X = torch.tensor(input_ids)
    y = torch.tensor(y)

    # Attention mask in one vectorised step: 1 for real tokens (id > 0), 0 for padding.
    attention_masks = (X > 0).long()

    return X, y, attention_masks

def makeDataLoader(X, y, attention_masks, batch_size=16, shuffle=True):
    """
    Wrap the encoded tensors in a PyTorch ``DataLoader``.

    Args:
        X: tensor of token ids, one row per example.
        y: tensor of labels, one entry per example.
        attention_masks: tensor of attention masks aligned with ``X``.
        batch_size: examples per batch. Defaults to 16, the value that was
            previously hard-coded.
        shuffle: when True (the default, and the previous behaviour) examples
            are drawn in random order; when False they are served in their
            original order, which suits evaluation and prediction.

    Returns:
        DataLoader: yields ``(input_ids, attention_mask, labels)`` batches.
    """
    data = TensorDataset(X, attention_masks, y)
    sampler = RandomSampler(data) if shuffle else SequentialSampler(data)
    return DataLoader(data, sampler=sampler, batch_size=batch_size)

def flat_accuracy(preds, labels):
    """
    Fraction of examples whose highest-scoring column in `preds` equals the label.

    `preds` is reduced with an argmax over axis 1 and `labels` is flattened
    before the element-wise comparison.
    """
    truth = labels.flatten()
    guesses = np.argmax(preds, axis=1).flatten()
    hits = np.sum(guesses == truth)
    return hits / len(truth)

def f1(y_true: torch.Tensor, y_pred: torch.Tensor, is_training=False) -> torch.Tensor:
    '''Binary F1 score computed with tensor arithmetic.

    `y_true` must be 1-D. `y_pred` may be 1-D (hard 0/1 predictions) or 2-D
    (per-class scores, reduced with an argmax over dim 1). A small epsilon
    keeps the divisions finite when a denominator would otherwise be zero.
    The result's `requires_grad` flag is set to `is_training`.

    Reference: https://gist.github.com/SuperShinyEyes/dcc68a08ff8b615442e3bc6a9b55a354
    '''
    assert y_true.ndim == 1
    assert y_pred.ndim == 1 or y_pred.ndim == 2

    if y_pred.ndim == 2:
        y_pred = y_pred.argmax(dim=1)

    def _count(a, b):
        # Number of positions where both indicator vectors are 1, as float32.
        return (a * b).sum().to(torch.float32)

    # Confusion-matrix cells. `tn` is computed as in the reference
    # implementation but does not enter the score.
    tp = _count(y_true, y_pred)
    tn = _count(1 - y_true, 1 - y_pred)
    fp = _count(1 - y_true, y_pred)
    fn = _count(y_true, 1 - y_pred)

    epsilon = 1e-7

    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)

    score = 2 * (precision * recall) / (precision + recall + epsilon)
    score.requires_grad = is_training
    return score

def format_time(elapsed):
    '''
    Render a duration given in seconds as text.

    The value is rounded to the nearest whole second and formatted by
    `datetime.timedelta`, i.e. h:mm:ss (with a day prefix for 24 hours or more).
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)
def prediction(logits):
  """
  Turn rows of two scores into class ids.

  A row yields 0 when its first score is strictly greater than its second,
  otherwise 1 (ties go to class 1). Returns a plain list.
  """
  return [0 if row[0] > row[1] else 1 for row in logits]
def accurate(TF, labelids):
  """
  Count how many predictions match their labels.

  Args:
    TF: sequence of predicted class ids.
    labelids: sequence of true class ids, aligned with `TF`.

  Returns:
    tuple: (number of positions where prediction == label, len(TF)).
  """
  # Compare against the `labelids` argument. The previous body read a global
  # named `label_ids`, which only worked by accident inside the evaluation loop.
  score = sum(1 for i in range(len(TF)) if TF[i] == labelids[i])
  return score, len(TF)

In [None]:
# Load the pretrained checkpoint into a sequence-classification model, then move it to `device`.
# The trailing notes record the F1 observed for each checkpoint that was tried.
model = BertForSequenceClassification.from_pretrained(
    'digitalepidemiologylab/covid-twitter-bert'  # 87% F1
)
model = model.to(device)
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased').to(device) # 82% F1
# model = BertForSequenceClassification.from_pretrained('vinai/bertweet-base').to(device) # 75% F1

Downloading:   0%|          | 0.00/421 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Some weights of the model checkpoint at digitalepidemiologylab/covid-twitter-bert were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassifi

In [None]:
# Colab-only step: mount Google Drive so its files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Copy the labelled sample into the working directory; a later cell reads it by bare filename.
# NOTE(review): the output recorded with this cell shows the copy failing with
# "No such file or directory" — confirm the Drive path before relying on it.
!cp "/content/drive/MyDrive/sampledata1000.csv" .

Mounted at /content/drive
cp: cannot stat '/content/drive/MyDrive/sampledata1000.csv': No such file or directory


In [None]:
# Data preparation: read the labelled CSV, encode labels, split into train/test,
# tokenize both parts and wrap them in DataLoaders named `train` and `test`.

# NOTE(review): `parser` is created but not used by any visible cell.
parser = argparse.ArgumentParser()

# NOTE(review): MAX_LEN is not referenced by any visible cell; loadFile applies
# its own 128-token limit.
MAX_LEN = 64

df = pd.read_csv(r"sampledata1000.csv")
# Replace the raw label codes with integer class ids (see encode_label).
df['label'] = df['label'].apply(lambda x: encode_label(x))  
# Lower-case every text entry (loadFile lower-cases again before tokenizing).
df['content'] = [entry.lower() for entry in df['content']]

# Shuffled 80/20 split. No random_state is passed, so the split is drawn from
# NumPy's global RNG and depends on the seed cell having been run just before.
# NOTE(review): the progress bars recorded under this cell show 700 and 300 rows,
# which does not match test_size=0.2 — the saved output may come from an older run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(df['content'],df['label'],test_size=0.2, shuffle = True)
#y_train = y_train.apply(lambda x: encode_label(x)) 
#Change all the text to lower case. 

# X_train = df['content']
# y_train = df['label']
# Tokenize/pad each split; from here on X_*/y_* are tensors, not pandas objects.
X_train, y_train, mask_train = loadFile(X_train, y_train.values)
#X_test, y_test = df['content'] , df['label']
X_test, y_test, mask_test = loadFile(X_test, y_test.values)

# Batch iterators consumed by the training / evaluation loop.
train = makeDataLoader(X_train, y_train, mask_train)
test = makeDataLoader(X_test, y_test, mask_test)


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

100%|██████████| 700/700 [00:00<00:00, 5834.21it/s]
100%|██████████| 300/300 [00:00<00:00, 6029.36it/s]


In [None]:

# Tallies touched by the evaluation pass of the epoch loop.
TP = TF = Denominator = 0

# Length of the run: one optimizer step per batch, per epoch.
epochs = 10
total_steps = epochs * len(train)

# AdamW optimizer plus a linear learning-rate schedule with no warm-up steps.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Average training loss of each epoch, appended by the loop.
loss_values = []

# Fine-tuning loop: each epoch performs one training pass over `train`, then an
# evaluation pass over `test`, prints the metrics and saves a checkpoint.
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Elapsed time since the epoch started, formatted by format_time.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train), elapsed))

        # Batches are (input ids, attention mask, labels); move each to the device.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # Labels are supplied, so the first element of the output is used as the loss.
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        
        loss = outputs[0]

        # Accumulate as a Python float so the epoch average can be computed later.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Apply the parameter update.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training batches.
    avg_train_loss = total_loss / len(train)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.5f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Testing
    # ========================================
    # After the completion of each training epoch, measure performance on the
    # held-out `test` loader.

    print("")
    print("Running Testing...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables.
    # NOTE(review): eval_loss, eval_f1 and nb_eval_examples are initialised here
    # but never updated in this loop.
    eval_loss, eval_accuracy, eval_f1 = 0, 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # result1 collects the logits and result2 the labels of every test batch.
    result1 = []
    result2 = []
    # Evaluate data for one epoch
    for batch in test:
        
        # Move the batch to the selected device
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # No labels were passed, so the first element of the output is used as the logits.
        logits = outputs[0]

        # Bring logits and labels back to the CPU as NumPy arrays.
        logits = logits.detach().cpu().numpy()
        # Per-row class ids for this batch (rebinds the global TF to a list).
        TF = prediction(logits)#my changes
        label_ids = b_labels.to("cpu").numpy()
        # Grow the epoch-level arrays: first batch initialises, later batches append.
        if len(result1) == 0:
            result1 = logits
            result2 = label_ids
        else:
            result1 = np.append(result1,logits,axis=0)
            result2 = np.append(result2,label_ids)
        tempscore, temp_den = accurate(TF, label_ids) #my changes
        # NOTE(review): TP and Denominator are not reset per epoch, so they keep
        # accumulating across epochs; only the commented-out print below reads them.
        TP+=tempscore # my changes
        Denominator+=temp_den#my changes
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)            # acc
        # tmp_eval_f1 = f1_score(np.argmax(logits, axis = 1).flatten(), label_ids.flatten())
        
        # Accumulate the per-batch accuracies.
        eval_accuracy += tmp_eval_accuracy
        # eval_f1 += tmp_eval_f1

        # Track the number of batches
        nb_eval_steps += 1
    
        

    # Report the final accuracy for this validation run.
    # NOTE(review): this is the mean of per-batch accuracies, so a smaller final
    # batch carries the same weight as a full one.
    #print(f"myACCURACY{(TP/Denominator)}")#my changes
    print("  Accuracy: {0:.5f}".format(eval_accuracy/nb_eval_steps))
    # F1 over all test examples of this epoch.
    # NOTE(review): sklearn's signature is f1_score(y_true, y_pred); here the
    # predictions are passed first and the labels second.
    print("  F1: {0:.5f}".format(f1_score(np.argmax(result1, axis = 1).flatten(), result2.flatten())))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

    # Save a checkpoint for this epoch under models/ctbert_<epoch number>.
    model.save_pretrained("models/ctbert_"+str(epoch_i+1))

print("Training complete!")




Training...




  Batch    40  of     44.    Elapsed: 0:00:43.

  Average training loss: 0.58906
  Training epcoh took: 0:00:48

Running Testing...
  Accuracy: 0.77741
  F1: 0.86128
  Validation took: 0:00:07

Training...
  Batch    40  of     44.    Elapsed: 0:00:47.

  Average training loss: 0.37437
  Training epcoh took: 0:00:51

Running Testing...
  Accuracy: 0.78399
  F1: 0.84778
  Validation took: 0:00:08

Training...
  Batch    40  of     44.    Elapsed: 0:00:46.

  Average training loss: 0.10167
  Training epcoh took: 0:00:51

Running Testing...
  Accuracy: 0.71162
  F1: 0.77285
  Validation took: 0:00:08

Training...
  Batch    40  of     44.    Elapsed: 0:00:47.

  Average training loss: 0.03833
  Training epcoh took: 0:00:51

Running Testing...
  Accuracy: 0.79057
  F1: 0.84819
  Validation took: 0:00:08

Training...
  Batch    40  of     44.    Elapsed: 0:00:47.

  Average training loss: 0.00386
  Training epcoh took: 0:00:51

Running Testing...
  Accuracy: 0.79715
  F1: 0.86175
  Validati

In [None]:
# Prediction on unlabelled data: same forward pass as the test step of the
# training loop, but the predicted class is written out instead of being
# compared with a label.
#
# First copy the scraped tweet CSVs from Google Drive into the working directory.
# NOTE(review): the output recorded with this cell shows every one of these
# copies failing with "No such file or directory" — confirm the Drive paths.
!cp "/content/drive/MyDrive/Twitter5gcovid.csv" .
!cp "/content/drive/MyDrive/Twitter5gcoronavirus.csv" .
!cp "/content/drive/MyDrive/Twittercurecovid.csv" .
!cp "/content/drive/MyDrive/TwitterCoronavirusHoax.csv" .
!cp "/content/drive/MyDrive/Twittercurecoronavirus.csv" .
!cp "/content/drive/MyDrive/Twitterfaucicoronavirus.csv" .
!cp "/content/drive/MyDrive/Twitterfaucicovid.csv" .
!cp "/content/drive/MyDrive/Twittergatescoronavirus.csv" .
!cp "/content/drive/MyDrive/Twittergatescovid.csv" .
!cp "/content/drive/MyDrive/Twittermicrochip.csv" .
!cp "/content/drive/MyDrive/Twittergatesfoundation.csv" .
# Classify every tweet in each Twitter<name>.csv and write one line per tweet
# to <name>predictions.txt in the form: content,predicted_class,id,year
tokenizer = BertTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')
#filelist = ['5gcovid', "5gcoronavirus",'curecovid', 'CoronavirusHoax',\
#        'curecoronavirus', 'faucicoronavirus', 'faucicovid', 'gatescoronavirus',\
filelist = ['gatescovid', 'microchip', 'gatesfoundation']
#Batch
# NOTE(review): max_len is not used below; it is kept in case other cells read it.
max_len = 100

from tqdm import tqdm

# Make sure dropout is disabled even if this cell is run without the training cell
# having just finished its evaluation pass.
model.eval()

# Truncate only at the model's own position limit, so inputs that fit are
# tokenized as before and inputs that could not be embedded no longer crash the run.
max_tokens = model.config.max_position_embeddings

for j in filelist:
  df = pd.read_csv(f"Twitter{j}.csv")
  # `with` guarantees the predictions file is flushed and closed, also when an
  # exception interrupts the loop (the handle was previously never closed).
  with open(f"{j}predictions.txt", 'w') as out:
    for x, y, z in tqdm(zip(df['content'], df['id'], df['year']), total=len(df)):
      inputs = tokenizer(str(x),
                         return_tensors="pt",
                         truncation=True,
                         max_length=max_tokens).to(device)
      with torch.no_grad():
        logits = model(**inputs).logits
      # One example per call, so the argmax over the flattened logits is the class id.
      predicted_class_id = logits.argmax().item()
      # NOTE(review): the content is written unescaped; commas or newlines inside
      # a tweet make the line ambiguous for downstream parsing.
      out.write(f"{x},{predicted_class_id},{y},{z}\n")

cp: cannot stat '/content/drive/MyDrive/Twitter5gcovid.csv': No such file or directory
cp: cannot stat '/content/drive/MyDrive/Twitter5gcoronavirus.csv': No such file or directory
cp: cannot stat '/content/drive/MyDrive/Twittercurecovid.csv': No such file or directory
cp: cannot stat '/content/drive/MyDrive/TwitterCoronavirusHoax.csv': No such file or directory
cp: cannot stat '/content/drive/MyDrive/Twittercurecoronavirus.csv': No such file or directory
cp: cannot stat '/content/drive/MyDrive/Twitterfaucicoronavirus.csv': No such file or directory
cp: cannot stat '/content/drive/MyDrive/Twitterfaucicovid.csv': No such file or directory
cp: cannot stat '/content/drive/MyDrive/Twittergatescoronavirus.csv': No such file or directory
cp: cannot stat '/content/drive/MyDrive/Twittergatescovid.csv': No such file or directory
cp: cannot stat '/content/drive/MyDrive/Twittermicrochip.csv': No such file or directory
cp: cannot stat '/content/drive/MyDrive/Twittergatesfoundation.csv': No such fil

100%|██████████| 8887/8887 [03:34<00:00, 41.50it/s]
100%|██████████| 20461/20461 [08:13<00:00, 41.43it/s]
100%|██████████| 112737/112737 [44:30<00:00, 42.22it/s]
100%|██████████| 24742/24742 [09:52<00:00, 41.78it/s]
100%|██████████| 36289/36289 [14:08<00:00, 42.76it/s]
100%|██████████| 6687/6687 [02:41<00:00, 41.47it/s]
100%|██████████| 8087/8087 [03:16<00:00, 41.22it/s]
100%|██████████| 3526/3526 [01:24<00:00, 41.63it/s]
100%|██████████| 3779/3779 [01:25<00:00, 44.08it/s]
100%|██████████| 571899/571899 [3:34:16<00:00, 44.48it/s]
 31%|███       | 90719/292562 [35:15<1:17:08, 43.61it/s]