# Downloads

In [None]:
import os

# Export CUDA_LAUNCH_BLOCKING=1 into this process's environment.
os.environ.update({'CUDA_LAUNCH_BLOCKING': "1"})

In [None]:
!pip install -q condacolab
import condacolab
condacolab.install()

In [None]:
!pip install tsnecuda

In [None]:
# NOTE(review): this installs from a local tarball that is only downloaded by
# the `wget` cell further down, so that cell has to be run first.
!conda install --offline tsnecuda-2.1.0-cuda101.tar.bz2

In [None]:
# Download the tsnecuda 2.1.0 (cuda101) package archive.
!wget https://anaconda.org/CannyLab/tsnecuda/2.1.0/download/linux-64/tsnecuda-2.1.0-cuda101.tar.bz2
# Unpack the archive into the current directory.
!tar xvjf tsnecuda-2.1.0-cuda101.tar.bz2
# Copy the unpacked site-packages into the system dist-packages directory.
# NOTE(review): the target path hard-codes Python 3.7 — confirm it matches the runtime.
!cp -r site-packages/* /usr/local/lib/python3.7/dist-packages/

In [1]:
!pip install transformers

In [2]:
!pip install datasets

In [3]:
!pip install livelossplot

# Imports

In [4]:
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import transformers
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from livelossplot import PlotLosses
from torch.utils import data 
import datetime


#import tsnecuda
#from tsnecuda import TSNE as TSNE_CUDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import gc



from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Loading Data

#### Dataset Description



*   The dataset comprises 12K samples
*   The claims have been classified as follows:

  *   False
  *   Mixture 
  *   True
  *   Unproven









In [5]:
from datasets import load_dataset

# Load the "health_fact" dataset; its splits are indexed by name further down.
dataset = load_dataset(path="health_fact")

In [6]:
def _has_known_label(example):
    """Return True unless the example carries the placeholder label -1."""
    return example['label'] != -1


# Keep only the samples whose label is not -1.
dataset = dataset.filter(_has_known_label)

# Part A 
# Using the RoBERTa model for text classification


*  In this part, I will be using the sequence classification model from Hugging Face for RoBERTa to classify the medical claims
*   The model will be fine-tuned on the training dataset comprising 9k samples



## Loading Model


1.   The model used here is RoBERTa.
2.   This model has shown significant improvements over the base BERT model, as reflected in its better performance.




In [7]:
# Checkpoint name; the tokenizer is loaded from it here and the classifier
# is loaded from the same name further down.
bert_version = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(bert_version)

In [8]:
# Tokenization

def encode(example):
    """Tokenize one example's ``main_text`` and attach its label as ``labels``.

    The text is tokenized with truncation enabled and ``max_length`` padding.
    The returned mapping holds every field produced by the tokenizer plus a
    ``labels`` entry copied from the example's ``label`` field.
    """
    features = dict(tokenizer(example['main_text'], truncation=True, padding='max_length'))
    features['labels'] = example['label']
    return features


tokenized_dataset = dataset.map(encode)
# Expose only the model inputs and the labels, formatted as torch tensors.
tokenized_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels'],
)

In [None]:
# `remove_columns` returns a new dataset instead of editing the existing one,
# so the result has to be assigned back for the call to have any effect.
unused_columns = ['claim_id', 'label', 'claim','date_published','explanation','fact_checkers','sources','subjects','main_text']

# Only drop columns that are still present so the cell can be re-run safely.
# Assumes every split has the same columns as the 'train' split.
present_columns = set(tokenized_dataset['train'].column_names)
columns_to_drop = [name for name in unused_columns if name in present_columns]
if columns_to_drop:
    tokenized_dataset = tokenized_dataset.remove_columns(columns_to_drop)

In [9]:
# NOTE: this rebinds the name `data`, which the imports cell bound to the
# `torch.utils.data` module; the loaders below reach the module via `torch`.
data = dict()
# Reshuffle the training samples at every epoch so the model does not see the
# batches in the same fixed order each time; evaluation loaders keep dataset order.
data['train'] = torch.utils.data.DataLoader(tokenized_dataset['train'], batch_size=10, shuffle=True)
data['validation'] = torch.utils.data.DataLoader(tokenized_dataset['validation'], batch_size=10)
test_data = torch.utils.data.DataLoader(tokenized_dataset['test'], batch_size=10)

In [10]:
from transformers import RobertaForSequenceClassification

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Sequence classifier with a 4-way output head, loaded from `bert_version`.
model = RobertaForSequenceClassification.from_pretrained(bert_version, num_labels=4)
model = model.to(device)

## Model Training


*   The RoBERTa model was trained for 4 epochs, in batches of size 10.
*   Due to limited resources, not all of the epochs could be completed, hence there is a keyboard interrupt.



In [11]:
def _mean_validation_loss(model):
    """Return the mean per-batch loss of ``model`` over ``data['validation']``.

    Puts the model in eval mode and runs without gradient tracking.  Returns
    NaN when the validation loader yields no batches.
    """
    model.eval()
    total_loss, n_batches = 0.0, 0
    with torch.no_grad():
        for val_batch in data['validation']:
            val_batch = {k: v.to(device) for k, v in val_batch.items()}
            total_loss += model(**val_batch)[0].item()
            n_batches += 1
    return total_loss / n_batches if n_batches else float('nan')


def train_model(model, optimizer, num_epochs=5, batch_size=4):
    """Fine-tune ``model`` on ``data['train']`` with gradient accumulation.

    Batches are read from the notebook-level ``data`` dict of loaders and moved
    to the notebook-level ``device``.  Gradients are accumulated over a window
    of batches before each optimizer step, and the mean loss of every completed
    window is sent to a live loss plot.  Every 400 training batches the model
    is also evaluated on ``data['validation']``.

    Args:
        model: model whose forward pass returns the loss as its first output
            when the batch includes ``labels``.
        optimizer: optimizer used to update ``model``.
        num_epochs: number of passes over the training loader.
        batch_size: only sizes the accumulation window (``2 * batch_size``
            batches when ``batch_size < 10``, otherwise ``batch_size``
            batches); the loaders keep their own batch size.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    accum_steps = batch_size * 2 if batch_size < 10 else batch_size
    eval_every = 400  # training batches between two validation passes

    curr_loss = {}
    liveloss = PlotLosses()

    for epoch in range(num_epochs):
        model.train()
        window_loss = 0.0
        pending = 0  # batches back-propagated since the last optimizer step

        for step, batch in enumerate(data['train']):
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch)[0]
            # Gradients are summed (not averaged) across the window.
            loss.backward()
            window_loss += loss.item()
            pending += 1

            if pending == accum_steps:
                optimizer.step()
                optimizer.zero_grad()
                # The loss is accumulated once per batch, so the window mean
                # divides by the number of batches only.
                curr_loss['train loss'] = window_loss / pending
                liveloss.update(curr_loss)
                liveloss.send()
                window_loss = 0.0
                pending = 0

            if step % eval_every == 0 and step > 0:
                curr_loss['validation loss'] = _mean_validation_loss(model)
                liveloss.update(curr_loss)
                liveloss.send()
                # Validation switched the model to eval mode; resume training mode.
                model.train()

        # Apply whatever is left of an incomplete window at the end of the epoch.
        if pending:
            optimizer.step()
            optimizer.zero_grad()

In [12]:
LEARNING_RATE = 1e-5
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [13]:
# Fine-tune for 3 epochs; batch_size=10 matches the loaders' batch size.
train_model(model=model, optimizer=optimizer, num_epochs=3, batch_size=10)

## Testing the Model


*   The test dataset, comprising 1235 samples, was used to test the model.
*   The results are presented below as a classification report and a confusion matrix.



In [16]:
# Import the submodule explicitly: a bare `import tqdm` only exposes
# `tqdm.notebook` when some other package happens to have loaded it already.
import tqdm.notebook


tqdmn = tqdm.notebook.tqdm
model = model.eval()
num_classes = 4
# confusion[t, p] counts the test samples with true class t and predicted class p.
confusion = torch.zeros(num_classes, num_classes)
y_true, y_pred = [], []
with torch.no_grad():
    for batch in tqdmn(test_data):
        batch = { k: v.to(device) for k, v in batch.items() }
        outputs = model(**batch)
        # Convert to plain Python ints once per batch instead of keeping
        # one zero-dimensional device tensor per sample.
        true_values = batch['labels'].tolist()
        # The batch includes `labels`, so output 0 is the loss; output 1 is
        # taken as the per-class scores.
        pred_values = torch.argmax(outputs[1], dim=1).tolist()
        y_true.extend(true_values)
        y_pred.extend(pred_values)
        for true, pred in zip(true_values, pred_values):
            confusion[true, pred] += 1
          

In [17]:
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np

# Make sure the collected labels are plain Python ints for scikit-learn.
y_pred = list(map(int, y_pred))
y_true = list(map(int, y_true))

# Class names in label-index order; used for both the report and the plot ticks.
labels = ['false','mixture','true','unproven']

# Normalise each row (true class) so it sums to 1.  Clamping the row totals
# keeps classes with no test samples at zero instead of turning them into NaN.
row_totals = confusion.sum(dim=1, keepdim=True)
confusion = confusion / row_totals.clamp(min=1)

print(metrics.classification_report(
    y_true,
    y_pred,
    labels=list(range(num_classes)),
    target_names=labels,
    digits=3,
))

fig, ax = plt.subplots(figsize=(10, 10))
image = ax.matshow(confusion.numpy())
fig.colorbar(image, ax=ax)

ids = np.arange(len(labels))
ax.set_title('Row-normalised confusion matrix', fontsize='x-large')
ax.set_ylabel('True Labels', fontsize='x-large')
ax.set_xlabel('Pred Labels', fontsize='x-large')
ax.set_xticks(ids)
ax.set_xticklabels(labels)
ax.set_yticks(ids)
ax.set_yticklabels(labels)

fig.tight_layout()
plt.show()

In [None]:
# Serialise the whole model object into the current working directory.
torch.save(obj=model, f="RoBERT_healthFacts.pt")