# Downloads

In [None]:
import os
# Set CUDA_LAUNCH_BLOCKING in the process environment here, ahead of the
# `import torch` in the imports cell further down.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
!pip install -q condacolab
import condacolab
condacolab.install()

In [None]:
!pip install tsnecuda

In [None]:
# Install tsnecuda from a local conda tarball without contacting any channel.
# NOTE(review): the only visible download of this tarball is the `wget` in the
# next cell, so on a fresh runtime this cell presumably has to run after it - confirm.
!conda install --offline tsnecuda-2.1.0-cuda101.tar.bz2

In [None]:
# Download the tsnecuda 2.1.0 (cuda101) conda package and unpack it in the
# current working directory.
!wget https://anaconda.org/CannyLab/tsnecuda/2.1.0/download/linux-64/tsnecuda-2.1.0-cuda101.tar.bz2
!tar xvjf tsnecuda-2.1.0-cuda101.tar.bz2
# Copy the unpacked site-packages into a hard-coded Python 3.7 dist-packages
# directory. NOTE(review): this path only matches a Python 3.7 runtime - confirm.
!cp -r site-packages/* /usr/local/lib/python3.7/dist-packages/

In [1]:
!pip install transformers

In [2]:
!pip install datasets

In [3]:
!pip install livelossplot

# Imports

In [4]:
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import transformers
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from livelossplot import PlotLosses
from torch.utils import data 
import datetime


#import tsnecuda
#from tsnecuda import TSNE as TSNE_CUDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import gc



from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Loading Data

#### Dataset Description



*   The dataset comprises about 12K samples
*   The claims have been classified as follows

  *   False
  *   Mixture
  *   True
  *   Unproven









In [5]:
from datasets import load_dataset

# Load the "health_fact" dataset; later cells index the result by split name
# ('train', 'validation', 'test').
dataset = load_dataset("health_fact")

In [None]:
#dataset

In [None]:
#dataset['train'][0]

In [None]:
# Class names and class count, read from the `label` feature of the train split.
# These were previously wrapped in a string literal, which left `labels`
# undefined for the cell below on a fresh kernel.
labels = dataset['train'].features['label'].names
num_classes = dataset['train'].features['label'].num_classes

In [None]:
labels

In [6]:
# Drop every sample whose label is -1. `dataset` is rebound to the filtered
# result, so all later cells (tokenization, embeddings, label extraction)
# only see samples with a label other than -1.
dataset = dataset.filter(lambda x: x['label'] != -1)

# Part A 
# Using the RoBERTa model for text classification


*   In this part, I use the Hugging Face sequence-classification model for RoBERTa to classify the medical claims.
*   The model is fine-tuned on the training dataset, which comprises about 9k samples.



## Loading Model


1.   The model used here is RoBERTa.
2.   This model has shown significant improvements over the base BERT model, which is reflected in its better performance.




In [7]:
# Checkpoint identifier shared by the tokenizer and both models loaded later.
# Despite the variable name, the value is a RoBERTa checkpoint.
bert_version = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(bert_version)

In [8]:
#tokenization

def encode(example):
    """Tokenize one sample's `main_text` and attach its label.

    The text is truncated and padded to the tokenizer's maximum length. The
    returned mapping holds every tokenizer output plus a `labels` entry copied
    from the sample's `label` field.
    """
    features = dict(tokenizer(example['main_text'], truncation=True, padding='max_length'))
    features['labels'] = example['label']
    return features


# Apply `encode` to every sample of every split.
tokenized_dataset = dataset.map(encode)
# Expose only the three model inputs, as torch tensors, when samples are read.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask' ,'labels'])

In [None]:
# `remove_columns` returns a new object instead of editing in place, so the
# result must be bound back; the original call discarded it and changed nothing.
# Filtering on the columns that still exist keeps the cell safe to re-run.
_raw_columns = ['claim_id', 'label', 'claim','date_published','explanation','fact_checkers','sources','subjects','main_text']
tokenized_dataset = tokenized_dataset.remove_columns(
    [name for name in _raw_columns if name in tokenized_dataset['train'].column_names]
)
tokenized_dataset

In [9]:
# Loaders for the three splits. `train_model` reads `data['train']` and
# `data['validation']`; the test loader is consumed by the evaluation cell.
# NOTE: the name `data` shadows `from torch.utils import data` from the imports
# cell; it is kept because `train_model` looks the loaders up under this name.
data = dict()
# Shuffle the training split so every epoch sees the samples in a new order;
# evaluation splits stay in dataset order.
data['train'] = torch.utils.data.DataLoader(tokenized_dataset['train'], batch_size=10, shuffle=True)
data['validation'] = torch.utils.data.DataLoader(tokenized_dataset['validation'], batch_size=10)
test_data = torch.utils.data.DataLoader(tokenized_dataset['test'], batch_size=10)

In [10]:
from transformers import RobertaForSequenceClassification
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Sequence classifier built from the `bert_version` checkpoint with a 4-way
# output (one per claim label), moved to `device`.
model = RobertaForSequenceClassification.from_pretrained(bert_version,num_labels=4).to(device)

## Model Training


*   The RoBERTa model was trained for 4 epochs, in batches of size 10.
*   Due to limited resources, not all epochs could be completed, hence the keyboard interrupt in the output.



In [11]:
def _mean_validation_loss(model):
    """Return the mean per-batch loss of `model` over `data['validation']`.

    Puts the model in eval mode and disables gradient tracking for the pass.
    """
    model.eval()
    total_loss, n_batches = 0.0, 0
    with torch.no_grad():
        for val_batch in data['validation']:
            val_batch = {k: v.to(device) for k, v in val_batch.items()}
            # The batch contains `labels`, so the first output is the loss.
            total_loss += model(**val_batch)[0].item()
            n_batches += 1
    return total_loss / max(n_batches, 1)


def train_model(model, optimizer, num_epochs=5, batch_size=4):
    """Fine-tune `model` on `data['train']` with gradient accumulation.

    Gradients are accumulated over several mini-batches before each optimizer
    step. The running train loss is plotted after every step, and the
    validation loss is plotted every 400 mini-batches.

    Args:
        model: model whose forward pass returns the loss as its first output
            when called with a batch that contains `labels`.
        optimizer: optimizer bound to `model`'s parameters.
        num_epochs: number of passes over `data['train']`.
        batch_size: only used to derive the number of accumulated mini-batches
            (`2 * batch_size` when below 10, else `batch_size`). The real
            mini-batch size is the one configured on the DataLoader.

    Returns:
        None. Progress is reported through the livelossplot widget.
    """
    liveloss = PlotLosses()
    curr_loss = {}
    # Number of mini-batches accumulated per optimizer step.
    accumulation_steps = batch_size * 2 if batch_size < 10 else batch_size
    validate_every = 400

    # Clear gradients left behind by a previous, interrupted run.
    optimizer.zero_grad()

    for epoch in range(num_epochs):
        running_loss = 0.0
        pending_batches = 0
        for step, batch in enumerate(data['train']):
            # Validation switches to eval mode, so re-enable train mode each step.
            model.train()
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch)[0]
            loss.backward()

            running_loss += loss.item()
            pending_batches += 1

            if pending_batches == accumulation_steps:
                optimizer.step()
                optimizer.zero_grad()
                # Each batch loss is already a mean, so average over the
                # number of accumulated batches only.
                curr_loss['train loss'] = running_loss / pending_batches
                liveloss.update(curr_loss)
                liveloss.send()
                running_loss = 0.0
                pending_batches = 0

            if step % validate_every == 0 and step > 0:
                curr_loss['validation loss'] = _mean_validation_loss(model)
                liveloss.update(curr_loss)
                liveloss.send()

        # Apply the gradients of a trailing, incomplete accumulation window;
        # skip the step when nothing is pending.
        if pending_batches:
            optimizer.step()
            optimizer.zero_grad()

In [12]:
optimizer = optim.AdamW(params=model.parameters(), lr=1e-5)

In [13]:
train_model(model, optimizer, num_epochs=3,batch_size=10)

## Testing the Model


*   The test dataset, comprising 1235 samples, was used to test the model.
*   The results are presented as a classification report and a confusion matrix below.



In [16]:
# Import the notebook progress bar explicitly: a plain `import tqdm` does not
# load the `tqdm.notebook` submodule and would rebind the `tqdm` name that the
# imports cell bound to the progress-bar class.
from tqdm.notebook import tqdm as tqdmn

model = model.eval()
num_classes = 4
# confusion[t][p] counts the samples with true class t predicted as class p.
confusion = torch.zeros(num_classes, num_classes)
y_true, y_pred = [], []
with torch.no_grad():
    for batch in tqdmn(test_data):
        batch = { k: v.to(device) for k, v in batch.items() }
        outputs = model(**batch)
        # `labels` is part of the batch, so outputs[0] is the loss and
        # outputs[1] holds the logits.
        true_values = batch['labels'].tolist()
        pred_values = torch.argmax(outputs[1], dim=1).tolist()
        # Keep plain Python ints rather than device tensors.
        y_true.extend(true_values)
        y_pred.extend(pred_values)
        for true, pred in zip(true_values, pred_values):
            confusion[true][pred] += 1
          

In [17]:
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np

y_pred = [int(p) for p in y_pred]
y_true = [int(t) for t in y_true]

# Row-normalise into a new tensor so the raw counts in `confusion` survive a
# re-run of this cell. The clamp avoids 0/0 = NaN for a class that has no
# samples in the test split.
row_totals = confusion.sum(dim=1, keepdim=True).clamp(min=1)
confusion_normalized = confusion / row_totals

print(metrics.classification_report(y_true, y_pred, digits=3))

labels = ['false','mixture','true','unproven']

fig, ax = plt.subplots(figsize=(10, 10))
ax.matshow(confusion_normalized.numpy())

ids = np.arange(len(labels))
ax.set_ylabel('True Labels', fontsize='x-large')
ax.set_xlabel('Pred Labels', fontsize='x-large')
ax.set_xticks(ids)
ax.set_xticklabels(labels)
ax.set_yticks(ids)
ax.set_yticklabels(labels)

fig.tight_layout()
plt.show()

In [None]:
torch.save(model,"RoBERT_healthFacts.pt")

# Part - B
# Using the embeddings of the model to train a K-Nearest Neighbors (KNN) classifier



*   In the previous approach, I observed that the majority of the test samples had more than 512 tokens (the maximum number of tokens the model can work with).
*   In the previous approach, the extra tokens were discarded, which meant a loss of crucial information.
*   So, to overcome that, this approach is used.
*   In this approach, the extra tokens are passed to the model in sections of 512 tokens each, to obtain the embeddings.
*   Once the embeddings were obtained, a large text had more than one embedding vector.
*   To overcome that, the average of all these vectors was taken to obtain a single embedding.
*   These vectors are then used to fit the K-Nearest Neighbors model and to predict the labels of the unseen (test) data.
*   A key point to note here is that the model used in this section has not been fine-tuned on the dataset.


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Rebinds `model`: from here on the name refers to a RobertaModel loaded from
# the `bert_version` checkpoint, not the classifier trained in Part A.
model = RobertaModel.from_pretrained(bert_version).to(device)

In [None]:
def modelOutputs(batch):
  """Return one embedding vector for the first sequence in `batch`.

  The hidden states of the last four layers are concatenated along the
  feature axis, and the vector at sequence position 0 of the first batch item
  is returned.

  Inputs - a mapping with `input_ids` and `attention_mask` tensors.
  Outputs - a 1-D tensor on `device`.
  """
  with torch.no_grad():
    outputs = model(
        input_ids=batch['input_ids'].to(device),
        attention_mask=batch['attention_mask'].to(device),
        output_hidden_states=True,
    )
    # Index 1 of the encoder output is the pooled output, not the per-layer
    # hidden states; read the hidden states by name, falling back to index 2
    # when the model returns a plain tuple.
    hidden_states = getattr(outputs, 'hidden_states', None)
    if hidden_states is None:
      hidden_states = outputs[2]
    last_four_layers = torch.cat(tuple(hidden_states[-4:]), dim=-1)
    # Keep only the vector at sequence position 0 of every batch item.
    first_position = last_four_layers[:, 0, :]
  return first_position[0]


In [None]:
encodings = tokenizer(
          dataset['train'][1]['main_text'], 
          truncation=True, 
          padding='max_length',
          stride = 50,
          return_overflowing_tokens=True,
          return_tensors = "pt"
          )

In [None]:
encodings

In [None]:
l = modelOutputs(encodings)

In [None]:
len(l[1])

In [None]:
hidden_states = l[1]

In [None]:
l[0]

In [None]:
pooled_output = torch.cat(tuple([hidden_states[i] for i in [-4, -3, -2, -1]]), dim=-1)

In [None]:
pooled_output[0]

In [None]:
pooled_output = pooled_output[:, 0, :]

In [None]:
pooled_output

In [None]:
torch.tensor([2]*100)

In [None]:
max_len_of_tokens_in_BERT = 510  # content tokens per window; start and end tokens bring it to 512
CHUNK_OVERLAP = 50  # content tokens shared by two consecutive windows
PAD_TOKEN_ID = 1  # id used to pad a short final window
ATTN_MASK = torch.tensor([[1]*512])
START_TOKEN = torch.tensor([0]) #start token for RoBERTa,
END_TOKEN = torch.tensor([2]) #end token for RoBERTa,

def getEncodedLargeSnetences(num_tokens,tokens, encd):
  """Embed a text longer than one model window and average the window embeddings.

  Inputs -  number of extra tokens, the extra tokens (1-D tensor of ids), and
            the tokenizer output for the first 512 tokens.
  Outputs - a single embedding vector: the mean of the embedding of the first
            window and of every window cut from `tokens`.
  Functionality - `tokens` is cut into windows of up to 510 ids, consecutive
            windows overlapping by 50 ids. Each window is wrapped in start/end
            tokens, padded to 512 and passed through `modelOutputs`.

  `encd` is only read, never modified.
  NOTE(review): assumes `tokens` holds content ids without special tokens - confirm.
  """
  window_len = max_len_of_tokens_in_BERT + 2
  # Never read past the ids that are actually available.
  num_tokens = min(int(num_tokens), int(tokens.shape[0]))

  first_window = {'input_ids': encd['input_ids'], 'attention_mask': encd['attention_mask']}
  total = modelOutputs(first_window)
  n_windows = 1  # the first 512-token window is already counted

  start = 0
  while start < num_tokens:
    end = min(start + max_len_of_tokens_in_BERT, num_tokens)
    content = tokens[start:end].to(START_TOKEN)  # match dtype/device of the special tokens
    content_len = int(content.shape[0])
    pad_len = max_len_of_tokens_in_BERT - content_len

    padding = torch.full((pad_len,), PAD_TOKEN_ID, dtype=START_TOKEN.dtype)
    input_ids = torch.cat((START_TOKEN, content, END_TOKEN, padding), 0).unsqueeze(0)
    # Attend to the start token, the content and the end token, but not to padding.
    attn_mask = (torch.arange(window_len) < content_len + 2).long().unsqueeze(0)

    window = {'input_ids': input_ids, 'attention_mask': attn_mask}
    # Out-of-place sum: `Tensor.add` returns a new tensor, so the result must be kept.
    total = total + modelOutputs(window)
    n_windows += 1

    if end >= num_tokens:
      break  # this window reached the end of the text
    start += (max_len_of_tokens_in_BERT - CHUNK_OVERLAP)  # keep 50 ids of overlap

  return torch.div(total, n_windows)  # average of all window embeddings

In [None]:
def getEmbeddings(dataSplit, encoded_array=None):
  """Convert every text of a dataset split into one embedding vector.

  Inputs -  the dataset type (train, test, validation) and, optionally,
            embeddings computed earlier that the new rows are appended to.
  Outputs - a 2-D tensor with one row per sample (earlier rows first), or
            `encoded_array` unchanged when the split yields no samples.
  """
  rows = []
  if encoded_array is not None:
    rows.append(torch.atleast_2d(encoded_array))

  for sample in dataset[dataSplit]:
    encodings = tokenizer(
          sample['main_text'],
          truncation=True,
          padding='max_length',
          stride = 50,
          return_overflowing_tokens=True,
          return_tensors = "pt"
          ) #tokenizer is applied

    # Decide "large text" by whether any overflow ids exist. Flattening first
    # means the batch axis added by return_tensors cannot make an empty
    # overflow look non-empty.
    overflow = None
    if 'overflowing_tokens' in encodings:
      overflow = torch.as_tensor(encodings['overflowing_tokens']).reshape(-1)

    if overflow is not None and overflow.numel() > 0:
      # Large text: average the embeddings of all of its windows.
      vector = getEncodedLargeSnetences(overflow.numel(), overflow, encodings)
    else:
      # Small text: a single window is enough.
      vector = modelOutputs(encodings)

    rows.append(torch.atleast_2d(vector))

  if not rows:
    return encoded_array
  # Concatenate once instead of re-stacking the whole array for every sample.
  return torch.cat(rows, dim=0)

In [None]:
#for obtaining training data
def getLabels(dataSplit,labelArr):
  """Append the class name of every sample in a split to `labelArr`.

  Inputs -  the dataset type (train, test, validation) and the list to extend.
  Outputs - the same list, extended in place with 'False', 'Mixture', 'True'
            or, for any other label value, 'Unproven'.
  """
  names_by_id = {0: 'False', 1: 'Mixture', 2: 'True'}
  labelArr.extend(
      names_by_id.get(sample['label'], 'Unproven') for sample in dataset[dataSplit]
  )
  return labelArr


def obtainSplitWiseEmbeddings(dataSplit,mergeTrainValidation=False):
  """Collect the embeddings and class names of one split.

  Inputs -  the dataset type (train, test, validation); with
            `mergeTrainValidation` set and `dataSplit == "train"`, the
            validation split is appended after the train split.
  Outputs - a two-element list: [embeddings, labels].
  """
  merge = dataSplit == "train" and mergeTrainValidation
  splits = ['train', 'validation'] if merge else [dataSplit]

  embeddings = None
  for split in splits:
    embeddings = getEmbeddings(split, embeddings)

  split_labels = []
  for split in splits:
    split_labels = getLabels(split, split_labels)

  return [embeddings, split_labels]

In [None]:
torch.save(model, '/kaggle/working/model.pt')

In [None]:
#saving the embeddings for train


trainEmbeddings, trainLabels = obtainSplitWiseEmbeddings('train',False)
#torch.save(trainEmbeddings, trainEncodeFileName)


In [None]:
len(trainEmbeddings)

In [None]:
#saving the embeddings for the test


testEmbeddings, testLabels = obtainSplitWiseEmbeddings('test',False)
#torch.save(testEmbeddings, testEncodeFileName)

In [None]:
trainEncodeFileName = '/kaggle/working/encoded.pt'
trainLabelFileName = '/kaggle/working/labels.pt'
testEncodeFileName = '/kaggle/working/encodedTest.pt'
testLabelFileName = '/kaggle/working/testLabels.pt'

In [None]:
torch.save(trainEmbeddings,trainEncodeFileName)
torch.save(testEmbeddings,testEncodeFileName)


#torch.save(torch.tensor(trainLabels),trainLabelFileName)
#torch.save(torch.tensor(testLabels),testLabelFileName)

In [None]:
import pickle
with open(trainLabelFileName, 'wb') as file:
      
    # A new file will be created
    pickle.dump(trainLabels,file)
    #pickle.dump(myvar, file)


In [None]:
with open(testLabelFileName, 'wb') as file:
      
    # A new file will be created
    pickle.dump(testLabels,file)
    #pickle.dump(myvar, file)

In [None]:
#utilitied to draw the scatter plot
from sklearn.manifold import TSNE
def getColor(val):
  """Map a class name to its plot colour; anything unrecognised maps to "black"."""
  colour_by_class = (("False", "red"), ("Mixture", "blue"), ("True", "green"))
  for class_name, colour in colour_by_class:
    if val == class_name:
      return colour
  return "black"


# t-SNE dimensionality reduction: project the embeddings down to 2 dimensions
# so they can be plotted.
# A fixed random_state makes the projection, and therefore the scatter plot,
# reproducible across runs.
newArr = TSNE(
    n_components=2, learning_rate='auto', init='random', random_state=42
).fit_transform(trainEmbeddings.cpu().numpy())
df = pd.DataFrame(newArr, columns=["x", "y"])
# One colour name per sample, derived from its class name.
df["val"] = pd.Series(trainLabels).map(getColor)

In [None]:
def scatterPlot(df):
  """Draw the 2-D scatter plot of the projected embeddings.

  Inputs - a DataFrame with the coordinates in columns `x` and `y` and a
           colour name per point in column `val`.

  Every point is drawn in the colour stored in `val`. The legend shows the
  class name for the four known colours and the raw colour name otherwise.
  """
  colour_to_class = {"red": "False", "blue": "Mixture", "green": "True", "black": "Unproven"}
  # Derive the legend groups from the frame itself instead of a global label list.
  classes = df['val'].map(lambda colour: colour_to_class.get(colour, colour)).rename('class')
  palette = {colour_to_class.get(colour, colour): colour for colour in df['val'].unique()}

  fig, ax = plt.subplots(figsize=(16,10))
  sns.scatterplot(
      x=df['x'], y=df['y'],
      hue=classes,
      palette=palette,
      legend="full",
      alpha=0.3,
      ax=ax,
  )
  ax.set(title="t-SNE projection of the embeddings", xlabel="x", ylabel="y")
  plt.show()

## Scatter plot



*   As we can see from the scatter plot below, the data points are spread across the whole plane.
*   Looking at this, we could form multiple clusters.
*   But since we have only 4 classes, we would need only four clusters.
*   This might result in improper classification for classes that are spread out or mixed together, like the false and mixture classes.
*   To overcome this, I use the KNN algorithm.



In [None]:
scatterPlot(df)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def getClassifier(k, embeddings=None, labels=None):
  """Fit a distance-weighted K-Nearest Neighbors classifier.

  Inputs -  k: number of neighbours.
            embeddings: training vectors (torch tensor or array-like);
                        defaults to the notebook-level `trainEmbeddings`.
            labels: class name per training vector; defaults to `trainLabels`.
  Outputs - the fitted classifier.
  """
  # The original body read `encoded_array` and `labelArr`, which are never
  # defined at notebook level; the training data lives in these two names.
  if embeddings is None:
    embeddings = trainEmbeddings
  if labels is None:
    labels = trainLabels

  features = embeddings.cpu().numpy() if torch.is_tensor(embeddings) else np.asarray(embeddings)
  neigh = KNeighborsClassifier(n_neighbors=k, weights='distance',algorithm='kd_tree' )
  neigh.fit(features, labels)
  return neigh

In [None]:
def getPreds(neigh, embeddings=None):
  """Predict the class of every embedding vector.

  Inputs -  neigh: a fitted classifier exposing `predict`.
            embeddings: vectors to classify (torch tensor or array-like), one
                        row per sample; defaults to the notebook-level
                        `testEmbeddings`.
  Outputs - a flat list with one prediction per row.
  """
  if embeddings is None:
    embeddings = testEmbeddings
  features = embeddings.cpu().numpy() if torch.is_tensor(embeddings) else np.asarray(embeddings)
  # A single batched call returns a flat list, where the per-row loop produced
  # a list of one-element arrays.
  return np.asarray(neigh.predict(features)).tolist()

In [None]:
#k=5
neig = getClassifier(5)
# The helper is defined as `getPreds`; `getpreds` does not exist.
predictions = getPreds(neig)
print(classification_report(testLabels, predictions, digits=3))


In [None]:
#k=10
neig = getClassifier(10)
# The helper is defined as `getPreds`; `getpreds` does not exist.
predictions = getPreds(neig)
print(classification_report(testLabels, predictions, digits=3))

# Improvements



*   As seen above, even without fine-tuning the model on the dataset, approach B had comparable results.
*   This can be further improved by fine-tuning the model on the dataset.
*   Furthermore, BERT-based models trained on medical datasets, like BioBERT, or on clinical data, like ClinicalBERT, can be used for this task.
*   Unfortunately, this could not be implemented here because of limited resource availability.

