In [1]:
# import tensorflow as tf
# # Getting GPU device name.
# device_name = tf.test.gpu_device_name()

# if device_name == '/device:GPU:0':
#     print('Found GPU at: {}'.format(device_name))
# else:
#     raise SystemError('GPU device not found')

In [2]:
import torch
# If a GPU is available
if torch.cuda.is_available():
    #set device to GPU
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If no GPU is available
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 4 GPU(s) available.
We will use the GPU: NVIDIA A16


In [3]:
#Importing necessary libraries
!pip install transformers

import re
import scipy
import pandas         as pd
import io
import numpy          as np
import copy
import seaborn        as sns

import transformers
from transformers                     import  RobertaModel, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
import torch



from sklearn.metrics                  import classification_report
from sklearn.feature_extraction.text  import TfidfVectorizer

from torch                            import nn, optim
from torch.utils                      import data
from sklearn.decomposition            import PCA

#Seeding for deterministic results
RANDOM_SEED = 64
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

if torch.cuda.is_available():
   torch.cuda.manual_seed(RANDOM_SEED)
   torch.cuda.manual_seed_all(RANDOM_SEED)
   torch.backends.cudnn.deterministic = True
   torch.backends.cudnn.benchmark = False


CLASS_NAMES = ['AGAINST','FAVOUR','NONE']
MAX_LENGTH = 200
BATCH_SIZE = 4
EPOCHS = 6
HIDDEN_UNITS = 128

tokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-large')  #Use roberta-large or roberta-base

[0m

In [4]:
#Converting labels to numbers
def label_to_int(label):
  if label   == 'AGAINST':
    return 0
  elif label == 'FAVOR':
    return 1
  elif label == 'NONE':
    return 2


#Pre-processing Twitter and Reddit Posts to handle URLs and Mentions.
#Replaces URLs with $URL$ and mentions with $MENTION$
def processText(text):
  text = re.sub(r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", "$URL$",text.strip())
  text = re.sub(r"(@[A-Za-z0-9]+)", "$MENTION$", text.strip())

  return text

In [5]:
'''Processing all of Twitter and Reddit data frames to
    1. Get rid of all NaN values
    2. Remove columns not useful for the Model
    3. Process text
    4. Return a combined frame consisting of both Twitter and Reddit data'''


def processStanceData(RedditDf):
    # Concatenating Reddit data (you can add more frames if needed)
    frames = [RedditDf]
    resultDf = pd.concat(frames)

    # Getting rid of NaN values
    resultDf = resultDf.replace(np.nan, '', regex=True)
    #print("resultDf##############",resultDf)
    # Converting labels to numbers
    resultDf['labelValue'] = resultDf['Stance'].apply(label_to_int)

    # Concatenating previousText and sourceText into previousPlusSrcText
   

   
    # Processing text fields if needed
    resultDf['Tweet'] = resultDf['Tweet'].apply(processText)
    resultDf['Target'] = resultDf['Target'].apply(processText)

    return resultDf


In [6]:
import pandas as pd

# Define the column names
column_names = ['ID', 'Target', 'Tweet', 'Stance', 'Opinion towards', 'Sentiment', 'labelValue']

# Read the text files with specified column names
redditTrainDf = pd.read_csv('semeval_train.txt', delimiter='\t', header=None, names=column_names)
redditDevDf = pd.read_csv('semeval_test.txt', delimiter='\t', header=None, names=column_names)
redditTestDf = pd.read_csv('semeval_test.txt', delimiter='\t', header=None, names=column_names)

# Remove any rows where 'ID' is not numeric (to handle repeated headers or invalid rows)
redditTrainDf = redditTrainDf[pd.to_numeric(redditTrainDf['ID'], errors='coerce').notnull()]
redditDevDf = redditDevDf[pd.to_numeric(redditDevDf['ID'], errors='coerce').notnull()]
redditTestDf = redditTestDf[pd.to_numeric(redditTestDf['ID'], errors='coerce').notnull()]

# Save the DataFrames to CSV files without the index
redditTrainDf.to_csv('semeval_train.csv', index=False)
redditDevDf.to_csv('semeval_dev.csv', index=False)
redditTestDf.to_csv('semeval_test.csv', index=False)

print("Files converted to CSV successfully.")


Files converted to CSV successfully.


In [7]:
#Reading Twitter and Reddit data (train, dev and test) onto dataFrames
#twitterTrainDf  = pd.read_csv(io.StringIO(uploaded['TwitterTrainDataSrc.csv'].decode('utf-8')))
redditTrainDf   = pd.read_csv('semeval_train.csv')

#twitterDevDf    = pd.read_csv(io.StringIO(uploaded['TwitterDevDataSrc.csv'].decode('utf-8')))
redditDevDf     = pd.read_csv('semeval_test.csv')

#twitterTestDf   = pd.read_csv(io.StringIO(uploaded['TwitterTestDataSrc.csv'].decode('utf-8')))
redditTestDf    = pd.read_csv('semeval_test.csv')

#Processing Twitter and Reddit dataframe containig training data
trainDf = processStanceData(redditTrainDf)
trainDf

Unnamed: 0,ID,Target,Tweet,Stance,Opinion towards,Sentiment,labelValue
0,1,Hillary Clinton,"$MENTION$ And, #HandOverTheServer she wiped cl...",AGAINST,TARGET,NEGATIVE,0
1,2,Hillary Clinton,Hillary is our best choice if we truly want to...,FAVOR,TARGET,POSITIVE,1
2,3,Hillary Clinton,$MENTION$ I think our country is ready for a f...,AGAINST,TARGET,NEGATIVE,0
3,4,Hillary Clinton,I just gave an unhealthy amount of my hard-ear...,AGAINST,TARGET,NEGATIVE,0
4,5,Hillary Clinton,$MENTION$ Thank you for adding me to your list...,NONE,NO ONE,POSITIVE,2
...,...,...,...,...,...,...,...
2909,2910,Legalization of Abortion,"There's a law protecting unborn eagles, but no...",AGAINST,TARGET,NEGATIVE,0
2910,2911,Legalization of Abortion,I am 1 in 3... I have had an abortion #Abortio...,AGAINST,OTHER,NEITHER,0
2911,2912,Legalization of Abortion,How dare you say my sexual preference is a cho...,AGAINST,OTHER,NEGATIVE,0
2912,2913,Legalization of Abortion,"Equal rights for those 'born that way', no rig...",AGAINST,OTHER,NEGATIVE,0


In [8]:
#Processing Twitter and Reddit dataframe containig development data
devDf = processStanceData(redditDevDf)
devDf

Unnamed: 0,ID,Target,Tweet,Stance,Opinion towards,Sentiment,labelValue
0,10675,Hillary Clinton,#mtp $MENTION$ How is deleting emails -part of...,AGAINST,OTHER,NEGATIVE,0
1,10676,Hillary Clinton,$MENTION$ $MENTION$ AndrewWhyDoYouCareAboutWha...,AGAINST,OTHER,NEGATIVE,0
2,10677,Hillary Clinton,The white male vote is solidly GOP. The black ...,AGAINST,OTHER,NEGATIVE,0
3,10678,Hillary Clinton,$MENTION$ big banker buds need to ratchet up t...,AGAINST,TARGET,NEGATIVE,0
4,10679,Hillary Clinton,$MENTION$ Why should I believe you on this? Th...,AGAINST,OTHER,NEGATIVE,0
...,...,...,...,...,...,...,...
1244,11245,Legalization of Abortion,$MENTION$ $MENTION$_six I followed him before ...,NONE,OTHER,NEGATIVE,2
1245,11246,Legalization of Abortion,"For he who avenges blood remembers, he does no...",AGAINST,TARGET,NEITHER,0
1246,11247,Legalization of Abortion,Life is sacred on all levels. Abortion does no...,AGAINST,TARGET,NEITHER,0
1247,11248,Legalization of Abortion,"$MENTION$ U refer to ""WE"" which =""YOU"" & a min...",AGAINST,TARGET,NEGATIVE,0


In [9]:
#Processing Twitter and Reddit dataframe containig test data
testDf = processStanceData(redditTestDf)
testDf

Unnamed: 0,ID,Target,Tweet,Stance,Opinion towards,Sentiment,labelValue
0,10675,Hillary Clinton,#mtp $MENTION$ How is deleting emails -part of...,AGAINST,OTHER,NEGATIVE,0
1,10676,Hillary Clinton,$MENTION$ $MENTION$ AndrewWhyDoYouCareAboutWha...,AGAINST,OTHER,NEGATIVE,0
2,10677,Hillary Clinton,The white male vote is solidly GOP. The black ...,AGAINST,OTHER,NEGATIVE,0
3,10678,Hillary Clinton,$MENTION$ big banker buds need to ratchet up t...,AGAINST,TARGET,NEGATIVE,0
4,10679,Hillary Clinton,$MENTION$ Why should I believe you on this? Th...,AGAINST,OTHER,NEGATIVE,0
...,...,...,...,...,...,...,...
1244,11245,Legalization of Abortion,$MENTION$ $MENTION$_six I followed him before ...,NONE,OTHER,NEGATIVE,2
1245,11246,Legalization of Abortion,"For he who avenges blood remembers, he does no...",AGAINST,TARGET,NEITHER,0
1246,11247,Legalization of Abortion,Life is sacred on all levels. Abortion does no...,AGAINST,TARGET,NEITHER,0
1247,11248,Legalization of Abortion,"$MENTION$ U refer to ""WE"" which =""YOU"" & a min...",AGAINST,TARGET,NEGATIVE,0


In [10]:
#Creates a dataset which will be used to feed to RoBERTa
class StanceDataset(data.Dataset):

  def __init__(self, firstSeq, secondSeq, TextSrcInre, labelValue,  tokenizer, max_len):
    self.firstSeq    = firstSeq      #First input sequence that will be supplied to RoBERTa
    self.secondSeq   = secondSeq     #Second input sequence that will be supplied to RoBERTa
    self.TextSrcInre = TextSrcInre   #Concatenation of reply+ previous+ src text to get features from 1 training example
    self.labelValue  = labelValue    #label value for each training example in the dataset
    self.tokenizer   = tokenizer     #tokenizer that will be used to tokenize input sequences (Uses BERT-tokenizer here)
    self.max_len     = max_len       #Maximum length of the tokens from the input sequence that BERT needs to attend to

  def __len__(self):
    return len(self.labelValue)

  def __getitem__(self, item):
    firstSeq    = str(self.firstSeq[item])
    secondSeq   = str(self.secondSeq[item])
    TextSrcInre = str(self.TextSrcInre[item])

    #Encoding the first and the second sequence to a form accepted by RoBERTa
    #RoBERTa does not use token_type_ids to distinguish the first sequence from the second sequnece.
    encoding = tokenizer.encode_plus(
        firstSeq,
        secondSeq,
        max_length = self.max_len,
        add_special_tokens= True,
        truncation = True,
        pad_to_max_length = True,
        return_attention_mask = True,
        return_tensors = 'pt'
    )

    return {
        'firstSeq' : firstSeq,
        'secondSeq' : secondSeq,
        'TextSrcInre': TextSrcInre,
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labelValue'  : torch.tensor(self.labelValue[item], dtype=torch.long)
    }


In [11]:
#Creates a data loader
def createDataLoader(dataframe, tokenizer, max_len, batch_size):
  ds = StanceDataset(
      firstSeq    = dataframe.Tweet.to_numpy(),
      secondSeq   = dataframe.Target.to_numpy(),
      TextSrcInre = dataframe.TextSrcInre.to_numpy(),
      labelValue  = dataframe.labelValue.to_numpy(),
      tokenizer   = tokenizer,
      max_len     = max_len
  )

  return data.DataLoader(
      ds,
      batch_size  = batch_size,
      shuffle     = True,
      num_workers = 4
  )


In [12]:
#Combining the reply, previous and source texts to get features for 1 training example
trainDf['TextSrcInre'] = trainDf['Tweet'].str.cat(trainDf['Target'],sep=" ")
devDf['TextSrcInre']   = devDf['Tweet'].str.cat(devDf['Target'],sep=" ")
testDf['TextSrcInre']  = testDf['Tweet'].str.cat(testDf['Target'],sep=" ")


#Creating data loader for training data
trainDataLoader        = createDataLoader(trainDf, tokenizer, MAX_LENGTH, BATCH_SIZE)

#Creating data loader for development data
developmentDataLoader  = createDataLoader(devDf, tokenizer, MAX_LENGTH, BATCH_SIZE)

#Creating data loader for test data
testDataLoader         = createDataLoader(testDf, tokenizer, MAX_LENGTH, BATCH_SIZE)

In [13]:
#Instantiating the tf-idf vectorizer object
tfidf = TfidfVectorizer(min_df = 10, max_df = 0.5, ngram_range=(1,2))

xtrain = trainDf['Target'].tolist()
x_train_feats = tfidf.fit(xtrain)
print(x_train_feats)
print(len(x_train_feats.get_feature_names_out()))


x_train_transform = x_train_feats.transform(xtrain)
tfidf_transform_tensor = torch.tensor(scipy.sparse.csr_matrix.todense(x_train_transform)).float()
print(x_train_transform.shape)


pca = PCA(n_components=21)
p = pca.fit(tfidf_transform_tensor)
#print(p.shape)
#print(p)
X = p.transform(tfidf_transform_tensor)
#torch.from_numpy(X.values)
X = torch.from_numpy(X)
#tfidf_transform_tensor_pca = torch.tensor(scipy.sparse.csr_matrix.todense(X)).float()
#print(X.type())
#print(X.shape)
#print(X)


TfidfVectorizer(max_df=0.5, min_df=10, ngram_range=(1, 2))
21
(2914, 21)


In [14]:
#This class defines the model that was used to pre-train a SNN on TF-IDF features
class Tfidf_Nn(nn.Module):
    def __init__(self):
        super().__init__()

        # Inputs to hidden layer linear transformation
        self.hidden  = nn.Linear(len(tfidf.get_feature_names_out()), HIDDEN_UNITS)
        # Output layer
        self.output  =  nn.Linear(HIDDEN_UNITS, 3)
        self.dropout = nn.Dropout(0.1)

        # Defining tanh activation and softmax output
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # Pass the input tensor through each of our operations
        x = self.hidden(x)
        #print(x.shape)
        y = self.tanh(x)
        #print(y.shape)
        z = self.dropout(y)
        #print(z.shape)
        z = self.output(z)
        #print(z.shape)
        z = self.softmax(z)

        #Returning the ouputs from the hidden layer and the final output layer
        return  y, z


In [15]:
#Loading the already trained MLP model that was trained on TF-IDF features.

# from google.colab import drive
# drive.mount('/content/gdrive')
snnmodel = Tfidf_Nn()

# model_save_name = 'pre-trainedTfidf.pt'
# path = F"/content/gdrive/My Drive/{model_save_name}"

# snnmodel.load_state_dict(torch.load(path))
# snnmodel.eval()

In [16]:
'''This class defines the model that will be used for
training and testing on the dataset.

Adapted from huggingFace
This RoBERTa model from huggingface outputs the last hidden states
and the pooled output by default. Pooled output is the classification
token (1st token of the last hidden state) further processed by a Linear
layer and a Tanh activation function.

The pre-trained RoBERTa model is used as the primary model.
This class experiments with RoBERTa and its ensemble with TF-IDF features.
roberta-only :            No ensembling. This just fine-tunes the RoBERTa model.
                          The pooled output is passed through a linear layer and
                          softmax function is finally used for preictions.

roberta-tfIdf :           This model conatenates the 1st token of last-hidden layer
                          from RoBERTa with TF-IDF features. Various ways of this
                          concatenation was experimented (using pooled output instead
                          of 1st token of last hidden layer etc)

roberta-pcaTfidf :        This model concatenates the pooled output from
                          RoBERTa with the PCA transformed vector.

roberta-preTrainedTfIdf : This model concatenates the pooled output from
                          RoBERTa with the hidden layer output from a pre-trained
                          SNN that was trained on TF-IDF features.

Used dropout to prevent over-fitting.'''

class StanceClassifier(nn.Module):

  def __init__(self,  n_classes):
    super(StanceClassifier, self).__init__()
    self.robertaModel              = RobertaModel.from_pretrained('roberta-large')    #use roberta-large or roberta-base
    self.model_TFIDF               = snnmodel                                        #Pre-trained SNN trained with TF-IDF features

    self.drop                      = nn.Dropout(p = 0.3)

    self.output                    = nn.Linear(self.robertaModel.config.hidden_size, n_classes)

    self.input_size_tfidf_only     = self.robertaModel.config.hidden_size + len(tfidf.get_feature_names_out())
    self.input_size_tfidf_pca      = self.robertaModel.config.hidden_size + HIDDEN_UNITS

    self.dense                     = nn.Linear( self.input_size_tfidf_only,  self.input_size_tfidf_only)
    self.out_proj                  = nn.Linear( self.input_size_tfidf_only, n_classes)
    self.out_pca                   = nn.Linear( self.input_size_tfidf_pca, n_classes)

    self.input_size_preTrain_tfidf = self.robertaModel.config.hidden_size +  HIDDEN_UNITS
    self.out                       = nn.Linear(self.input_size_preTrain_tfidf, n_classes)

    self.softmax                   = nn.Softmax(dim = 1)

  def forward(self, input_ids, attention_mask, inputs_tfidf_feats, pca_transformed_feats, modelType):

    roberta_output     = self.robertaModel(
        input_ids      = input_ids,               #Input sequence tokens
        attention_mask = attention_mask )         #Mask to avoid performing attention on padding tokens
    #print(roberta_output[1].shape)

    if modelType   == 'roberta-only':
      pooled_output = roberta_output[1]           #Using pooled output
      output        = self.drop(pooled_output)
      output        = self.output(output)

    elif modelType == 'roberta-tfIdf':
      soutput = roberta_output[1]#---------        experimenting with pooled output
      #soutput = roberta_output[0][:, 0, :]        #taking <s> token (equivalent to [CLS] token in BERT)
      x       = torch.cat((soutput, inputs_tfidf_feats) , dim=1)
      x       = self.drop(x)
      output  = self.out_proj(x)

    elif modelType == 'roberta-pcaTfidf':
      soutput = roberta_output[1]
      x       = torch.cat((soutput, pca_transformed_feats) , dim=1)
      x       = self.drop(x)
      output  = self.out_pca(x)

    elif modelType == 'roberta-TrainedTfIdf':
      tfidf_hidddenLayer, tfidf_output = self.model_TFIDF(inputs_tfidf_feats)
      #print(tfidf_hidddenLayer.shape)
      #print(tfidf_output.shape)

      #Conactenating pooled output from RoBERTa with the hidden layer from the pre-trained SNN using TF-IDF features.
      #pooled_output = torch.cat((roberta_output[1], tfidf_output) , dim=1)-------- Experimenting with Output of pre-trained SNN
      pooled_output = torch.cat((roberta_output[1], tfidf_hidddenLayer) , dim=1)
      output        = self.drop(pooled_output)
      output        = self.out(output)

    return self.softmax(output)



In [17]:
'''from google.colab import drive
drive.mount('/content/gdrive')
snnmodel = Tfidf_Nn()

model_save_name = 'pre-trainedTfidf.pt'
path = F"/content/gdrive/My Drive/{model_save_name}"

snnmodel.load_state_dict(torch.load(path))
snnmodel.eval()
model = StanceClassifier(len(CLASS_NAMES))

#Loading fine-trained RoBERTa model on the same dataset
model_save_name = 'RoBERTaLarge_TFIDFV2.pt'
path = F"/content/gdrive/My Drive/{model_save_name}"
model.load_state_dict(torch.load(path))
model.eval()
model = model.to(device)


# = StanceClassifier(len(CLASS_NAMES))
#model = model.to(device)
print(model)

print(snnmodel)'''



'from google.colab import drive\ndrive.mount(\'/content/gdrive\')\nsnnmodel = Tfidf_Nn()\n\nmodel_save_name = \'pre-trainedTfidf.pt\'\npath = F"/content/gdrive/My Drive/{model_save_name}"\n\nsnnmodel.load_state_dict(torch.load(path))\nsnnmodel.eval()\nmodel = StanceClassifier(len(CLASS_NAMES))\n\n#Loading fine-trained RoBERTa model on the same dataset\nmodel_save_name = \'RoBERTaLarge_TFIDFV2.pt\'\npath = F"/content/gdrive/My Drive/{model_save_name}"\nmodel.load_state_dict(torch.load(path))\nmodel.eval()\nmodel = model.to(device)\n\n\n# = StanceClassifier(len(CLASS_NAMES))\n#model = model.to(device)\nprint(model)\n\nprint(snnmodel)'

In [18]:
#Instantiating a StanceClassifier object as our model and loading the model onto the GPU.
model = StanceClassifier(len(CLASS_NAMES))
model = model.to(device)
#print(model)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
'''Using the same optimiser as used in BERT paper
with a different learning rate'''
optimizer = AdamW(model.parameters(),
                  lr = 2e-6,
                  correct_bias= False)

totalSteps = len(trainDataLoader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps = totalSteps
)

'''Using class-weights to accomodate heavily imbalanced data.
These weights were learnt by running several experiments using
other weights and the weights that produced the best results have
finally been used here'''

weights      = [8.0, 84.0, 8.0, 1.0]
classWeights = torch.FloatTensor(weights)
lossFunction = nn.CrossEntropyLoss(weight = classWeights).to(device)




In [20]:
#This function is used for training the model.
def train_epoch(
  model,
  dataLoader,
  lossFunction,
  optimizer,
  device,
  scheduler,
  n_examples
):

  model = model.train()
  losses = []
  correctPredictions = 0

  for d in dataLoader:

    input_ids              = d["input_ids"].to(device)                           #Loading input ids to GPU
    attention_mask         = d["attention_mask"].to(device)                      #Loading attention mask to GPU
    labelValues            = d["labelValue"].to(device)                          #Loading label value to GPU
    textSrcInre            = d["TextSrcInre"]
    tfidf_transform        = x_train_feats.transform(textSrcInre)
    tfidf_transform_tensor = torch.tensor(scipy.sparse.csr_matrix.todense(tfidf_transform)).float()
    pca_tensor             = p.transform(tfidf_transform_tensor)

    pca_tensor = torch.from_numpy(pca_tensor).float()
    pca_tensor = pca_tensor.to(device)
    tfidf_transform_tensor = tfidf_transform_tensor.to(device)

    #Getting the output from our model (Object of StanceClassification class) for train data
    outputs = model(
      input_ids             = input_ids,
      attention_mask        = attention_mask,
      inputs_tfidf_feats    = tfidf_transform_tensor,
      pca_transformed_feats = pca_tensor,
      modelType             = 'roberta-TrainedTfIdf'
    )

    #Determining the model predictions
    _, predictionIndices = torch.max(outputs, dim=1)
    loss = lossFunction(outputs, labelValues)

    #Calculating the correct predictions for accuracy
    correctPredictions += torch.sum(predictionIndices == labelValues)
    losses.append(loss.item())
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return np.mean(losses), correctPredictions.double() / n_examples


In [21]:
#This function is used for evaluating the model on the development and test set
def eval_model(
    model,
    dataLoader,
    lossFunction,
    device,
    n_examples
    ):

  model = model.eval()
  losses = []
  correctPredictions = 0

  with torch.no_grad():
    for d in dataLoader:

      input_ids              = d["input_ids"].to(device)                          #Loading input ids to GPU
      attention_mask         = d["attention_mask"].to(device)                     #Loading attention mask to GPU
      labelValues            = d["labelValue"].to(device)                         #Loading label values to GPU
      textSrcInre            = d["TextSrcInre"]
      tfidf_transform        = x_train_feats.transform(textSrcInre)
      tfidf_transform_tensor = torch.tensor(scipy.sparse.csr_matrix.todense(tfidf_transform)).float()

      pca_tensor             = p.transform(tfidf_transform_tensor)

      pca_tensor = torch.from_numpy(pca_tensor).float()
      pca_tensor = pca_tensor.to(device)
      tfidf_transform_tensor = tfidf_transform_tensor.to(device)

      #Getting the softmax output from model for dev data
      outputs = model(
        input_ids             = input_ids,
        attention_mask        = attention_mask,
        inputs_tfidf_feats    = tfidf_transform_tensor,
        pca_transformed_feats = pca_tensor,
        modelType             = 'roberta-TrainedTfIdf'
      )

      #Determining the model predictions
      _, predictionIndices = torch.max(outputs, dim=1)
      loss = lossFunction(outputs, labelValues)

      #Calculating the correct predictions for accuracy
      correctPredictions += torch.sum(predictionIndices == labelValues)
      losses.append(loss.item())

  return np.mean(losses), correctPredictions.double() / n_examples


In [22]:
#fine tuning ROBERTa and validating it

for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}')
  trainLoss, trainAccuracy = train_epoch(
    model,
    trainDataLoader,
    lossFunction,
    optimizer,
    device,
    scheduler,
    len(trainDf)
  )

  print(f'Training loss {trainLoss} Training accuracy {trainAccuracy}')

  devLoss, devAccuracy = eval_model(
    model,
    developmentDataLoader,
    lossFunction,
    device,
    len(devDf)
  )

  print(f'Development loss {devLoss} Development accuracy {devAccuracy}')
  print()

  print()


Epoch 1




Training loss 1.187934927407279 Training accuracy 0.3291008922443377




Development loss 1.121121697151623 Development accuracy 0.5796637309847877


Epoch 2




Training loss 1.1147103273655978 Training accuracy 0.5343170899107755




Development loss 1.0310820645798509 Development accuracy 0.5524419535628502


Epoch 3




Training loss 1.0184522524619135 Training accuracy 0.6156485929993136




Development loss 1.002348232193115 Development accuracy 0.6965572457966372


Epoch 4




Training loss 0.9691773465794954 Training accuracy 0.7223747426218257




Development loss 0.9903921208823451 Development accuracy 0.7029623698959167


Epoch 5




Training loss 0.9231210699297273 Training accuracy 0.7786547700754975




Development loss 0.9690938772865758 Development accuracy 0.7277822257806245


Epoch 6




Training loss 0.9159954028365053 Training accuracy 0.7961564859299931




Development loss 0.9721457230778167 Development accuracy 0.7245796637309847




In [23]:
#This function gets the predictions from the model after it is trained.
def get_predictions(model, data_loader):

  model = model.eval()
  review_texta = []
  review_textb = []
  predictions = []
  prediction_probs = []
  real_values = []

  with torch.no_grad():
    for d in data_loader:

      textas                 = d["firstSeq"]
      textbs                 = d["secondSeq"]
      input_ids              = d["input_ids"].to(device)
      attention_mask         = d["attention_mask"].to(device)
      labels                 = d["labelValue"].to(device)
      textSrcInre            = d["TextSrcInre"]
      tfidf_transform        = tfidf.transform(textSrcInre)
      tfidf_transform_tensor = torch.tensor(scipy.sparse.csr_matrix.todense(tfidf_transform)).float()

      pca_tensor             =  p.transform(tfidf_transform_tensor)

      pca_tensor = torch.from_numpy(pca_tensor).float()
      pca_tensor = pca_tensor.to(device)
      tfidf_transform_tensor = tfidf_transform_tensor.to(device)

      #Getting the softmax output from model
      outputs = model(
        input_ids             = input_ids,
        attention_mask        = attention_mask,
        inputs_tfidf_feats    = tfidf_transform_tensor,
        pca_transformed_feats = pca_tensor,
        modelType             = 'roberta-TrainedTfIdf'
      )

      _, preds = torch.max(outputs, dim=1)     #Determining the model predictions

      review_texta.extend(textas)
      review_textb.extend(textbs)
      predictions.extend(preds)
      prediction_probs.extend(outputs)
      real_values.extend(labels)

  predictions = torch.stack(predictions).cpu()
  prediction_probs = torch.stack(prediction_probs).cpu()
  real_values = torch.stack(real_values).cpu()

  return review_texta, review_textb, predictions, prediction_probs, real_values

In [24]:
#Getting model predictions on dev dataset
firstSeq_dev, secondSeq_dev, yHat_dev, predProbs_dev, yTest_dev = get_predictions(
  model,
  developmentDataLoader
)



In [26]:
  #Printing classification report for dev dataset (Evaluating the model on Dev set)
CLASS_NAMES = ['AGAINST','FAVOUR','NONE']
print(classification_report(yTest_dev, yHat_dev, target_names= CLASS_NAMES))

              precision    recall  f1-score   support

     AGAINST       0.88      0.69      0.77       715
      FAVOUR       0.59      0.84      0.69       304
        NONE       0.62      0.70      0.65       230

    accuracy                           0.72      1249
   macro avg       0.70      0.74      0.71      1249
weighted avg       0.76      0.72      0.73      1249



In [None]:
# #Saving the model onto the drive
# from google.colab import drive
# drive.mount('/content/gdrive')

# model_save_name = 'RoBERTaLarge_TFIDFV2.pt'
# path = F"/content/gdrive/My Drive/{model_save_name}"
# torch.save(model.state_dict(), path)

In [27]:
#Getting model predictions on test dataset
firstSeq_test, secondSeq_test, yHat_test, predProbs_test, yTest_test = get_predictions(
  model,
  testDataLoader
)



In [28]:
#Printing classification report for test dataset (Evaluating the model on test set)
print(classification_report(yTest_test, yHat_test, target_names= CLASS_NAMES))

              precision    recall  f1-score   support

     AGAINST       0.88      0.69      0.77       715
      FAVOUR       0.59      0.84      0.69       304
        NONE       0.62      0.70      0.65       230

    accuracy                           0.72      1249
   macro avg       0.70      0.74      0.71      1249
weighted avg       0.76      0.72      0.73      1249



In [None]:
#Saving the predictions onto a CSV file for error analysis
zippedList =  list(zip(firstSeq_test, secondSeq_test, yHat_test, predProbs_test, yTest_test ))
dfObj = pd.DataFrame(zippedList, columns = ['Texta' , 'Textb', 'Ypred', 'YpredsProbs', 'label'])

from google.colab import drive
drive.mount('drive')

dfObj.to_csv('dataPredsFromRoberta_TFIDFV2.csv')
!cp dataPredsFromRoberta_TFIDFV2.csv "drive/My Drive/"