In [1]:
import os
import torch
import logging
logging.basicConfig(level=logging.ERROR)
# Choose the compute device: a CUDA GPU when PyTorch can see one, else the CPU.
if not torch.cuda.is_available():
    # No usable GPU -- fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    # Report how many GPUs are visible and the name of GPU 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 4090


In [2]:
from transformers import AutoTokenizer,AutoModel

# Load the tokenizer published with the 'sileod/deberta-v3-base-tasksource-nli'
# checkpoint. The log line below still says "BERT", but the checkpoint name
# identifies a DeBERTa-v3 model.
print('Loading BERT tokenizer...')
tokenizer = AutoTokenizer.from_pretrained('sileod/deberta-v3-base-tasksource-nli', do_lower_case=True)

Loading BERT tokenizer...




In [3]:
import json
# Read the training and validation splits (BM25 top-100 retrieval output,
# judging by the file names). The records contain non-ASCII text, so decode
# the files explicitly as UTF-8 instead of relying on the platform's default
# encoding.
with open("NumTemp-E9C0/output/bm25_top_100_train", encoding="utf-8") as f:
    train_data = json.load(f)
with open('NumTemp-E9C0/output/bm25_top_100_val', encoding="utf-8") as f:
    val_data = json.load(f)
len(train_data), len(val_data)

(9935, 3084)

In [4]:
# Show the last training record to see which fields each example carries.
train_data[-1]

{'country_of_origin': 'usa',
 'label': 'Conflicting',
 'url': 'https://www.politifact.com/factchecks/2016/may/03/alan-grayson/grayson-right-about-lack-paid-vacation-us-misses-d/',
 'lang': 'en',
 'claim': '"There\'s only three countries in the entire world wherethere\'s no paid vacation by law, we happen to be one of them."',
 'doc': 'American workers aren’t getting the compensation they deserve, in either time or money, U.S. Rep. Alan Grayson said at a recent Orlando debate. At a Senate campaign debate with U.S. Rep. David Jolly, R-Indian Shores, on April 25, 2016, Grayson said he supported a higher minimum wage. Providing more pay won’t hurt the businesses, he said, because other nations have higher wages with no ill effects. Furthermore, many countries also guarantee paid time off and their economies are fine, Grayson said. But not the United States. "There\'s only three countries in the entire world wherethere\'s no paid vacation by law, we happen to be one of them. The other two a

In [5]:
# Show the last validation record for comparison with the training schema.
val_data[-1]

{'crawled_date': '2022-10-07T00:00:06',
 'country_of_origin': 'usa',
 'label': 'False',
 'url': 'https://www.snopes.com/fact-check/never-forget-january-6th/',
 'lang': 'en',
 'claim': 'Social media posts with pictures of gas prices and captions urging people to “Never forget January 6th” accurately documented the cost of fuel on the day of the 2021 U.S. Capitol riot.',
 'doc': 'The gas prices pictured in two different shared pictures did not reflect the national average price for regular unleaded gasoline on Jan. 6, 2021. Further, reverse image searches found that both photographs were taken while former U.S. President Barack Obama was in his second term in the White House. On June 12, 2022, the Cullman Daily Facebook page, which is managed by a self-described “conservative and Christian news organization,” posted a picture of an Exxon gas station sign showing the price for regular unleaded fuel at $1.73 per gallon, with the caption, “We will never forget January 6th, 2021.” At the tim

In [6]:
from sklearn.preprocessing import LabelEncoder
# Encoder that maps the string labels to integer class ids.
LE = LabelEncoder()
# Number of retrieved evidence snippets (`top_n` entries) kept per claim when
# the feature strings are built.
k = 100

In [7]:
def get_features(data, top_k=None):
    """Build one flat text input per fact from its claim and retrieved evidence.

    Each returned string has the form
    ``"[Claim]:<claim>[Evidence]:<e1>[Evidence]:<e2>..."``.

    Args:
        data: Iterable of dicts, each with a ``"claim"`` string and a
            ``"top_n"`` sequence of evidence strings.
        top_k: Maximum number of evidence snippets to keep per fact. When
            ``None`` (the default) the notebook-level ``k`` is used, which is
            the original behaviour.

    Returns:
        A list of feature strings, one per entry of ``data``, in input order.

    Raises:
        KeyError: If a fact lacks the ``"claim"`` or ``"top_n"`` key.
    """
    # Resolve the limit once; the global `k` is only read when no explicit
    # limit is supplied.
    limit = k if top_k is None else top_k

    features = []
    for fact in data:
        claim = fact["claim"]
        evidence_text = ''.join(f'[Evidence]:{e}' for e in fact['top_n'][:limit])
        features.append("[Claim]:" + claim + evidence_text)
    return features

In [8]:
# Run the same featurizer over both splits, training split first.
train_features, val_features = map(get_features, (train_data, val_data))
len(train_features), len(val_features)

(9935, 3084)

In [9]:
# Show the feature string built for the last training record.
train_features[-1]

'[Claim]:"There\'s only three countries in the entire world wherethere\'s no paid vacation by law, we happen to be one of them."[Evidence]:18 ago 2023  ... days of paid vacation in all eu countries. several countries offer even ... while swedes are guaranteed 25 paid vacation days by law, her ...[Evidence]:jun 22, 2023  if the plan becomes law, the united states will no longer be one of six countries in the world  and the only rich country  without any form ...[Evidence]:22 juin 2023  if the plan becomes law, the united states will no longer be one of six countries in the world  and the only rich country  without any form of ...[Evidence]:de r ray  2013  cit 105 fois  vacation and holiday laws. figure 1 summarizes the legal right to paid vacation for 21 of the richest countries in the world (see also table 1).24 pages[Evidence]:24 dc. 2019  for the next three decades, the two weeks of paid vacation time ... without any benefits, sought to pass a paid vacation law in the state.[Evidence

In [10]:
# Show the feature string built for the last validation record.
val_features[-1]

'[Claim]:Social media posts with pictures of gas prices and captions urging people to “Never forget January 6th” accurately documented the cost of fuel on the day of the 2021 U.S. Capitol riot.[Evidence]:2022-06-17  ... cost of fuel on the day of the 2021 u.s. capitol riot. conte. ... average price for regular unleaded gasoline on jan. 6, 2021. further, reverse ...[Evidence]:click on your local gas station on the fuel prices map to check out reviews and other information about each location. the average cost of us gas prices might[Evidence]:we try to make sure people have the cheapest price of fuel on the water $500 speedway fuel cards for $1.95 is a scam  yes, u.s. gas prices[Evidence]:6 oct. 2021  if you\'re using social media pictures as evidence, you\'ll need to verify the date they were taken. learn how to accurately date social media ...[Evidence]:6 ott 2021  if you\'re using social media pictures as evidence, you\'ll need to verify the date they were taken. learn how to accurate

In [11]:
# Collect the string label of every record in each split.
train_labels = [record["label"] for record in train_data]
val_labels = [record["label"] for record in val_data]

# Learn the label -> integer mapping from the training split only, then apply
# that same mapping to both splits.
LE.fit(train_labels)
train_labels_final = LE.transform(train_labels)
val_labels_final = LE.transform(val_labels)
train_labels_final[:20], val_labels_final[:20]

(array([1, 0, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 0, 0, 1, 2]),
 array([1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1]))

In [12]:
# Tokenize every training feature string in one batched call. For each text
# the tokenizer will:
#   (1) split it into tokens and map them to vocabulary IDs,
#   (2) add the model's special tokens,
#   (3) pad or truncate to exactly `max_length` tokens,
#   (4) build the attention mask that separates real tokens from padding.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
train_encoding = tokenizer(
    train_features,                # List of texts to encode.
    add_special_tokens=True,       # Add the model's special tokens.
    max_length=256,                # Pad & truncate all texts to this length.
    padding='max_length',
    truncation=True,
    return_attention_mask=True,    # Construct attention masks.
    return_tensors='pt',           # Return PyTorch tensors.
)

# Both tensors have one row per training example and `max_length` columns.
input_ids = train_encoding['input_ids']
attention_masks = train_encoding['attention_mask']

# Print sentence 0, now as a list of IDs.
print('Original: ', train_features[0])
print('Token IDs:', input_ids[0])



Original:  [Claim]:Says his political opponents "brought 100,000 protesters into our state."[Evidence]:jul 30, 2015  ... says that isn't a lie: says his political opponents "brought 100,000 protesters into our state." http://www.politifact.com/wisconsin ...[Evidence]:in a national tv interview, gov. scott walker said his political opponents "brought 100000 protesters into our state" during the 2011 demonstrations in ...[Evidence]:19 janv. 2021  a total loser! a loser who sadly plays right into our opponents hands! never a fan,.[Evidence]:31 ago 2023  ... up his political enemies if he is president again. in an ... his business affairs, trump said democrats and other opponents were sick people[Evidence]:jan 28, 2016  a loser who sadly plays right into our opponents hands! never a fan  a total lightweight  opposes me and some of our great republican ...[Evidence]:percent of the vote to clinch the gop nomination. polls had showed mccloskey was running far behind his opponents headed into 

In [13]:
# Tokenize every validation feature string in one batched call. For each text
# the tokenizer will:
#   (1) split it into tokens and map them to vocabulary IDs,
#   (2) add the model's special tokens,
#   (3) pad or truncate to exactly `max_length` tokens,
#   (4) build the attention mask that separates real tokens from padding.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
val_encoding = tokenizer(
    val_features,                  # List of texts to encode.
    add_special_tokens=True,       # Add the model's special tokens.
    max_length=256,                # Pad & truncate all texts to this length.
    padding='max_length',
    truncation=True,
    return_attention_mask=True,    # Construct attention masks.
    return_tensors='pt',           # Return PyTorch tensors.
)

# Both tensors have one row per validation example and `max_length` columns.
val_input_ids = val_encoding['input_ids']
val_attention_masks = val_encoding['attention_mask']

# Print sentence 0, now as a list of IDs. The original printed the attention
# mask under the 'Token IDs:' heading; show the actual token IDs instead.
print('Original: ', val_features[0])
print('Token IDs:', val_input_ids[0])

Original:  [Claim]:Amit Shah said Narendra Modi sleeps for 24 hours for the welfare of the poor.[Evidence]:2022-03-21  'pm modi sleeps for only two hours and works for 22 hours every day,' bjp chief chandrakant patil said.[Evidence]:2017-02-11  obama sleeps nearly five hours a night. narendra modi. pti. 2/2. narendra modi. prime minister narendra modi is said to work 20 hours a day.[Evidence]:mar 23, 2022  he sleeps for 3.5 to 4 hours every day. he sleeps late at around 12 o clock and gets up early around 4 o clock. yet, he is so fresh and energetic throughout the ...[Evidence]:mar 21, 2022  pm modi sleeps for only two hours and works for 22 hours every day. he is experimenting now so that he need not have to sleep," patil ...[Evidence]:sep 26, 2022  brett favre's most memorable stat may be $8 million meant for the poor. his role in the misappropriation of welfare money has infuriated ...[Evidence]:prime minister narendra modi sleeps for only two hours every day and is doing an experim

In [14]:
# Convert the encoded labels to int64 tensors, the dtype `nn.CrossEntropyLoss`
# expects for class-index targets. `torch.as_tensor` also accepts an existing
# tensor, so re-running this cell does not raise the copy-construct warning
# that `torch.tensor(tensor)` emits.
train_labels_final = torch.as_tensor(train_labels_final, dtype=torch.long)
val_labels_final = torch.as_tensor(val_labels_final, dtype=torch.long)

In [15]:
# Sanity check: the number of validation labels should equal the number of
# encoded validation inputs.
val_labels_final.shape, len(val_input_ids)

(torch.Size([3084]), 3084)

In [16]:
# Count the distinct label strings present in the training split.
num_classes = len(set(train_labels))
list(set(train_labels)), num_classes

(['Conflicting', 'False', 'True'], 3)

In [17]:
from torch.utils.data import TensorDataset, random_split
# NOTE(review): the two commented-out tensors below reference names
# (`poincare_embeddings_final`, `difficulty_level_vectors`) that are not defined
# in the visible part of this notebook -- presumably leftovers from another
# experiment; confirm before re-enabling.
# train_poincare_tensor = torch.tensor(poincare_embeddings_final,dtype=torch.float)
# difficulty_tensor = torch.tensor(difficulty_level_vectors,dtype=torch.float)
# Bundle token ids, attention masks and labels per split so that every dataset
# item is an (input_ids, attention_mask, label) triple.
dataset = TensorDataset(input_ids, attention_masks, train_labels_final)
val_dataset = TensorDataset(val_input_ids, val_attention_masks,val_labels_final)

In [18]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# Number of examples per batch, shared by both loaders.
batch_size = 16

# Training batches are drawn in random order.
train_dataloader = DataLoader(dataset, batch_size=batch_size, sampler=RandomSampler(dataset))

# Validation batches are read in their stored order.
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, sampler=SequentialSampler(val_dataset))

In [19]:
from torch import nn
from transformers import DebertaV2ForSequenceClassification
class MultiClassClassifier(nn.Module):
    """Wrapper around ``DebertaV2ForSequenceClassification`` that returns raw logits.

    The loss is computed by the caller from the returned logits, so the wrapped
    model is never given labels.

    Args:
        bert_model_path: Checkpoint name or path passed to ``from_pretrained``.
        labels_count: Number of output classes (``num_labels``).
        hidden_dim, mlp_dim, extras_dim, dropout: Accepted for backward
            compatibility with existing callers; not used by this class.
        freeze_bert: When True, the DeBERTa encoder weights are frozen and only
            the remaining sequence-classification layers stay trainable.
    """

    def __init__(self, bert_model_path, labels_count, hidden_dim=768, mlp_dim=500, extras_dim=100, dropout=0.1, freeze_bert=False):
        super().__init__()

        # Each example has exactly one class, so configure the head as
        # single-label. This setting only affects the loss the Hugging Face
        # model computes when it is given labels; the logits are unaffected.
        self.deberta = DebertaV2ForSequenceClassification.from_pretrained(
            bert_model_path,
            num_labels=labels_count,
            problem_type="single_label_classification",
        )
        if freeze_bert:
            print("Freezing layers")
            # The original looped over `self.roberta`, an attribute that does
            # not exist on this class. Freeze the inner encoder instead, which
            # keeps the classification layers trainable.
            for param in self.deberta.deberta.parameters():
                param.requires_grad = False

    def forward(self, tokens, masks):
        """Return the classification logits for a batch.

        Args:
            tokens: Batch of token ids.
            masks: Matching attention masks.
        """
        output = self.deberta(tokens, attention_mask=masks)
        return output['logits']

In [20]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Build the classifier on top of the pretrained DeBERTa-v3 NLI checkpoint.
# The positional values 1024, 768 and 140 fill `hidden_dim`, `mlp_dim` and
# `extras_dim`, which `MultiClassClassifier` accepts but does not use.
model = MultiClassClassifier('sileod/deberta-v3-base-tasksource-nli',num_classes, 1024,768,140,dropout=0.1,freeze_bert=False)

# model.load_state_dict(torch.load("model_bert_difficulty_prediction/model_weights"))

# Move the model to the device selected at the top of the notebook. Unlike
# `model.cuda()`, this also works on the CPU fallback when no GPU is available.
model.to(device)



MultiClassClassifier(
  (deberta): DebertaV2ForSequenceClassification(
    (deberta): DebertaV2Model(
      (embeddings): DebertaV2Embeddings(
        (word_embeddings): Embedding(128100, 768, padding_idx=0)
        (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
        (dropout): StableDropout()
      )
      (encoder): DebertaV2Encoder(
        (layer): ModuleList(
          (0-11): 12 x DebertaV2Layer(
            (attention): DebertaV2Attention(
              (self): DisentangledSelfAttention(
                (query_proj): Linear(in_features=768, out_features=768, bias=True)
                (key_proj): Linear(in_features=768, out_features=768, bias=True)
                (value_proj): Linear(in_features=768, out_features=768, bias=True)
                (pos_dropout): StableDropout()
                (dropout): StableDropout()
              )
              (output): DebertaV2SelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
 

In [21]:
import numpy as np

# Accuracy helper: share of rows whose top-scoring class equals the label.
def flat_accuracy(preds, labels):
    """Return the fraction of examples whose arg-max prediction matches the label.

    `preds` holds one row of class scores per example; `labels` holds the
    corresponding integer class ids.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    hits = predicted_classes == expected_classes
    return np.sum(hits) / len(expected_classes)

In [22]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [23]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # `np.Inf` was removed in NumPy 2.0; `np.inf` works on every version.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Record one validation loss and update the stopping state.

        The first call always saves a checkpoint. After that, a loss counts as
        an improvement only when it is at least `delta` below the best loss so
        far; an improvement saves a checkpoint and resets the counter. Any
        other loss increments the counter, and `early_stop` becomes True once
        the counter reaches `patience`.

        Args:
            val_loss (float): Validation loss of the epoch just finished.
            model: Object whose `state_dict()` is written on improvement.
        """
        # Work with the negated loss so that "higher is better".
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [24]:
# for param in model.deberta.deberta.encoder.layer[0:5].parameters():
#     param.requires_grad=False

In [25]:
import random
import numpy as np
from transformers import get_linear_schedule_with_warmup

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42
epochs = 20

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Use PyTorch's own AdamW; the `transformers.AdamW` class is deprecated.
# `weight_decay = 0.0` keeps the no-decay default of the class it replaces
# (torch's own default would be 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr = 2e-5, eps = 1e-8, weight_decay = 0.0)
loss_func = nn.CrossEntropyLoss()
# Total number of training steps is [number of batches] x [number of epochs].
total_steps = len(train_dataloader) * epochs
# NOTE(review): the `scheduler.step()` call in the training loop is commented
# out, so this schedule is built but never advanced and the learning rate
# stays at its initial value. Confirm whether that is intended.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

# We'll store a number of quantities such as training and validation loss,
# validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()
# Stop once the validation loss has failed to improve for 2 epochs in a row;
# the best weights so far are written to the EarlyStopping checkpoint path.
early_stopping = EarlyStopping(patience=2, verbose=True)
# Directory that receives the tokenizer files and the most recent weights.
# The best-by-validation-loss weights are saved separately by `early_stopping`.
output_dir = 'model_roberta_large_oracle/'

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the running totals for this epoch.
    total_train_accuracy = 0
    total_train_loss = 0

    # Put the model into training mode. Don't be misled -- the call to
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Elapsed time since the start of this epoch, formatted h:mm:ss.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader and copy each tensor
        # to the selected device.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass; PyTorch accumulates gradients by default.
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # Despite the name, `probas` holds raw logits.
        probas = model(b_input_ids,b_input_mask)

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; `.item()` returns it as a Python number.
        loss = loss_func(probas, b_labels)
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Gradient clipping is currently disabled.
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        # Learning-rate scheduling is currently disabled, so the learning rate
        # stays constant for the whole run.
        # scheduler.step()
        logits = probas.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_train_accuracy += flat_accuracy(logits, label_ids)

    # Mean of the per-batch accuracies.
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)
    print(" Train Accuracy: {0:.2f}".format(avg_train_accuracy))

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode -- the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Unpack this validation batch and copy each tensor to the device.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No compute graph is needed for evaluation, so run both the forward
        # pass and the loss under `no_grad`.
        with torch.no_grad():
            logits = model(b_input_ids,b_input_mask)
            loss = loss_func(logits, b_labels)

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of validation examples, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch. This happens before the
    # early-stopping check so that the metrics of the epoch that triggers the
    # stop are reported and stored too.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

    # Update the early-stopping state; on improvement this also checkpoints
    # the model to the EarlyStopping path.
    early_stopping(avg_val_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break

    # Save the tokenizer and the current weights. As before, nothing is
    # written here on the epoch that triggers early stopping.
    os.makedirs(output_dir, exist_ok=True)

    print("Saving model to %s" % output_dir)
    tokenizer.save_pretrained(output_dir)
    torch.save(model.state_dict(), os.path.join(output_dir, 'model_weights'))

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))




Training...
  Batch    40  of    621.    Elapsed: 0:00:08.
  Batch    80  of    621.    Elapsed: 0:00:16.
  Batch   120  of    621.    Elapsed: 0:00:24.
  Batch   160  of    621.    Elapsed: 0:00:31.
  Batch   200  of    621.    Elapsed: 0:00:39.
  Batch   240  of    621.    Elapsed: 0:00:46.
  Batch   280  of    621.    Elapsed: 0:00:52.
  Batch   320  of    621.    Elapsed: 0:00:58.
  Batch   360  of    621.    Elapsed: 0:01:05.
  Batch   400  of    621.    Elapsed: 0:01:11.
  Batch   440  of    621.    Elapsed: 0:01:17.
  Batch   480  of    621.    Elapsed: 0:01:23.
  Batch   520  of    621.    Elapsed: 0:01:30.
  Batch   560  of    621.    Elapsed: 0:01:36.
  Batch   600  of    621.    Elapsed: 0:01:42.
 Train Accuracy: 0.62

  Average training loss: 0.83
  Training epcoh took: 0:01:46

Running Validation...
  Accuracy: 0.65
Validation loss decreased (inf --> 0.761092).  Saving model ...
  Validation Loss: 0.76
  Validation took: 0:00:10
Saving model to model_roberta_large_oracle/