In [1]:
import os
import torch
import logging
logging.basicConfig(level=logging.ERROR)
# If there's a GPU available...
model_dir = "sileod/deberta-v3-base-tasksource-nli"
tokenizer_dir = "sileod/deberta-v3-base-tasksource-nli"
model_type = "bert"

input_dimension = 768
mlp_dim = 500
dropout = 0.1
freeze = False
vocal = False
small_dataset = False
epochs = 20
batch_size = 16



if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
    
batch_size = batch_size if not small_dataset else 1

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 4090


In [2]:
from transformers import AutoTokenizer,AutoModel, BertTokenizer
import json

# Load the BART tokenizer.
print(f'Loading {tokenizer_dir} tokenizer...')
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, do_lower_case=True)

Loading sileod/deberta-v3-base-tasksource-nli tokenizer...




In [3]:
import json

# Changed from NumTemp-E9C0/output/bm25_top_100_train
with open("../NumTemp-E9C0/output/programfc_bm25_top_100_train_reordered.json") as f:
    train_data = json.load(f)
with open('../NumTemp-E9C0/output/programfc_bm25_top_100_val_reordered.json') as f:
    val_data = json.load(f)
with open('../NumTemp-E9C0/output/programfc_bm25_top_100_test_reordered.json') as f:
    test_data = json.load(f)
    
if small_dataset:
    train_data = train_data[:10]
    val_data = val_data[:10]
    test_data = test_data[:10]
    
len(train_data), len(val_data), len(test_data)

(9935, 3084, 2495)

In [4]:
if vocal:
    train_data[-1]

In [5]:
if vocal:
    val_data[-1]

In [6]:
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
k = 100

In [7]:
def get_features(data):
    features = []
    evidences = []
    
    for index, fact in enumerate(data):
        claim = fact["claim"]
        feature = '[Claim:]' + claim 
        if 'predicted_programs' in fact:
            programs = fact['predicted_programs'][0]
            programs = ' '.join(programs)
            feature += '[Program:]' + programs
        evidences = fact['top_n'][:k]
        evidences = ''.join(f'[Evidence]:{e}' for e in evidences)
        feature += evidences
        features.append(feature)
    return features

In [8]:
train_features = get_features(train_data)
val_features = get_features(val_data)
test_features = get_features(test_data)
len(train_features), len(val_features), len(test_features)

(9935, 3084, 2495)

In [9]:
print(train_features[-1])
if vocal:
    train_features[-1]

[Claim:]"For perspective, Special Session costs you $50k, every single day. That’s roughly the average teacher’s salary in NC."[Program:]fact_1 = Verify("Special Session costs $50k every single day.") fact_2 = Verify("The average teacher's salary in NC is roughly $50k.") label = Predict(fact_1 and fact_2)[Evidence]:13 sept. 2018  nationwide, the estimated average public-school teacher's salary is now $58,950, according to the national center for education statisticsa ...[Evidence]:average weight of students and teacher = 160 kg. formula used: average = ( sum of all observations / total number of observations ). calculations: as per the ...[Evidence]:the average weight of 13 students and their teacher = 24.5 kg.  total weight of 13 students and their teacher = 24.5  (13 + 1) = 343 kg.[Evidence]:other state's crime lab average costs and completed cases/kits. state average laboratory processing time = 11 days. average quality review[Evidence]:in 39 states, the average teacher's salary dec

In [10]:
if vocal:
    val_features[-1]

In [11]:
if vocal:
    test_features[-1]

In [12]:
train_labels = [fact["gold"] for fact in train_data]
val_labels = [fact["gold"] for fact in val_data]
test_labels = [fact["gold"] for fact in test_data]

train_labels_final = LE.fit_transform(train_labels)
val_labels_final = LE.transform(val_labels)
test_labels_final = LE.transform(test_labels)
train_labels_final[:20], val_labels_final[:20], test_labels_final[:20]

(array([2, 1, 2, 1, 1, 2, 2, 1, 1, 0, 1, 1, 2, 0, 1, 2, 2, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 0, 1, 2, 1, 2]),
 array([2, 0, 2, 1, 2, 0, 1, 1, 1, 1, 1, 1, 2, 2, 0, 0, 1, 1, 1, 0]))

In [13]:
input_ids = []
attention_masks = []

# needed for gpt2. Added by Emil
if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
# temp change Emil
for sent in train_features:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        truncation=True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])
# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)


# Print sentence 0, now as a list of IDs.
if vocal:
    print('Original: ', train_features[0])
    print('Token IDs:', input_ids[0])
print("done")



done


In [14]:
val_input_ids = []
val_attention_masks = []
# temp change Emil
for sent in val_features:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        truncation=True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    val_input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    val_attention_masks.append(encoded_dict['attention_mask'])
# Convert the lists into tensors.
val_input_ids = torch.cat(val_input_ids, dim=0)
val_attention_masks = torch.cat(val_attention_masks, dim=0)


# Print sentence 0, now as a list of IDs.
if vocal:
    print('Original: ', val_features[0])
    print('Token IDs:', val_attention_masks[0])
print("done")

done


In [15]:
test_input_ids = []
test_attention_masks = []
# temp change Emil
for sent in test_features:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        truncation=True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    test_input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    test_attention_masks.append(encoded_dict['attention_mask'])
# Convert the lists into tensors.
test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)


# Print sentence 0, now as a list of IDs.
if vocal:
    print('Original: ', test_features[0])
    print('Token IDs:', test_attention_masks[0])
print("done")


done


In [16]:
# temp change Emil
train_labels_final = torch.tensor(train_labels_final)
val_labels_final = torch.tensor(val_labels_final)
test_labels_final = torch.tensor(test_labels_final)

In [17]:
val_labels_final.shape, len(val_input_ids)

(torch.Size([3084]), 3084)

In [18]:
num_classes = len(list(set(train_labels)))
list(set(train_labels)), num_classes

(['False', 'True', 'Conflicting'], 3)

In [19]:
from torch.utils.data import TensorDataset, random_split
# train_poincare_tensor = torch.tensor(poincare_embeddings_final,dtype=torch.float)
# difficulty_tensor = torch.tensor(difficulty_level_vectors,dtype=torch.float)
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, train_labels_final)
val_dataset = TensorDataset(val_input_ids, val_attention_masks,val_labels_final)
test_dataset = TensorDataset(test_input_ids, test_attention_masks,test_labels_final)

In [20]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

train_dataloader = DataLoader(
            dataset,  # The training samples.
            sampler = RandomSampler(dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size
        )

test_dataloader = DataLoader(
            test_dataset, # The validation samples.
            sampler = SequentialSampler(test_dataset),
            batch_size = batch_size
        )

In [21]:
from torch import nn
class MultiClassClassifier(nn.Module):
    def __init__(self, inner_model_path, labels_count, hidden_dim=768, mlp_dim=500, extras_dim=100, dropout=0.1, freeze_inner_model=False):
        super().__init__()

        self.inner_model = AutoModel.from_pretrained(inner_model_path,output_hidden_states=True,output_attentions=True)
        self.dropout = nn.Dropout(dropout)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_dim, mlp_dim),
            nn.ReLU(),
            # nn.Linear(mlp_dim, mlp_dim),
            # # nn.ReLU(),
            # # nn.Linear(mlp_dim, mlp_dim),
            # nn.ReLU(),
            nn.Linear(mlp_dim, labels_count)
        )
        # self.softmax = nn.LogSoftmax(dim=1)
        # TODO, make such that possible with different models
        if freeze_inner_model:
            print("Freezing layers")
            for param in self.inner_model.parameters():
                param.requires_grad = False

    def forward(self, tokens, masks):
        output = self.inner_model(tokens, attention_mask=masks)
        # bart-large-mnli doesn't have "pooler_output" like BERT and RoBERTa do
        # dropout_output = self.dropout(output["pooler_output"])
        last_hidden_state = output.last_hidden_state  # Get the last hidden state
        class_token = last_hidden_state[:, 0, :]  # Get the class token
        mlp_output = self.dropout(class_token)
        mlp_output = self.mlp(mlp_output)
        # concat_output = torch.cat((dropout_output, topic_emb), dim=1)
        # concat_output = self.dropout(concat_output)
        # mlp_output = self.mlp(dropout_output)
        # proba = self.sigmoid(mlp_output)
        # proba = self.softmax(mlp_output)

        return mlp_output

In [22]:
from transformers import AutoModelForSequenceClassification, AdamW, BertConfig

# Loads AutoModelForSequenceClassification, the pretrained BART model with a single
model = MultiClassClassifier(model_dir,num_classes, input_dimension,mlp_dim,140,dropout=dropout,freeze_inner_model=False)

# model.load_state_dict(torch.load("model_bert_difficulty_prediction/model_weights"))

# Tell pytorch to run this model on the GPU.
model.cuda()



MultiClassClassifier(
  (inner_model): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
   

In [23]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [24]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [25]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func
    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [26]:

if freeze:
    if model_type == "gpt":
        for param in model.inner_model.h[:-1].parameters():
            param.requires_grad=False
    elif model_type == "bert":
        for param in model.inner_model.encoder.layer[:-1].parameters():
            param.requires_grad = False
    else:
        for param in model.inner_model.layers[:-1].parameters():
            param.requires_grad=False


In [27]:
import random
import numpy as np
from transformers import get_linear_schedule_with_warmup

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)
loss_func = nn.CrossEntropyLoss()
# Total number of training steps is [number of batches] x [number of epochs].
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

# We'll store a number of quantities such as training and validation loss,
# validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()
early_stopping = EarlyStopping(patience=2, verbose=True)
# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_accuracy = 0
    total_train_loss = 0

    # Put the model into training mode. Don't be mislead--the call to
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questimport gensim.downloader as api
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        # b_poincare = batch[2].to(device)
        # b_difficulty = batch[3].to(device)
        b_labels = batch[2].to(device)
        # skill_labels = batch[3].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        probas = model(b_input_ids,b_input_mask)

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value
        # from the tensor.
        loss = loss_func(probas, b_labels)
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        # scheduler.step()
        logits = probas.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_train_accuracy += flat_accuracy(logits, label_ids)
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)
    print(" Train Accuracy: {0:.2f}".format(avg_train_accuracy))

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)



    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)

        b_input_mask = batch[1].to(device)
        # b_poincare = batch[2].to(device)
        # b_difficulty = batch[3].to(device)
        b_labels = batch[2].to(device)
        # skill_labels = batch[3].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():

            # Forward pass, calculate logit predictions.

            logits = model(b_input_ids,b_input_mask)

        # Accumulate the validation loss.
        loss = loss_func(logits, b_labels)
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)


    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    early_stopping(avg_val_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))
    output_dir = 'model_bart_large_oracle/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving model to %s" % output_dir)
    tokenizer.save_pretrained(output_dir)
    torch.save(model.state_dict(), os.path.join(output_dir, 'model_weights'))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))




Training...
  Batch    40  of    621.    Elapsed: 0:00:06.
  Batch    80  of    621.    Elapsed: 0:00:12.
  Batch   120  of    621.    Elapsed: 0:00:18.
  Batch   160  of    621.    Elapsed: 0:00:24.
  Batch   200  of    621.    Elapsed: 0:00:30.
  Batch   240  of    621.    Elapsed: 0:00:36.
  Batch   280  of    621.    Elapsed: 0:00:42.
  Batch   320  of    621.    Elapsed: 0:00:48.
  Batch   360  of    621.    Elapsed: 0:00:54.
  Batch   400  of    621.    Elapsed: 0:01:00.
  Batch   440  of    621.    Elapsed: 0:01:06.
  Batch   480  of    621.    Elapsed: 0:01:14.
  Batch   520  of    621.    Elapsed: 0:01:20.
  Batch   560  of    621.    Elapsed: 0:01:26.
  Batch   600  of    621.    Elapsed: 0:01:32.
 Train Accuracy: 0.62

  Average training loss: 0.83
  Training epcoh took: 0:01:36

Running Validation...
  Accuracy: 0.63
Validation loss decreased (inf --> 0.798637).  Saving model ...
  Validation Loss: 0.80
  Validation took: 0:00:10
Saving model to model_bart_large_oracle/

T

In [28]:
print("testing")

# Tracking variables
total_test_accuracy = 0
total_test_loss = 0
nb_test_steps = 0

all_predictions = []

# Evaluate data for one epoch
for batch in test_dataloader:

    # Unpack this training batch from our dataloader.
    #
    # As we unpack the batch, we'll also copy each tensor to the GPU using
    # the `to` method.
    #
    # `batch` contains three pytorch tensors:
    #   [0]: input ids
    #   [1]: attention masks
    #   [2]: labels
    b_input_ids = batch[0].to(device)

    b_input_mask = batch[1].to(device)
    # b_poincare = batch[2].to(device)
    # b_difficulty = batch[3].to(device)
    b_labels = batch[2].to(device)
    # skill_labels = batch[3].to(device)

    # Tell pytorch not to bother with constructing the compute graph during
    # the forward pass, since this is only needed for backprop (training).
    with torch.no_grad():

        # Forward pass, calculate logit predictions.
        logits = model(b_input_ids,b_input_mask)

    # Accumulate the validation loss.
    loss = loss_func(logits, b_labels)
    total_test_loss += loss.item()

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Calculate the accuracy for this batch of test sentences, and
    # accumulate it over all batches.
    total_test_accuracy += flat_accuracy(logits, label_ids)
    
    predictions = np.argmax(logits, axis=1)
    all_predictions.extend(predictions.tolist())


# Report the final accuracy for this validation run.
avg_val_accuracy = total_test_accuracy / len(test_dataloader)
print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

# Calculate the average loss over all of the batches.
avg_val_loss = total_test_loss / len(test_dataloader)
# Measure how long the validation run took.

print("  Validation Loss: {0:.2f}".format(avg_val_loss))
import csv
with open('predictions.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    
    # Write the list as a single row in the CSV
    writer.writerow(all_predictions)


testing
  Accuracy: 0.63
  Validation Loss: 0.89
