In [None]:
# Environment setup: upgrade pip, then install/upgrade the third-party packages.
# NOTE(review): none of the active installs pins a version, so the environment can
# differ between runs — consider pinning once a working combination is known.
!pip install --upgrade pip
!pip install datasets evaluate
!pip install --upgrade accelerate
#!pip install transformers==4.28.0
!pip install -U transformers
!pip install --upgrade huggingface_hub
!pip install tasknet tasksource
!pip install conllu

In [None]:
from google.colab import drive
from transformers import BertTokenizer, AutoTokenizer, AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.utils import logging
from tasknet import Adapter
import tasksource
from transformers import AutoModel, AutoModelForSequenceClassification, TextClassificationPipeline, AutoTokenizer
import os
from datasets import load_dataset, Dataset, load_from_disk
import datasets
from huggingface_hub import login
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
import evaluate
import numpy as np
from typing import Optional, Union
import torch
import json
import pandas as pd
import random
import itertools

In [None]:
# Log in to the Hugging Face Hub.
# SECURITY: an access token must never be stored in the notebook. It is read from the
# HF_TOKEN environment variable; when that variable is unset, `login` receives None
# and falls back to its interactive prompt. Any token that has ever been committed
# with this notebook has to be treated as leaked and revoked in the Hub settings.
login(token=os.environ.get("HF_TOKEN"))

# NOTE: this part is not needed if not running on Colab
drive.mount('/content/drive')

# NOTE: navigate to the folder with dataset
folder = '/content/drive/My Drive/CS4NLP'
os.chdir(folder)

In [None]:
import torch

# Choose the compute device once: the first CUDA GPU when one is visible, else the CPU.
_cuda_ok = torch.cuda.is_available()
device = torch.device("cuda:0" if _cuda_ok else "cpu")
print("CUDA is available" if _cuda_ok else "CUDA is not available, using CPU")

In [None]:
# install sentence transformers and get one model
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer, util

sentembb_model = SentenceTransformer('all-MiniLM-L6-v2',  device='cuda') # sentence embedding model https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [None]:
#from advanced_retrieval import sentence_embedding_cut
from baseline_models import *

In [None]:
# Load the train/dev splits that were saved to disk beforehand.
ds_train = load_from_disk('datasets/quality/train')
ds_dev = load_from_disk('datasets/quality/dev')

# Report the split sizes and show one raw training example.
print(f"size of ds_train:{len(ds_train)}")
print(f"size of ds_dev:{len(ds_dev)}")
print(ds_train[0])

# List the distinct gold labels of the dev split (the labels start at 1 and not 0).
options = [example["gold_label"] for example in ds_dev]
print(np.unique(options))

In [None]:
# retrieval method

def sentence_embedding_cut(article, tokenizer, query, MAX_TOKENS = 512, extra_length = 0, *args, **kwargs):
    """Shorten ``article`` to the sentences most similar to ``query`` within a token budget.

    The article is split on ". ", every sentence is embedded with the notebook-level
    ``sentembb_model`` and ranked by cosine similarity to the embedded query. Sentences
    are taken in ranking order while they fit into ``MAX_TOKENS - extra_length`` tokens
    (counted with ``tokenizer.tokenize``) and are returned in their original order.

    Args:
        article: Full article text.
        tokenizer: Object exposing ``tokenize(str)``; only the length of its result is used.
        query: Text the sentences are ranked against.
        MAX_TOKENS: Total token budget before ``extra_length`` is subtracted.
        extra_length: Number of tokens to reserve; subtracted from ``MAX_TOKENS``.
        *args, **kwargs: Accepted and ignored.

    Returns:
        The selected sentences joined by single spaces, or "" when nothing fits.
    """
    budget = MAX_TOKENS - extra_length
    if budget <= 0:
        # No sentence can fit, so skip the embedding work entirely.
        return ""

    sentences = article.split(". ")
    # Drop the empty/whitespace-only trailing piece left by the split.
    if not sentences[-1].strip():
        sentences = sentences[:-1]
    # The split removed the ". " separators; restore the final period of each sentence.
    sentences = [s if s.endswith(".") else s + "." for s in sentences]
    if not sentences:
        return ""

    query_embedding = sentembb_model.encode(query)

    # Embed the sentences in batches and score each batch against the query.
    batch_size = 500
    similarity_scores = []
    for start in range(0, len(sentences), batch_size):
        batch_embeddings = sentembb_model.encode(sentences[start:start + batch_size])
        similarity_scores.extend(util.cos_sim(query_embedding, batch_embeddings).numpy()[0])

    # Sentence indices ordered from most to least similar (stable for ties).
    ranked = sorted(range(len(sentences)), key=lambda idx: similarity_scores[idx], reverse=True)

    selected = []
    total_tokens = 0
    for idx in ranked:
        if total_tokens == budget:
            break
        num_tokens = len(tokenizer.tokenize(sentences[idx]))
        if num_tokens > budget:
            # A sentence longer than the whole budget can never be selected. Skip it
            # instead of aborting, otherwise one oversized top-ranked sentence would
            # produce an empty context.
            continue
        if total_tokens + num_tokens > budget:
            # Next-best sentence no longer fits in the remaining budget: stop here.
            break
        selected.append(idx)
        total_tokens += num_tokens

    # Use the sentences in the original order.
    selected.sort()
    return " ".join(sentences[i] for i in selected)

In [None]:
# Model wrapper whose tokenizer and maximum sequence length are read here
# (RobertaLarge is presumably provided by the star import from baseline_models — TODO confirm).
model = RobertaLarge()
max_length = model.get_max_seq_length()
tokenizer = model.get_tokenizer()

In [None]:
# Show the maximum sequence length reported by the model wrapper.
print(max_length)

In [None]:
# Build and cache the datasets whose articles are cut by sentence embeddings.
# They are stored in the `datasets` on-disk format (save_to_disk), not as CSV.
ds_sentembb_train_path = 'datasets/sentembb_roberta/train'
ds_sentembb_dev_path = 'datasets/sentembb_roberta/dev'

# Every split is cut and written to its OWN folder, guarded by its own existence check.
# NOTE: a dev folder produced by a version of this cell that iterated ds_train while
# saving to the dev path contains the train split — delete it before re-running.
for split, out_path in ((ds_train, ds_sentembb_train_path), (ds_dev, ds_sentembb_dev_path)):
  if os.path.exists(out_path):
    # Already cached on disk; skip the expensive cutting step.
    continue

  ds_sentembb_dict = {'cut_article': [], 'question': [], 'options' : [], 'label': []}
  total_len = len(split)
  for ctr, item in enumerate(split, start=1):
    article = item["article"]
    question = item["question"]
    options = item["options"]
    label = item["gold_label"] - 1 # labels start at 1

    # Tokens the model input needs besides the context; deducted from the article budget.
    extra_length = model.get_extra_input_length(question=question, options=options)

    cut_article = sentence_embedding_cut(article=article, tokenizer=tokenizer, MAX_TOKENS=max_length, query=question, extra_length = extra_length)

    ds_sentembb_dict['cut_article'].append(cut_article)
    ds_sentembb_dict['question'].append(question)
    ds_sentembb_dict['options'].append(options)
    ds_sentembb_dict['label'].append(label)
    if ctr % 100 == 0:
      print(f"{ctr}/{total_len}")

  # save new dataset
  new_dataset = Dataset.from_dict(ds_sentembb_dict)
  os.makedirs(out_path, exist_ok=True)
  new_dataset.save_to_disk(out_path)


In [None]:
# Disabled scratch cell: everything between the triple quotes is a plain string
# literal, so none of it executes. It holds a single hand-written example that is
# cut with sentence_embedding_cut and saved.
# NOTE(review): the disabled code creates the *train* folder but saves to the *dev*
# path — fix that before re-enabling it.
#item = ds_dev[0]
'''
item = {'article' : "Thomas went to the restroom. Abby was in the kitchen. Melinda set the table.", 'question' : 'Who made the burger?', 'options': ['Melinda', 'Thomas', 'Abby', 'Fritz'], 'gold_label': 3}

ds_sentembb_train_path = 'datasets/sentembb_longf/train'
ds_sentembb_dev_path = 'datasets/sentembb_longf/dev'
ds_sentembb_train_dict = {'cut_article': [], 'question': [], 'options' : [], 'label': []}

if not os.path.exists(ds_sentembb_dev_path):
  # article
  article = item["article"]
  question = item["question"]
  options = item["options"]
  label = item["gold_label"] - 1 # labels start at 1

  extra_length = model.get_extra_input_length(question=question, options=options)

  cut_article = sentence_embedding_cut(article=article, tokenizer=tokenizer, MAX_TOKENS=max_length, query=question, extra_length = extra_length)

  #cut_item = {'cut_article': cut_article, 'question': question, 'options' : options, 'label': label}
  ds_sentembb_train_dict['cut_article'].append(cut_article)
  ds_sentembb_train_dict['question'].append(question)
  ds_sentembb_train_dict['options'].append(options)
  ds_sentembb_train_dict['label'].append(label)

  #ds_sentembb_train.append(cut_item)

  #print(type(ds_sentembb_train_dict))

  new_dataset = Dataset.from_dict(ds_sentembb_train_dict)
  os.makedirs(ds_sentembb_train_path, exist_ok=True)

  new_dataset.save_to_disk(ds_sentembb_dev_path)
else:
  print('folder exists already')
  '''

In [None]:
#print(ds_dev[0])
#print(len(ds_dev[0]['article']))

# Reload the cached dataset and inspect its first example: the full row, the
# character length of the cut article, and the question.
test = load_from_disk(ds_sentembb_dev_path)
first_example = test[0]
print(first_example)
print(len(first_example['cut_article']))
print(first_example['question'])

In [None]:
# TODO: run from here
# init model
# Rebinds `model`, replacing the RobertaLarge instance created earlier in the notebook.
model = Longformer() # change to Roberta/Transformer
# Rebinds the notebook-level `device` to the `device` attribute of the new model wrapper.
device = model.device
print(device)

In [None]:
# Evaluate `model` on the cached sentence-embedding-cut dev set and write the results
# to two CSV files in the current working directory.
model_name = 'Longformer' # output
retrieval = 'Sentence_Embeddings' #for output
ds_sentembb_dev_path = 'datasets/sentembb_longformer/dev'

accuracy = evaluate.load("accuracy")
ds_sentembb_dev = load_from_disk(ds_sentembb_dev_path)

references = []
predictions = []
total_len = len(ds_sentembb_dev)

for ctr, item in enumerate(ds_sentembb_dev, start=1):
  prediction = model.predict(context=item["cut_article"], question=item["question"], options=item["options"])

  # metrics
  predictions.append(prediction)
  references.append(item["label"])

  if ctr % 100 == 0:
    print(f"{ctr}/{total_len}")


# save performance of one configuration
# `accuracy.compute` returns a dict; store its "accuracy" value so the CSV column holds
# a number rather than the string form of the whole dict.
metric = accuracy.compute(references=references, predictions=predictions)
df = pd.DataFrame(
  [{"preprocessor": retrieval, "model": model_name, "accuracy": metric["accuracy"]}],
  columns=["preprocessor", "model", "accuracy"],
)
pred_df = pd.DataFrame({"Prediction": predictions, "Label": references})

# NOTE(review): the predictions file name ends in "_roberta" regardless of model_name;
# kept unchanged for compatibility — confirm whether the suffix is intended.
df.to_csv(retrieval+"_"+model_name+".csv", index=False)
pred_df.to_csv(retrieval+"_"+model_name+"_predictions_roberta.csv", index=False)

In [None]:
# Disabled scratch cell: the whole body is a string literal, so nothing here executes.
# It holds a manual single-example call to model.predict.
'''import torch

# Move the model to the CUDA device
model.model.to('cuda:0')
print(model.device)
# Assuming `context`, `question`, and `options` are non-tensor inputs
context = "William is hungry."
question = "Who ate the potato?"
options = ["William", "James", "Feb", "March"]


# Perform prediction0
predictions = model.predict(context=context, question=question, options=options)
print(predictions)
#inputs = model.prepare_answering_input(question=question, options=options, context=context).to(model.device)
#outputs = model.model(**inputs)
#print(outputs)'''