#(0) Data Loader
Question_Answer_Dataset_v1.2

In [7]:
#Mount drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [8]:
# Change to appropriate local file path
%cd '/content/drive/MyDrive/2023-FALL/11-611 NLP/NLP Project Ideas'

/content/drive/.shortcut-targets-by-id/1FH08hbh2r-uOkEjbwC9F2RTvpU9Lxtv7/NLP Project Ideas


In [9]:
import torch
from torch.utils.data import Dataset
import os
import numpy as np

QA dataset line:
- [0] article_name   
- [1] question   
- [2] answer  
- [3] difficulty by question maker
- [4] difficulty by answer maker
- [5] article path
- [6] article text

In [10]:
class CustomData(Dataset):

    def __init__(self, file_dir):
        self.file = file_dir
        self.article_name = []
        self.questions = []
        self.answers = []
        self.q_diffi = []
        self.a_diffi = []
        self.article_path = []
        self.context = {} # only fill when load the dataset

        # get question answer pairs
        for div in ['S08', 'S09', 'S10']:
          skip = True
          qa_path = os.path.join(self.file, div, "question_answer_pairs.txt")
          with open(qa_path, 'rb') as f:
            for line in f:
              if skip:
                skip = False
                continue
              try:
                row = line.decode().split('\t')
              except:
                continue
              # print(row)
              if "NULL" in row:
                continue # if any feature does not exist -> skip
              self.article_name.append(row[0])
              self.questions.append(row[1])
              self.answers.append(row[2])
              self.q_diffi.append(row[3])
              self.a_diffi.append(row[4])
              self.article_path.append(div + "/"+ row[5][:-1]) # get rid of '\n

        print("length of dataset: ", len(self.questions))


    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):

        # retrieve context here -> less mem storage overhead
        try:
          curr_context = self.context[self.article_name[idx]]
        except KeyError:
          context_file = self.file + "/" + self.article_path[idx] + ".txt"
          # read all content, including the related items
          with open(context_file, 'rb') as f:
            curr_context = f.read().decode().replace('\n',' ')
          self.context[self.article_name[idx]] = curr_context

        #return self.questions[idx], self.answers[idx], curr_context
        return (self.article_name[idx],
                self.questions[idx],
                self.answers[idx],
                self.q_diffi[idx],
                self.a_diffi[idx],
                self.article_path[idx],
                curr_context
                )

In [11]:
# Load data using CustomData function
data_path = 'Question_Answer_Dataset_v1.2'
dataset = CustomData(data_path)

length of dataset:  2725


In [12]:
# Print random set for data exploration
n=len(dataset)
n_show=5

rand_range=np.random.randint(0,n,n_show)

for line in rand_range:
  d = dataset[line] #
  print('line:',line)
  print("article_name: ", d[0])
  print("question: ", d[1])
  print(' question length:',len(d[1]))
  print("answer: ", d[2])
  print(' answer length:', len(d[2]))
  print("q_diffi: ", d[3])
  print("a_diffi: ", d[4])
  print("article_path: ", d[5])
  print("article: ", d[6])
  print(' article length:',len(d[6]))
  print('\n')

line: 1944
article_name:  Cougar
question:  On average, are cougar males heavier than females?
 question length: 50
answer:  On average, cougar males are heavier than females.
 answer length: 50
q_diffi:  hard
a_diffi:  hard
article_path:  S10/data/set1/a2
 article length: 34165


line: 751
article_name:  otter
question:  Can otters survive in cold water?
 question length: 33
answer:  yes
 answer length: 3
q_diffi:  easy
a_diffi:  medium
article_path:  S08/data/set1/a7
article:  otter    Otters are amphibious (or in one case aquatic) carnivorous mammals.  The otter subfamily Lutrinae forms part of the family Mustelidae, which also includes weasels, polecats, badgers, as well as others. With 13 species in 7 genera, otters have an almost worldwide distribution.  An otter's den is called a holt.  Male otters are dog-otters, females are bitches and babies are cubs or pups.  The collective noun romp is sometimes used for a group of otters, being descriptive of their often playful nature.   

#(1) Question Generation

##(1A) AMontgomerie
https://github.com/AMontgomerie/question_generator/blob/master/examples/question_generation_example.ipynb

In [15]:
# It may be necessary to uninstall first
#!pip uninstall transformers

Found existing installation: transformers 4.35.0
Uninstalling transformers-4.35.0:
  Would remove:
    /usr/local/bin/transformers-cli
    /usr/local/lib/python3.10/dist-packages/transformers-4.35.0.dist-info/*
    /usr/local/lib/python3.10/dist-packages/transformers/*
Proceed (Y/n)? Y
  Successfully uninstalled transformers-4.35.0


In [16]:
!pip install --no-cache-dir transformers sentencepiece

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
Successfully installed transformers-4.35.0


In [6]:
#!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m120.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m83.3 MB/s[0m eta [36m0:00:00[0m
Co

In [1]:
!git clone https://github.com/amontgomerie/question_generator/

fatal: destination path 'question_generator' already exists and is not an empty directory.


In [2]:
%cd question_generator/
%load questiongenerator.py
from questiongenerator import QuestionGenerator
from questiongenerator import print_qa

/content/question_generator


In [3]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
assert device == torch.device('cuda'), "Not using CUDA. Set: Runtime > Change runtime type > Hardware Accelerator: GPU"

In [19]:
qg = QuestionGenerator()

#Original example

#with open('articles/indian_matchmaking.txt', 'r') as a:
#    article = a.read()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Original example

#qa_list = qg.generate(
#    article,
#    num_questions=10,
#    answer_style='all'
#)
#print_qa(qa_list)

In [20]:
# Choose article
line=751
d = dataset[line] #
article = d[6]

print(article)

otter    Otters are amphibious (or in one case aquatic) carnivorous mammals.  The otter subfamily Lutrinae forms part of the family Mustelidae, which also includes weasels, polecats, badgers, as well as others. With 13 species in 7 genera, otters have an almost worldwide distribution.  An otter's den is called a holt.  Male otters are dog-otters, females are bitches and babies are cubs or pups.  The collective noun romp is sometimes used for a group of otters, being descriptive of their often playful nature.     Otters have long, slim bodies and relatively short limbs, with webbed paws. Most have sharp claws on their feet, and all but the sea otter have long muscular tails.  They have a very soft underfur which is protected by their outer layer of long guard hair. This traps a layer of air, and keeps them dry and warm under water.     Otters do not depend on their specialized fur alone for survival in the cold waters where many live: they also have very high metabolic rates. For exampl

In [22]:
qa_list = qg.generate(
    article,
    num_questions=5,
    answer_style='sentences'
)
print_qa(qa_list)

Generating questions...

Evaluating QA pairs...

1) Q: How much of their body weight must they eat?
   A: For example Eurasian otters must eat 15% of their body-weight a day, and sea otters, 20 to 25%, depending on the temperature.

2) Q: What is the meaning of otters?
   A: In some Native American cultures, otters are considered totem animals.

3) Q: What is the name of the otter?
   A: An otter in Southwold, Suffolk, England Genus Lutra *Eurasian otter (Lutra lutra) *Hairy-nosed otter (Lutra sumatrana) *Lutra bravardi *Lutra libyca *Lutra palaeindica *Lutra simplicidens Genus Hydrictis *Speckle-throated otter (Hydrictis maculicollis) Genus Lutrogale *Smooth-coated otter (Lutrogale perspicillata) Genus Lontra *Northern river otter (Lontra canadensis) *Southern river otter (Lontra provocax) *Neotropical river otter (Lontra longicaudis) *Marine otter (Lontra felina) Genus Pteronura *Giant otter (Pteronura brasiliensis) Genus Aonyx *African clawless otter (Aonyx capensis) *Congo clawless

#(2) Answer generation

## (2A) BertForQuestionAnswering

In [65]:
#!pip install transformers
#!pip install torch



In [27]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering
#import wikipediaapi

In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [29]:
model_name = "deepset/bert-base-cased-squad2"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)
model.to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [31]:
def split_into_paragraphs(text, max_length=400):
    """
    Splits the text into paragraphs and returns a list of paragraphs.
    Each paragraph's token length is limited to max_length.
    """
    paragraphs = [p for p in text.split("\n") if p]

    short_paragraphs = []
    for paragraph in paragraphs:
        tokens = tokenizer.tokenize(paragraph)
        start_token = 0
        while start_token < len(tokens):
            end_token = min(start_token + max_length, len(tokens))
            short_paragraph = tokens[start_token:end_token]
            short_paragraphs.append(tokenizer.convert_tokens_to_string(short_paragraph))
            start_token = end_token

    return short_paragraphs

In [32]:
'''
 Very naive implementation using paragraph segmentation. Might lead to performance bottleneck
 if the article is extremely long. I experimented with the Pittsburgh wikipedia article:
 https://en.wikipedia.org/wiki/Pittsburgh . The performance seemed acceptable if it's run on a GPU.
 '''
def answer_from_article(article, question, overlap = 10):
    paragraphs = split_into_paragraphs(article)

    # Iterate through each paragraph to find the best answer
    max_score = -float('inf')
    best_answer = ""
    for paragraph in paragraphs:
        inputs = tokenizer.encode_plus(question, paragraph, return_tensors="pt")
        inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
        outputs = model(**inputs)
        start_score = torch.max(outputs.start_logits)
        end_score = torch.max(outputs.end_logits)

        # Aggregate start and end scores
        score = start_score + end_score

        if score > max_score:
            max_score = score
            answer_start = torch.argmax(outputs.start_logits)
            answer_end = torch.argmax(outputs.end_logits)
            input_ids = inputs["input_ids"].tolist()[0]
            best_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end+1]))

    return best_answer

In [33]:
# Choose article for application
line=751 #line of dataset
article = d[6]
question = d[1]

print(article)
print(question)

otter    Otters are amphibious (or in one case aquatic) carnivorous mammals.  The otter subfamily Lutrinae forms part of the family Mustelidae, which also includes weasels, polecats, badgers, as well as others. With 13 species in 7 genera, otters have an almost worldwide distribution.  An otter's den is called a holt.  Male otters are dog-otters, females are bitches and babies are cubs or pups.  The collective noun romp is sometimes used for a group of otters, being descriptive of their often playful nature.     Otters have long, slim bodies and relatively short limbs, with webbed paws. Most have sharp claws on their feet, and all but the sea otter have long muscular tails.  They have a very soft underfur which is protected by their outer layer of long guard hair. This traps a layer of air, and keeps them dry and warm under water.     Otters do not depend on their specialized fur alone for survival in the cold waters where many live: they also have very high metabolic rates. For exampl

In [34]:
answer = answer_from_article(article, question)
print(answer)

[CLS]


## (2B) Tiny_Roberta

In [73]:
#!pip install transformers



In [35]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
model_name = "deepset/tinyroberta-squad2"

In [36]:
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/835 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/326M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [37]:
# Choose article for application
line=751 #line of dataset
article = d[6]
question = d[1]

print(article)
print(question)

otter    Otters are amphibious (or in one case aquatic) carnivorous mammals.  The otter subfamily Lutrinae forms part of the family Mustelidae, which also includes weasels, polecats, badgers, as well as others. With 13 species in 7 genera, otters have an almost worldwide distribution.  An otter's den is called a holt.  Male otters are dog-otters, females are bitches and babies are cubs or pups.  The collective noun romp is sometimes used for a group of otters, being descriptive of their often playful nature.     Otters have long, slim bodies and relatively short limbs, with webbed paws. Most have sharp claws on their feet, and all but the sea otter have long muscular tails.  They have a very soft underfur which is protected by their outer layer of long guard hair. This traps a layer of air, and keeps them dry and warm under water.     Otters do not depend on their specialized fur alone for survival in the cold waters where many live: they also have very high metabolic rates. For exampl

In [38]:
QA_input = {
    'question': question,
    'context': article
}
res = nlp(QA_input)

In [39]:
res

{'score': 0.24299481511116028,
 'start': 847,
 'end': 898,
 'answer': 'Otters do not depend on their specialized fur alone'}