## LLM

In [26]:
# set path

import sys
sys.path.insert(0, '../src/')

import matplotlib.pyplot as plt
from tokenizer import TokenizerBPE
from data_handling import normalize_to_ascii
import numpy as np
import re

import os
import tensorflow as tf
import pickle as pkl
from tqdm.notebook import tqdm
import json

# disable gpu for testing purposes
# "-1" matches no CUDA device index, so no GPU is exposed to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Sanity check: expected to report 0 while the GPU is hidden above.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  0


## TriviaQA Web

In [8]:
def get_top_n_evidence(n, rank, evidence_article):
    """Return the text of up to ``n`` articles, following the order of ``rank``.

    Args:
        n: Maximum number of articles to return.
        rank: Sequence of ``[filename, frequency]`` pairs, already ordered from
            most to least preferred. Only the first element of each pair is used.
        evidence_article: Mapping from filename to article text.

    Returns:
        A list with at most ``n`` article texts. Ranked filenames that have no
        entry in ``evidence_article`` (e.g. files that could not be read) are
        skipped, and the list is shorter than ``n`` when ``rank`` runs out.
    """
    top_n = []
    for entry in rank:
        if len(top_n) >= n:
            break
        filename = entry[0]
        # A ranked file may be missing from the mapping when loading it failed;
        # skip it instead of raising KeyError.
        if filename not in evidence_article:
            continue
        top_n.append(evidence_article[filename])
    return top_n

In [10]:
# Read the TriviaQA web training split and pull out, per entry, the question,
# the answer value, and the filename of the first search result (None when an
# entry has no search results).
with open('../corpus/triviaqa-rc/qa/web-train.json', 'r', encoding='utf-8') as qa_file:
    squad = json.load(qa_file)

question_list = []
answer_list = []
evidence_list = []

for entry in tqdm(squad['Data']):
    question_list.append(entry['Question'])
    answer_list.append(entry['Answer']["Value"])
    search_results = entry['SearchResults']
    first_filename = search_results[0]["Filename"] if len(search_results) > 0 else None
    evidence_list.append(first_filename)

  0%|          | 0/76496 [00:00<?, ?it/s]

In [12]:
path = "../corpus/triviaqa-rc/evidence/web/"

# evidence_article: sanitized filename -> article text (only files read successfully).
# seen: sanitized filename -> number of entries that reference it.
evidence_article = {}
seen = {}

for evidence in tqdm(evidence_list):
    if evidence is None:
        continue

    # Replace characters that cannot appear in the on-disk filenames.
    evidence = evidence.replace(":", "_").replace("?", "_").replace("*", "_").replace('"', "_")

    if evidence in seen:
        seen[evidence] += 1
        continue

    seen[evidence] = 1
    try:
        # The context manager closes the handle even if reading fails.
        with open(path + evidence, 'r', encoding='utf-8') as article_file:
            evidence_article[evidence] = article_file.read()
    except (OSError, UnicodeDecodeError):
        # Best effort: report unreadable or undecodable files and move on, without
        # swallowing KeyboardInterrupt the way a bare ``except`` would.
        print("Error reading file: ", evidence)

  0%|          | 0/76496 [00:00<?, ?it/s]

In [13]:
# Order filenames from most to least referenced; equal counts are ordered by
# filename, descending. Each rank entry is a [filename, count] pair.
by_frequency = sorted(seen.items(), key=lambda item: (item[1], item[0]), reverse=True)
rank = [[name, count] for name, count in by_frequency]

article_list = get_top_n_evidence(15000, rank, evidence_article)

# Total number of characters across the selected articles, then the number of
# distinct ranked filenames.
print(sum(map(len, article_list)))
print(len(rank))

233689069
64719


In [28]:
# NOTE(review): presumably needed so that unpickling can import the module that
# defines the tokenizer class — confirm.
sys.path.insert(0, '../')

# NOTE(review): pickle.load can execute arbitrary code; only load tokenizer
# files from a trusted source.
with open("../tokenizers/tokenizer_superQA_24k.pkl", "rb") as tokenizer_file:
    tokenizer = pkl.load(tokenizer_file)

tokenizer.create_hash()
# Register the sequence/QA delimiter tokens used by fused_article and fused_qa.
tokenizer.add_special_tokens(["<s>", "</s>", "<q>", "<a>", "<pad>"])

In [30]:
def fused_article(corpus_list):
    """Join articles into one string, each wrapped in ``<s>`` ... ``</s>``.

    Every article has its whitespace runs collapsed to single spaces, is
    stripped, passed through ``normalize_to_ascii`` and lower-cased before
    being wrapped.

    Args:
        corpus_list: Iterable of article strings.

    Returns:
        The concatenation of all wrapped articles, with no separator.
    """
    whitespace_run = re.compile(r"\s+")

    def _wrap(text):
        # ``\s+`` also matches "\n" and "\r", so newlines collapse to a space too.
        collapsed = whitespace_run.sub(" ", text).strip()
        return "<s>" + normalize_to_ascii(collapsed).lower() + "</s>"

    return "".join(_wrap(text) for text in corpus_list)

In [31]:
# Concatenate the selected articles into a single <s>...</s>-delimited string.
fused = fused_article(article_list)
# fused_article already lower-cases the article text, so the .lower() here
# repeats that step before encoding.
corpus_encoded = tokenizer.encode(fused.lower(), verbose=True)

  0%|          | 0/24000 [00:00<?, ?it/s]

In [32]:
# Persist the encoded article corpus; the context manager guarantees the file
# is flushed and closed once the dump finishes.
with open('../corpus/corpus_clean/corpus_web_article_24k', 'wb') as corpus_file:
    pkl.dump(corpus_encoded, corpus_file)

## QA

In [36]:
def fused_qa(question_list, answer_list):
    """Join question/answer pairs into one lower-cased training string.

    Each pair is rendered as ``<s><q>question<a>answer</s>``. Both texts have
    their whitespace runs collapsed to single spaces, are stripped and passed
    through ``normalize_to_ascii``; the whole result is lower-cased at the end.

    Args:
        question_list: Sequence of question strings.
        answer_list: Sequence of answer strings, paired by position with
            ``question_list`` (pairing stops at the shorter sequence).

    Returns:
        The concatenation of all rendered pairs, with no separator.
    """
    whitespace_run = re.compile(r"\s+")

    def _clean(text):
        # ``\s+`` also matches "\n" and "\r", so newlines collapse to a space too.
        return normalize_to_ascii(whitespace_run.sub(" ", text).strip())

    pieces = []
    for question, answer in tqdm(list(zip(question_list, answer_list))):
        pieces.append("<s>" + "<q>" + _clean(question) + "<a>" + _clean(answer) + "</s>")

    return "".join(pieces).lower()

In [37]:
# Build the <s><q>...<a>...</s> training string from all QA pairs and tokenize it.
fused = fused_qa(question_list, answer_list)
corpus_encoded = tokenizer.encode(fused, verbose=True)

# Persist the encoded QA corpus; the context manager guarantees the file is
# flushed and closed once the dump finishes.
with open('../corpus/corpus_clean/corpus_web_qa_24k', 'wb') as corpus_file:
    pkl.dump(corpus_encoded, corpus_file)

  0%|          | 0/76496 [00:00<?, ?it/s]

  0%|          | 0/24000 [00:00<?, ?it/s]

## 