## LLM: SQuAD question-answer corpus and BPE tokenizer preparation

In [1]:
import matplotlib.pyplot as plt
import random
from IPython.display import clear_output
from src.tokenizer import TokenizerBPE, word_split, normalize_to_ascii, pair_freq


import os
# Restrict CUDA to device 0. This is assigned before TensorFlow is imported
# on the next line so the setting is already in place when TF starts up.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import tensorflow as tf
import numpy as np
import pickle as pkl
from tqdm.notebook import tqdm
from src.data_handling import read_first_n

# disable gpu for testing purposes
# NOTE(review): this override sits after the TensorFlow import; presumably it
# should move above `import tensorflow` if it is ever re-enabled - confirm.
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Sanity check: report how many GPU devices TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  1


In [2]:
import json

In [3]:
# Read the SQuAD v2.0 training file and parse it into `squad`.
with open('corpus/train-v2.0.json', mode='r', encoding='utf-8') as f:
    squad = json.loads(f.read())

In [4]:
# Collect every answerable question together with the text of its first
# listed answer. Entries flagged `is_impossible` are skipped.
question_list = []
answer_list = []

for article in squad['data']:
    title = article['title']
    print(f"Title: {title}")
    # Only this one article has its question/answer pairs echoed, as a spot check.
    show_samples = title == "The_Legend_of_Zelda:_Twilight_Princess"
    for paragraph in article['paragraphs']:
        for qa_entry in paragraph['qas']:
            if qa_entry['is_impossible']:
                continue

            question_text = qa_entry['question']
            answers = qa_entry['answers']
            if show_samples:
                print(f"Question: {question_text}")
                print(f"Answer: {answers[0]['text']}")

            question_list.append(question_text)
            answer_list.append(answers[0]['text'])
                    

Title: Beyoncé
Title: Frédéric_Chopin
Title: Sino-Tibetan_relations_during_the_Ming_dynasty
Title: IPod
Title: The_Legend_of_Zelda:_Twilight_Princess
Question: What category of game is Legend of Zelda: Twilight Princess?
Answer: action-adventure
Question: What consoles can be used to play Twilight Princess?
Answer: GameCube and Wii
Question: When was Twilight Princess launched in North America?
Answer: November 2006
Question: When could GameCube owners purchase Twilight Princess?
Answer: December 2006
Question: What company developed Legend of Zelda: Twilight Princess?
Answer: Nintendo
Question: What year was the Legend of Zelda:Twilight Princess originally planned for release?
Answer: 2005
Question: What year was the Wii version of Legend of Zelda: Twilight Princess released?
Answer: 2006
Question: Who is the main character of the story?
Answer: Link
Question: What land does Link serve to protect?
Answer: Hyrule
Question: What character helped Link in Twilight Princess?
Answer: Midna


In [5]:
def fused_qa(question_list, answer_list, tokenizer):
    """Fuse question/answer pairs into one token sequence.

    Each pair contributes, in order: ``<s>``, ``<q>``, the tokenized question,
    ``<a>``, the tokenized answer, ``</s>``. All pieces are concatenated
    along axis 1.

    Args:
        question_list: Iterable of question strings.
        answer_list: Iterable of answer strings, paired positionally with
            ``question_list`` (``zip`` stops at the shorter of the two).
        tokenizer: Object exposing a ``token_to_idx`` mapping containing the
            four special tokens, and a ``tokenize(text)`` method.
            Assumes ``tokenize`` returns a rank-2 tensor with a leading
            dimension of 1, matching the special-token tensors - TODO confirm.

    Returns:
        The result of ``tf.concat`` over all pieces along axis 1.
    """
    def special(token):
        # Wrap a special-token id as a 1x1 tensor so it concatenates on axis 1.
        return tf.convert_to_tensor([[tokenizer.token_to_idx[token]]])

    q_marker = special("<q>")
    a_marker = special("<a>")
    sos = special("<s>")
    eos = special("</s>")

    pieces = []
    pairs = list(zip(question_list, answer_list))
    for question, answer in tqdm(pairs):
        q_tokens = tokenizer.tokenize(question)
        a_tokens = tokenizer.tokenize(answer)
        pieces += [sos, q_marker, q_tokens, a_marker, a_tokens, eos]

    return tf.concat(pieces, axis=1)
        

In [None]:
# Train a BPE tokenizer on all questions and answers (size argument 16000,
# lowercased input) and pickle it under tokenizers/.

# Create the output directory up front so a missing folder fails (or is
# fixed) before the long training run rather than after it.
os.makedirs("tokenizers", exist_ok=True)

corpus = question_list + answer_list
tokenizer = TokenizerBPE(corpus, 16000, lowercase=True)
# NOTE(review): presumably drops a lookup structure that is rebuilt with
# create_hash() after loading - confirm against src.tokenizer.
tokenizer.destroy_hash()
# Use a context manager so the file handle is flushed and closed deterministically.
with open("tokenizers/tokenizer_QA16000.pkl", "wb") as tokenizer_file:
    pkl.dump(tokenizer, tokenizer_file)


Lowercasing corpus


  0%|          | 0/173642 [00:00<?, ?it/s]

Merging tokens


  0%|          | 0/16000 [00:00<?, ?it/s]

t h 121353
a t 104386
i n 100775
th e 91306
a n 84928
e r 83116
w h 80652
o n 76529
r e 64615
e n 61465
s t 51844
wh at 50619
o r 50008
a l 48732
a r 46663
o f 46540
e d 45972
a s 41141
e s 40357
i s 40255
i c 39046
i t 34395
i on 30651
t o 28896
i d 28064
r o 27928
in g 24980
l e 24439
c h 23511
an d 23140
o u 23025
en t 22118
a m 18236
w as 18009
s e 17619
o w 17371
o m 17191
o l 16803
i l 16727
b e 15964
d id 15945
c t 15576
d e 14560
v e 14100
c e 13927
at ion 13830
f or 13696
u r 13528
u s 13157
u n 12879
a d 12267
i g 12257
i r 11680
m an 11492
c on 11469
p e 11438
g e 11020
i m 10582
h ow 10493
o p 10488
l a 10444
t er 9824
l y 9780
' s 9765
wh o 9764
v er 9665
a re 9641
d o 9609
er e 9381
c om 9093
t e 8848
m o 8706
u l 8643
r a 8536
n e 8278
r i 8076
m a 8022
ic h 7665
0 0 7615
d i 7584
p er 7505
p ro 7492
at e 7421
re s 7376
it y 7319
a b 7279
1 9 7206
l i 7048
wh en 6806
e m 6736
e ar 6711
ou n 6664
a p 6645
wh ich 6628
c i 6588
l o 6451
e l 6217
w i 6157
al l 6094
m ent 607

In [29]:
random.seed(42)

# Shuffle questions and answers as pairs so they stay aligned.
# NOTE(review): this rebinds question_list/answer_list (as tuples), so
# re-running the cell shuffles the already-shuffled data and yields a
# different order than a fresh Restart & Run All.
qa = list(zip(question_list, answer_list))
random.shuffle(qa)
question_list, answer_list = zip(*qa)

# NOTE(review): this loads "tokenizer_QA8000.pkl" from the working directory,
# while the training cell writes "tokenizers/tokenizer_QA16000.pkl" - confirm
# which tokenizer is intended here.
# pickle.load can execute arbitrary code; only load files you produced yourself.
with open("tokenizer_QA8000.pkl", "rb") as tokenizer_file:
    tokenizer = pkl.load(tokenizer_file)
tokenizer.create_hash()

corpus = fused_qa(question_list, answer_list, tokenizer)
# Context manager ensures the corpus file is flushed and closed.
with open('corpus/QA_8k.pkl', 'wb') as corpus_file:
    pkl.dump(corpus, corpus_file)


  0%|          | 0/86821 [00:00<?, ?it/s]

## 