In [2]:
import json
import time 
import pandas as pd 
import torch
import spacy 
import torchtext
from collections import Counter,OrderedDict 
from torchtext.data.utils import get_tokenizer 

In [3]:
tokenizer = get_tokenizer("basic_english")

In [4]:
path = '../data/squad/train-v1.1.json'

In [5]:
with open(path, 'r', encoding = 'utf-8') as f:
    data = json.load(f)


In [6]:
def parser_data(data):
    '''
    
    
    '''
    print("Parsering from json to DataFrame ............")
    start = time.time()
    data = data['data'] 
    qa_list = [] 
    for paragraphs in data:
        for para in paragraphs['paragraphs']:
            context = para['context'] 
            for qa in para['qas']:
                id = qa['id'] 
                question = qa['question']
                for ans in qa['answers']:
                    answer = ans['text']
                    ans_start = ans['answer_start']
                    ans_end = ans_start + len(answer)

                    qa_dict = dict() 
                    qa_dict['context'] = context
                    qa_dict['question'] = question
                    qa_dict['answer'] = answer 
                    qa_dict['ans_start'] = ans_start 
                    qa_dict['ans_end'] = ans_end 
                    qa_list.append(qa_dict) 
    end = time.time() 
    print("Number of Q/A: ",len(qa_list))
    print(f"Parser data from json to DataFrame in {end- start}s")
    print("--------------------------------------------------------------------")
    return pd.DataFrame(qa_list) 

In [7]:
df = parser_data(data)

Parsering from json to DataFrame ............
Number of Q/A:  87599
Parser data from json to DataFrame in 0.3076024055480957s
--------------------------------------------------------------------


In [8]:
df

Unnamed: 0,context,question,answer,ans_start,ans_end
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,515,541
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,188,213
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,279,296
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,381,420
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,92,126
...,...,...,...,...,...
87594,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,Oregon,229,235
87595,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,Rangoon,414,421
87596,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,Minsk,476,481
87597,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,1975,199,203


In [9]:
def text2vocab(data):
    ''' 
    
    
    
    '''
    print("Building text vocab ...............")
    text = [] 
    total = 0 
    start = time.time()
    for paragraphs in data:
        context_unique = list(paragraphs.context.unique())
        question_unique = list(paragraphs.question.unique()) 
        text.extend(context_unique)
        text.extend(question_unique) 
    print("Sum of context + question: ",len(text)) 
    end = time.time()
    print(f"Build text vocab in {end-start}s")
    print("-------------------------------------------------------------------")
    return text 


In [10]:
text_vocab = text2vocab([df])

Building text vocab ...............
Sum of context + question:  106246
Build text vocab in 0.5262694358825684s
-------------------------------------------------------------------


In [11]:
from torchtext.vocab import vocab

In [12]:
def build_word_vocab(vocab_text):
    '''
    
    
    '''
    print("Building word vocab ..................")
    start = time.time()
    words = []
    for seq in vocab_text:
        words.extend(tokenizer(seq))
    word_counter = Counter(words) 
    sorted_by_freq = sorted(word_counter.items(), key = lambda x:x[1],reverse=True)
    ordered_dict = OrderedDict(sorted_by_freq)
    vocab_ = vocab(ordered_dict,specials=['<unk>'])
    end = time.time() 
    print(f"Len word vocab: {vocab_.__len__()}")
    print(f"Build word vocab in: {end - start}s")
    print("-------------------------------------------------------------------")
    return vocab_

In [13]:
word_vocab = build_word_vocab(text_vocab)

Building word vocab ..................
Len word vocab: 99177
Build word vocab in: 10.090356588363647s
-------------------------------------------------------------------


In [14]:
def build_char_vocab(vocab_text):
    ''' 
    
    
    '''
    print("Building char vocab ..........") 
    start = time.time() 
    chars = []
    for seq in vocab_text:
        for ch in seq:
            chars.append(ch) 
    char_counter = Counter(chars)
    sorted_by_freq = sorted(char_counter.items(), key = lambda x:x[1], reverse=True)
    ordered_dict = OrderedDict(sorted_by_freq)
    vocab_ = vocab(ordered_dict,min_freq=20,specials = ['<unk>','<pad>'])
    print(f"Len char vocab: {vocab_.__len__()}")
    end = time.time()
    print(f"Build char vocab in: {end - start}s")
    print("-------------------------------------------------------------------")
    return vocab_

In [15]:
char_vocab = build_char_vocab(text_vocab)

Building char vocab ..........
Len char vocab: 230
Build char vocab in: 5.175764083862305s
-------------------------------------------------------------------


In [16]:
word_pipeline = lambda x: word_vocab(tokenizer(x))
char_pipeline = lambda x: char_vocab([i for i in x]) 


In [17]:
from torch.utils.data import Dataset

In [18]:
def convert_idx(text,ans_start,ans_end):
    current = 0
    spans = []
    answer_span = []
    text = text.replace('"',"'").lower()
    tokens = tokenizer(text)
    for token in tokens:
        current = text.find(token, current)
        if current < 0:
            print(f"Token {token} cannot be found")
            raise Exception()
        spans.append((current, current + len(token)))
        current += len(token)
    for idx,span in enumerate(spans):
        if not (ans_end <= span[0] or ans_start >= span[1]):
            answer_span.append(idx)
    y1,y2 = answer_span[0],answer_span[-1]
    return (y1,y2)

In [19]:
df['label'] = [convert_idx(x['context'],x['ans_start'],x['ans_end']) for _,x in df.iterrows()]

In [20]:
df['context_word'] = df['context'].apply(word_pipeline)
df['question_word'] = df['question'].apply(word_pipeline)

In [21]:
df

Unnamed: 0,context,question,answer,ans_start,ans_end,label,context_word,question_word
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,515,541,"(103, 105)","[15185, 2, 1, 131, 38, 9, 545, 744, 4, 7958, 1...","[8, 556, 23, 1, 2718, 745, 6063, 1042, 5, 7784..."
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,188,213,"(38, 42)","[15185, 2, 1, 131, 38, 9, 545, 744, 4, 7958, 1...","[10, 11, 5, 1186, 3, 1, 1092, 1045, 239, 304, 6]"
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,279,296,"(58, 60)","[15185, 2, 1, 131, 38, 9, 545, 744, 4, 7958, 1...","[1, 4208, 3, 1, 3718, 1406, 29, 1092, 1045, 11..."
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,381,420,"(77, 83)","[15185, 2, 1, 131, 38, 9, 545, 744, 4, 7958, 1...","[10, 11, 1, 18370, 29, 1092, 1045, 6]"
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,92,126,"(18, 24)","[15185, 2, 1, 131, 38, 9, 545, 744, 4, 7958, 1...","[10, 8838, 22, 430, 3, 1, 239, 304, 29, 1092, ..."
...,...,...,...,...,...,...,...,...
87594,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,Oregon,229,235,"(39, 39)","[1500, 1174, 49, 20, 27388, 19, 2, 5, 229, 8, ...","[5, 10, 186, 76, 23, 1500, 41, 1452, 31, 181, ..."
87595,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,Rangoon,414,421,"(71, 71)","[1500, 1174, 49, 20, 27388, 19, 2, 5, 229, 8, ...","[10, 12, 16983, 1022, 89, 14, 6]"
87596,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,Minsk,476,481,"(88, 88)","[1500, 1174, 49, 20, 27388, 19, 2, 5, 229, 8, ...","[21, 10, 49088, 49, 55, 1500, 37, 9, 797, 6]"
87597,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,1975,199,203,"(32, 32)","[1500, 1174, 49, 20, 27388, 19, 2, 5, 229, 8, ...","[5, 10, 58, 23, 1500, 701, 44, 1628, 181, 797, 6]"


In [22]:
from torch.utils.data import Dataset

In [32]:
df[['context','question']]

Unnamed: 0,context,question
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...
...,...,...
87594,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...
87595,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?
87596,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...
87597,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...


In [62]:
class Squad(Dataset):


In [56]:
data = Squad(df)

In [46]:
data

<__main__.Squad at 0x7f5fcef64750>

In [27]:
from torch.utils.data import DataLoader

In [57]:
train_loader=DataLoader(data,batch_size=10,shuffle=False,collate_fn= collate_batch)

In [58]:
for i,(data,label) in enumerate(train_loader):
    print(data)
    print(label)
    break

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found object

In [59]:

def make_char_vector(sentence,max_seq,max_word_ctx): 
    tokens = tokenizer(sentence)
    char_vec = torch.ones(max_seq,max_word_ctx).type(torch.LongTensor)
    for i, word in enumerate(tokens):
        for j, ch in enumerate(word):
            char_vec[i][j] = char_vocab[ch]
    return char_vec

In [71]:
class Squad():
    def __init__(self,data,batch_size):
        self.batch_size = batch_size
        data = [data[i:i+self.batch_size] for i in range(0, len(data), self.batch_size)]
        self.data = data 
        # self.x_train = self.data[['context','question','context_word','question_word']]
        # self.y_train = self.data['label']
    def __len__(self):
        return len(self.data)
    def __getitem__(self,index):
        return self.data[index]
    def __iter__(self):
        for batch in self.data:
            max_seq = max([len(ctx) for ctx in batch['context_word']]) 
            padded_context_word =torch.LongTensor(len(batch),max_seq).fill_(1)
            for i,ctx in enumerate(batch['context_word']):
                padded_context_word[i,:len(ctx)] = torch.LongTensor(ctx)
            max_word_ctx = [[len(i) for i in tokenizer(context)]for context in batch['context']]
            max_word_ctx = max([max(i) for i in max_word_ctx])
            padded_context_char = torch.ones(len(batch),max_seq,max_word_ctx).type(torch.LongTensor)


            for i, context in enumerate(batch['context']):
                padded_context_char[i] = make_char_vector(context,max_seq,max_word_ctx) 
            
            max_seq_question = max([len(question) for question in batch['question']]) 
            padded_question_word = torch.LongTensor(len(batch),max_seq_question).fill_(1)
            for i,q in enumerate(batch['question_word']):
                padded_question_word[i,:len(q)] = torch.LongTensor(q) 
            max_word_question = [[len(i) for i in tokenizer(question)] for question in batch['question']]
            max_word_question = max([max(i) for i in max_word_question])
            padded_question_char = torch.ones(len(batch),max_seq_question,max_word_question).type(torch.LongTensor)
            for i,question in enumerate(batch['question']):
                padded_question_char[i] = make_char_vector(question,max_seq_question,max_word_question) 
            
            label = torch.LongTensor(list(batch['label']))

            yield padded_context_word,padded_context_char,padded_question_word,padded_question_char,label

In [81]:
len(df)

87599

In [72]:
data = Squad(df,8)

In [73]:
data.__getitem__(1)

Unnamed: 0,context,question,answer,ans_start,ans_end,label,context_word,question_word
8,"As at most other universities, Notre Dame's st...",How many student news papers are found at Notr...,three,126,131,"(24, 24)","[14, 29, 51, 45, 612, 2, 1092, 1045, 13, 16, 3...","[34, 36, 1120, 1109, 3644, 24, 135, 29, 1092, ..."
9,"As at most other universities, Notre Dame's st...",In what year did the student paper Common Sens...,1987,908,912,"(160, 160)","[14, 29, 51, 45, 612, 2, 1092, 1045, 13, 16, 3...","[5, 10, 58, 23, 1, 1120, 782, 185, 1252, 357, ..."
10,The university is the major seat of the Congre...,Where is the headquarters of the Congregation ...,Rome,119,123,"(22, 22)","[1, 110, 11, 1, 123, 2826, 3, 1, 4569, 3, 1442...","[52, 11, 1, 1381, 3, 1, 4569, 3, 1, 1442, 1319..."
11,The university is the major seat of the Congre...,What is the primary seminary of the Congregati...,Moreau Seminary,145,160,"(29, 30)","[1, 110, 11, 1, 123, 2826, 3, 1, 4569, 3, 1442...","[10, 11, 1, 483, 6116, 3, 1, 4569, 3, 1, 1442,..."
12,The university is the major seat of the Congre...,What is the oldest structure at Notre Dame?,Old College,234,245,"(47, 48)","[1, 110, 11, 1, 123, 2826, 3, 1, 4569, 3, 1442...","[10, 11, 1, 871, 732, 29, 1092, 1045, 6]"
13,The university is the major seat of the Congre...,What individuals live at Fatima House at Notre...,Retired priests and brothers,356,384,"(70, 73)","[1, 110, 11, 1, 123, 2826, 3, 1, 4569, 3, 1442...","[10, 1069, 358, 29, 27418, 231, 29, 1092, 1045..."
14,The university is the major seat of the Congre...,Which prize did Frederick Buechner create?,Buechner Prize for Preaching,675,703,"(127, 130)","[1, 110, 11, 1, 123, 2826, 3, 1, 4569, 3, 1442...","[26, 1935, 23, 2258, 27419, 701, 6]"
15,The College of Engineering was established in ...,How many BS level degrees are offered in the C...,eight,487,492,"(79, 79)","[1, 292, 3, 1544, 12, 220, 5, 3366, 2, 121, 2,...","[34, 36, 26068, 469, 1493, 24, 1303, 5, 1, 292..."


In [82]:
87599/8

10949.875

In [79]:
data.__len__()

10950

In [66]:
for i in train_loader:
    print(i)
    break

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found object