# Reading in JSON/Data Wrangling

In [1]:
import numpy as np
import pandas as pd

def answer_data(context, impossible_question):
    """Extract the first answer's 'text' and 'answer_start' for every question of a context.

    For each entry of ``context["qas"]`` the matching flag in ``impossible_question``
    selects which list is read: 'answers' when the question is answerable,
    'plausible_answers' when it is impossible. Only the first element of the
    selected list is used.

    Parameters
    ----------
    context : dict
        One paragraph record holding a "qas" list of question dicts.
    impossible_question : sequence of bool
        One flag per question, in the same order as ``context["qas"]``.

    Returns
    -------
    list of dict
        One ``{"answer_text": ..., "answer_start": ...}`` dict per question.
        Both values are NaN when the selected answer list is empty or missing
        (e.g. an impossible question whose 'plausible_answers' is an empty list);
        indexing ``[0]`` on such a list would otherwise raise an IndexError.

    Raises
    ------
    IndexError
        If ``impossible_question`` has fewer entries than ``context["qas"]``.
    """
    dict_list = []
    for question_idx, qa in enumerate(context["qas"]):
        # Pick the list that holds this question's answers.
        answers_key = "plausible_answers" if impossible_question[question_idx] else "answers"
        candidates = qa.get(answers_key)
        if candidates:
            first_answer = candidates[0]
            dict_list.append({"answer_text": first_answer["text"],
                              "answer_start": first_answer["answer_start"]})
        else:
            # Empty or absent answer list: keep the row, mark both fields as missing.
            # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
            dict_list.append({"answer_text": np.nan, "answer_start": np.nan})
    return dict_list

### Importing and Structuring the Training Set

In [2]:
# Load the training set from the local JSON copy.
# Provenance: the local file was produced once with
#   pd.read_json("https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json")[["data"]].to_json("train_file.json")
# The frame is transposed after reading and given a fresh 0..n-1 index.
train_file = (
    pd.read_json("data/train_file.json", orient="split")
    .transpose()
    .reset_index(drop=True)
)

# Article titles use underscores as word separators; swap them for spaces.
train_file["title"] = train_file["title"].str.replace("_", " ")

# Numpy array of the article titles, in row order.
train_themes = train_file["title"].values

In [3]:
# Build the training dataframe: one row per question, across all themes.
# Per-context frames are collected in a list and concatenated once at the end;
# growing a DataFrame with .append inside the loops copies all rows each time
# and DataFrame.append no longer exists in pandas 2.0.
train_frames = []

# Looping over each theme in the JSON
for n_theme, theme in enumerate(train_themes):
    # Column 0 is read as the theme's list of contexts -- assumes that is the
    # paragraphs column of train_file; TODO confirm against the loaded frame.
    theme_contexts = train_file.iloc[n_theme, 0]

    # Going over all 'contexts' (paragraphs of data) within theme
    for context in theme_contexts:
        context_df = pd.DataFrame(data=context["qas"]).loc[:, ["question", "id", "is_impossible"]]
        context_df["context"] = context["context"]
        impossible_question = context_df["is_impossible"].values  # bool array to denote if question is impossible
        context_df = context_df.join(pd.DataFrame(data=answer_data(context, impossible_question)))
        context_df["title"] = theme
        train_frames.append(context_df)

    # Status print
    print("%d/%d ---- %.3f%%" % (n_theme+1, len(train_themes), (float(n_theme+1)/len(train_themes))*100))

# Single concatenation; fall back to an empty frame when there was nothing to read.
train_df = pd.concat(train_frames, ignore_index=True) if train_frames else pd.DataFrame()

print(train_df)
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
train_df.info()

1/442 ---- 0.226%
2/442 ---- 0.452%
3/442 ---- 0.679%
4/442 ---- 0.905%
5/442 ---- 1.131%
6/442 ---- 1.357%
7/442 ---- 1.584%
8/442 ---- 1.810%
9/442 ---- 2.036%
10/442 ---- 2.262%
11/442 ---- 2.489%
12/442 ---- 2.715%
13/442 ---- 2.941%
14/442 ---- 3.167%
15/442 ---- 3.394%
16/442 ---- 3.620%
17/442 ---- 3.846%
18/442 ---- 4.072%
19/442 ---- 4.299%
20/442 ---- 4.525%
21/442 ---- 4.751%
22/442 ---- 4.977%
23/442 ---- 5.204%
24/442 ---- 5.430%
25/442 ---- 5.656%
26/442 ---- 5.882%
27/442 ---- 6.109%
28/442 ---- 6.335%
29/442 ---- 6.561%
30/442 ---- 6.787%
31/442 ---- 7.014%
32/442 ---- 7.240%
33/442 ---- 7.466%
34/442 ---- 7.692%
35/442 ---- 7.919%
36/442 ---- 8.145%
37/442 ---- 8.371%
38/442 ---- 8.597%
39/442 ---- 8.824%
40/442 ---- 9.050%
41/442 ---- 9.276%
42/442 ---- 9.502%
43/442 ---- 9.729%
44/442 ---- 9.955%
45/442 ---- 10.181%
46/442 ---- 10.407%
47/442 ---- 10.633%
48/442 ---- 10.860%
49/442 ---- 11.086%
50/442 ---- 11.312%
51/442 ---- 11.538%
52/442 ---- 11.765%
53/442 ---- 1

399/442 ---- 90.271%
400/442 ---- 90.498%
401/442 ---- 90.724%
402/442 ---- 90.950%
403/442 ---- 91.176%
404/442 ---- 91.403%
405/442 ---- 91.629%
406/442 ---- 91.855%
407/442 ---- 92.081%
408/442 ---- 92.308%
409/442 ---- 92.534%
410/442 ---- 92.760%
411/442 ---- 92.986%
412/442 ---- 93.213%
413/442 ---- 93.439%
414/442 ---- 93.665%
415/442 ---- 93.891%
416/442 ---- 94.118%
417/442 ---- 94.344%
418/442 ---- 94.570%
419/442 ---- 94.796%
420/442 ---- 95.023%
421/442 ---- 95.249%
422/442 ---- 95.475%
423/442 ---- 95.701%
424/442 ---- 95.928%
425/442 ---- 96.154%
426/442 ---- 96.380%
427/442 ---- 96.606%
428/442 ---- 96.833%
429/442 ---- 97.059%
430/442 ---- 97.285%
431/442 ---- 97.511%
432/442 ---- 97.738%
433/442 ---- 97.964%
434/442 ---- 98.190%
435/442 ---- 98.416%
436/442 ---- 98.643%
437/442 ---- 98.869%
438/442 ---- 99.095%
439/442 ---- 99.321%
440/442 ---- 99.548%
441/442 ---- 99.774%
442/442 ---- 100.000%
                                                 question  \
0             

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130319 entries, 0 to 130318
Data columns (total 7 columns):
question         130319 non-null object
id               130319 non-null object
is_impossible    130319 non-null bool
context          130319 non-null object
answer_start     130319 non-null int64
answer_text      130319 non-null object
title            130319 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 6.1+ MB
None


### Importing and Structuring the Development Set
Having constructed a dataframe for the training set, we will now do the same for the dev set.

In [4]:
# Load the development set from the local JSON copy.
# Provenance: the local file was produced once with
#   pd.read_json("https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json")[["data"]].to_json("dev_file.json")
# The frame is transposed after reading and given a fresh 0..n-1 index.
dev_file2 = (
    pd.read_json("data/dev_file.json", orient="split")
    .transpose()
    .reset_index(drop=True)
)

# Replace underscores in the article titles with spaces, then keep the titles as a numpy array.
dev_file2["title"] = dev_file2["title"].str.replace("_", " ")
dev_themes = dev_file2["title"].values

In [5]:
# Build the development dataframe: one row per question, across all themes.
# Per-context frames are collected in a list and concatenated once at the end;
# growing a DataFrame with .append inside the loops copies all rows each time
# and DataFrame.append no longer exists in pandas 2.0.
dev_frames = []

# Looping over each theme in the JSON
for n_theme, theme in enumerate(dev_themes):
    # Column 0 is read as the theme's list of contexts -- assumes that is the
    # paragraphs column of dev_file2; TODO confirm against the loaded frame.
    theme_contexts = dev_file2.iloc[n_theme, 0]

    # Going over all 'contexts' (paragraphs of data) within theme
    for context in theme_contexts:
        context_df = pd.DataFrame(data=context["qas"]).loc[:, ["question", "id", "is_impossible"]]
        context_df["context"] = context["context"]
        impossible_question = context_df["is_impossible"].values  # bool array to denote if question is impossible
        context_df = context_df.join(pd.DataFrame(data=answer_data(context, impossible_question)))
        context_df["title"] = theme
        dev_frames.append(context_df)

    # Status print
    print("%d/%d ---- %.3f%%" % (n_theme+1, len(dev_themes), (float(n_theme+1)/len(dev_themes))*100))

# Single concatenation; fall back to an empty frame when there was nothing to read.
dev_df = pd.concat(dev_frames, ignore_index=True) if dev_frames else pd.DataFrame()

print(dev_df)
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
dev_df.info()

1/35 ---- 2.857%
2/35 ---- 5.714%
3/35 ---- 8.571%
4/35 ---- 11.429%
5/35 ---- 14.286%
6/35 ---- 17.143%
7/35 ---- 20.000%
8/35 ---- 22.857%
9/35 ---- 25.714%
10/35 ---- 28.571%
11/35 ---- 31.429%
12/35 ---- 34.286%
13/35 ---- 37.143%
14/35 ---- 40.000%
15/35 ---- 42.857%
16/35 ---- 45.714%
17/35 ---- 48.571%
18/35 ---- 51.429%
19/35 ---- 54.286%
20/35 ---- 57.143%
21/35 ---- 60.000%
22/35 ---- 62.857%
23/35 ---- 65.714%
24/35 ---- 68.571%
25/35 ---- 71.429%
26/35 ---- 74.286%
27/35 ---- 77.143%
28/35 ---- 80.000%
29/35 ---- 82.857%
30/35 ---- 85.714%
31/35 ---- 88.571%
32/35 ---- 91.429%
33/35 ---- 94.286%
34/35 ---- 97.143%
35/35 ---- 100.000%
                                                question  \
0                   In what country is Normandy located?   
1                     When were the Normans in Normandy?   
2          From which countries did the Norse originate?   
3                              Who was the Norse leader?   
4      What century did the Normans first gain

### Exporting dataframes to JSON files
Having completed the data wrangling stages, we will now export the dataframes as JSON files, ready to be imported in the next stage of the project.

In [6]:
# Write both dataframes out as JSON, one file per dataset.
# NOTE(review): the destinations are absolute paths on one specific machine;
# the cell fails anywhere that directory does not exist -- consider a relative data/ path.
for frame, destination in (
    (train_df, "C:/Users/Lukas Buteliauskas/Desktop/training_data.json"),
    (dev_df, "C:/Users/Lukas Buteliauskas/Desktop/validation_data.json"),
):
    frame.to_json(destination)