#                    Question Answering Capstone Project - Reading in JSON/Data Wrangling

### Importing Modules, Defining Custom Functions

In [1]:
import numpy as np
import pandas as pd

def answer_data(context, impossible_question):
    """Extract the 'text' and 'answer_start' fields of the first answer of every question in a context.

    The result is used to build the answer columns of the pandas dataframe.

    Parameters
    ----------
    context : dict
        One 'context' (paragraph) entry of the JSON; its "qas" list holds one dict per question.
    impossible_question : sequence of bool
        One flag per entry of context["qas"]; truthy when the question is impossible.

    Returns
    -------
    list of dict
        One {"answer_text": ..., "answer_start": ...} dict per question, in question order.
        Answerable questions read the first entry of "answers"; impossible questions read the
        first entry of "plausible_answers". When that list is empty or missing, both values
        are NaN instead of raising an IndexError/KeyError.
    """
    dict_list=[]
    for question_idx, question in enumerate(context["qas"]): # iterate over questions
        # The flags are indexed (not zipped) so a too-short flag sequence still raises IndexError.
        if impossible_question[question_idx]:
            candidates=question.get("plausible_answers")
        else:
            candidates=question.get("answers")

        if candidates: # at least one (plausible) answer is available, we read the first one
            first_answer=candidates[0]
            dict_list.append({"answer_text":first_answer["text"],
                              "answer_start":first_answer["answer_start"]})
        else: # the relevant answer list is empty or absent
            # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
            dict_list.append({"answer_text":np.nan, "answer_start":np.nan})
    return dict_list

### Importing and Structuring the Training Set

In [2]:
# Download the training JSON and build one row per theme (article) with its paragraphs and title.
# The records of the downloaded "data" column are expanded directly into a dataframe. The previous
# approach wrote them to "train_file.json" and re-read that file with orient="split" even though it
# has no "index"/"columns" keys, which only works while the parser tolerates the missing keys; it
# also bound `file` to None, because to_json() returns None when given a path.
train_raw=pd.read_json("https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json")
# "paragraphs" is placed first explicitly so that positional access (.iloc[row, 0]) yields the
# list of contexts regardless of the key order inside the JSON records.
train_file=pd.DataFrame(train_raw["data"].tolist())[["paragraphs", "title"]]
train_file["title"]=train_file.loc[:,"title"].str.replace("_"," ")
train_themes=train_file["title"].values

In [3]:
# Creating one dataframe (one row per question) for all themes within the training data set.
# Frames are collected in lists and concatenated once per theme and once overall: DataFrame.append
# was removed in pandas 2.0, and it re-copied the whole accumulated frame on every call.
train_theme_frames=[]

# Looping over each theme in the JSON
for n_theme, theme in enumerate(train_themes):
    # Looked up by column name so the result does not depend on the column order of train_file
    theme_contexts=train_file.loc[n_theme,"paragraphs"]
    context_frames=[]
    # Going over all 'contexts' (paragraphs of data) within theme
    for context in theme_contexts:
        context_df=pd.DataFrame(data=context["qas"]).loc[:,["question", "id", "is_impossible"]]
        context_df["context"]=context["context"]
        impossible_question=context_df["is_impossible"].values #boolean array to denote if given question of context is impossible
        context_df=context_df.join(pd.DataFrame(data=answer_data(context, impossible_question)))
        context_frames.append(context_df)

    # pd.concat raises on an empty list, so a theme without contexts yields an empty frame instead
    theme_df=pd.concat(context_frames, ignore_index=True) if context_frames else pd.DataFrame()
    theme_df["title"]=theme
    train_theme_frames.append(theme_df)

    # Some debugging/progress prints
    print("Added '%s' theme." % theme)
    print(n_theme+1,"/",len(train_themes), "  ------   ", (float(n_theme+1)/float(len(train_themes)))*100, "%")
    theme_df.info() # info() prints its report itself and returns None, so it is not wrapped in print()
    print()

train_df=pd.concat(train_theme_frames, ignore_index=True) if train_theme_frames else pd.DataFrame()
print(train_df)
train_df.info()

Added 'Beyoncé' theme.
1 / 442   ------    0.22624434389140274 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 753 entries, 0 to 752
Data columns (total 7 columns):
question         753 non-null object
id               753 non-null object
is_impossible    753 non-null bool
context          753 non-null object
answer_start     753 non-null int64
answer_text      753 non-null object
title            753 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 36.1+ KB
None 

Added 'Frédéric Chopin' theme.
2 / 442   ------    0.4524886877828055 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 697 entries, 0 to 696
Data columns (total 7 columns):
question         697 non-null object
id               697 non-null object
is_impossible    697 non-null bool
context          697 non-null object
answer_start     697 non-null int64
answer_text      697 non-null object
title            697 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 33.4+ KB
None 

Added 'Ka

Added 'Computer security' theme.
18 / 442   ------    4.072398190045249 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 7 columns):
question         365 non-null object
id               365 non-null object
is_impossible    365 non-null bool
context          365 non-null object
answer_start     365 non-null int64
answer_text      365 non-null object
title            365 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 17.5+ KB
None 

Added 'Orthodox Judaism' theme.
19 / 442   ------    4.298642533936651 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 7 columns):
question         240 non-null object
id               240 non-null object
is_impossible    240 non-null bool
context          240 non-null object
answer_start     240 non-null int64
answer_text      240 non-null object
title            240 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 11.6+ KB
None 


Added 'Gregorian calendar' theme.
35 / 442   ------    7.918552036199094 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 7 columns):
question         210 non-null object
id               210 non-null object
is_impossible    210 non-null bool
context          210 non-null object
answer_start     210 non-null int64
answer_text      210 non-null object
title            210 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 10.1+ KB
None 

Added 'Dog' theme.
36 / 442   ------    8.144796380090497 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 7 columns):
question         392 non-null object
id               392 non-null object
is_impossible    392 non-null bool
context          392 non-null object
answer_start     392 non-null int64
answer_text      392 non-null object
title            392 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 18.8+ KB
None 

Added 'Xbox

Added 'Macintosh' theme.
52 / 442   ------    11.76470588235294 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 487 entries, 0 to 486
Data columns (total 7 columns):
question         487 non-null object
id               487 non-null object
is_impossible    487 non-null bool
context          487 non-null object
answer_start     487 non-null int64
answer_text      487 non-null object
title            487 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 23.4+ KB
None 

Added 'Anti-aircraft warfare' theme.
53 / 442   ------    11.990950226244344 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340 entries, 0 to 339
Data columns (total 7 columns):
question         340 non-null object
id               340 non-null object
is_impossible    340 non-null bool
context          340 non-null object
answer_start     340 non-null int64
answer_text      340 non-null object
title            340 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 16.3+ KB
None 

A

69 / 442   ------    15.610859728506787 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 7 columns):
question         198 non-null object
id               198 non-null object
is_impossible    198 non-null bool
context          198 non-null object
answer_start     198 non-null int64
answer_text      198 non-null object
title            198 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 9.6+ KB
None 

Added 'Capital punishment in the United States' theme.
70 / 442   ------    15.837104072398189 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422 entries, 0 to 421
Data columns (total 7 columns):
question         422 non-null object
id               422 non-null object
is_impossible    422 non-null bool
context          422 non-null object
answer_start     422 non-null int64
answer_text      422 non-null object
title            422 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 20.3+ KB
None 

Added 'A

Added 'Bill %26 Melinda Gates Foundation' theme.
86 / 442   ------    19.457013574660635 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185 entries, 0 to 184
Data columns (total 7 columns):
question         185 non-null object
id               185 non-null object
is_impossible    185 non-null bool
context          185 non-null object
answer_start     185 non-null int64
answer_text      185 non-null object
title            185 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 8.9+ KB
None 

Added 'Montevideo' theme.
87 / 442   ------    19.683257918552037 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247 entries, 0 to 246
Data columns (total 7 columns):
question         247 non-null object
id               247 non-null object
is_impossible    247 non-null bool
context          247 non-null object
answer_start     247 non-null int64
answer_text      247 non-null object
title            247 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 11.9

103 / 442   ------    23.30316742081448 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239 entries, 0 to 238
Data columns (total 7 columns):
question         239 non-null object
id               239 non-null object
is_impossible    239 non-null bool
context          239 non-null object
answer_start     239 non-null int64
answer_text      239 non-null object
title            239 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 11.5+ KB
None 

Added 'Professional wrestling' theme.
104 / 442   ------    23.52941176470588 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245 entries, 0 to 244
Data columns (total 7 columns):
question         245 non-null object
id               245 non-null object
is_impossible    245 non-null bool
context          245 non-null object
answer_start     245 non-null int64
answer_text      245 non-null object
title            245 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 11.8+ KB
None 

Added 'Film speed' theme

Added 'Marvel Comics' theme.
120 / 442   ------    27.149321266968325 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247 entries, 0 to 246
Data columns (total 7 columns):
question         247 non-null object
id               247 non-null object
is_impossible    247 non-null bool
context          247 non-null object
answer_start     247 non-null int64
answer_text      247 non-null object
title            247 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 11.9+ KB
None 

Added 'British Empire' theme.
121 / 442   ------    27.37556561085973 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293 entries, 0 to 292
Data columns (total 7 columns):
question         293 non-null object
id               293 non-null object
is_impossible    293 non-null bool
context          293 non-null object
answer_start     293 non-null int64
answer_text      293 non-null object
title            293 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 14.1+ KB
None 

Ad

Added 'Gothic architecture' theme.
137 / 442   ------    30.995475113122172 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 7 columns):
question         375 non-null object
id               375 non-null object
is_impossible    375 non-null bool
context          375 non-null object
answer_start     375 non-null int64
answer_text      375 non-null object
title            375 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 18.0+ KB
None 

Added 'Cubism' theme.
138 / 442   ------    31.221719457013574 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 7 columns):
question         294 non-null object
id               294 non-null object
is_impossible    294 non-null bool
context          294 non-null object
answer_start     294 non-null int64
answer_text      294 non-null object
title            294 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 14.1+ KB
None 

Add

Added 'Avicenna' theme.
154 / 442   ------    34.841628959276015 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 7 columns):
question         440 non-null object
id               440 non-null object
is_impossible    440 non-null bool
context          440 non-null object
answer_start     440 non-null int64
answer_text      440 non-null object
title            440 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 21.1+ KB
None 

Added 'Chinese characters' theme.
155 / 442   ------    35.06787330316742 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168 entries, 0 to 167
Data columns (total 7 columns):
question         168 non-null object
id               168 non-null object
is_impossible    168 non-null bool
context          168 non-null object
answer_start     168 non-null int64
answer_text      168 non-null object
title            168 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 8.1+ KB
None 

Adde

Added 'Alsace' theme.
171 / 442   ------    38.68778280542987 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228 entries, 0 to 227
Data columns (total 7 columns):
question         228 non-null object
id               228 non-null object
is_impossible    228 non-null bool
context          228 non-null object
answer_start     228 non-null int64
answer_text      228 non-null object
title            228 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 11.0+ KB
None 

Added 'Carnival' theme.
172 / 442   ------    38.91402714932127 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 259 entries, 0 to 258
Data columns (total 7 columns):
question         259 non-null object
id               259 non-null object
is_impossible    259 non-null bool
context          259 non-null object
answer_start     259 non-null int64
answer_text      259 non-null object
title            259 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 12.5+ KB
None 

Added 'Baptists'

Added 'Dominican Order' theme.
188 / 442   ------    42.53393665158371 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413 entries, 0 to 412
Data columns (total 7 columns):
question         413 non-null object
id               413 non-null object
is_impossible    413 non-null bool
context          413 non-null object
answer_start     413 non-null int64
answer_text      413 non-null object
title            413 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 19.8+ KB
None 

Added 'Eton College' theme.
189 / 442   ------    42.76018099547511 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 7 columns):
question         374 non-null object
id               374 non-null object
is_impossible    374 non-null bool
context          374 non-null object
answer_start     374 non-null int64
answer_text      374 non-null object
title            374 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 18.0+ KB
None 

Add

Added 'Strasbourg' theme.
205 / 442   ------    46.380090497737555 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 321 entries, 0 to 320
Data columns (total 7 columns):
question         321 non-null object
id               321 non-null object
is_impossible    321 non-null bool
context          321 non-null object
answer_start     321 non-null int64
answer_text      321 non-null object
title            321 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 15.4+ KB
None 

Added 'Oklahoma' theme.
206 / 442   ------    46.60633484162896 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 7 columns):
question         271 non-null object
id               271 non-null object
is_impossible    271 non-null bool
context          271 non-null object
answer_start     271 non-null int64
answer_text      271 non-null object
title            271 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 13.0+ KB
None 

Added 'Hist

Added 'Federalism' theme.
222 / 442   ------    50.2262443438914 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338 entries, 0 to 337
Data columns (total 7 columns):
question         338 non-null object
id               338 non-null object
is_impossible    338 non-null bool
context          338 non-null object
answer_start     338 non-null int64
answer_text      338 non-null object
title            338 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 16.3+ KB
None 

Added 'Annelid' theme.
223 / 442   ------    50.452488687782804 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 314 entries, 0 to 313
Data columns (total 7 columns):
question         314 non-null object
id               314 non-null object
is_impossible    314 non-null bool
context          314 non-null object
answer_start     314 non-null int64
answer_text      314 non-null object
title            314 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 15.1+ KB
None 

Added 'IPod' 

Added 'Liberal Party of Australia' theme.
239 / 442   ------    54.07239819004525 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216 entries, 0 to 215
Data columns (total 7 columns):
question         216 non-null object
id               216 non-null object
is_impossible    216 non-null bool
context          216 non-null object
answer_start     216 non-null int64
answer_text      216 non-null object
title            216 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 10.4+ KB
None 

Added 'Samurai' theme.
240 / 442   ------    54.29864253393665 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 7 columns):
question         235 non-null object
id               235 non-null object
is_impossible    235 non-null bool
context          235 non-null object
answer_start     235 non-null int64
answer_text      235 non-null object
title            235 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 11.3+ KB
None

Added 'East Prussia' theme.
256 / 442   ------    57.9185520361991 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 7 columns):
question         211 non-null object
id               211 non-null object
is_impossible    211 non-null bool
context          211 non-null object
answer_start     211 non-null int64
answer_text      211 non-null object
title            211 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 10.2+ KB
None 

Added 'Ottoman Empire' theme.
257 / 442   ------    58.144796380090504 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340 entries, 0 to 339
Data columns (total 7 columns):
question         340 non-null object
id               340 non-null object
is_impossible    340 non-null bool
context          340 non-null object
answer_start     340 non-null int64
answer_text      340 non-null object
title            340 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 16.3+ KB
None 

Adde

Added 'Education' theme.
273 / 442   ------    61.76470588235294 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267 entries, 0 to 266
Data columns (total 7 columns):
question         267 non-null object
id               267 non-null object
is_impossible    267 non-null bool
context          267 non-null object
answer_start     267 non-null int64
answer_text      267 non-null object
title            267 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 12.9+ KB
None 

Added 'Tennessee' theme.
274 / 442   ------    61.990950226244344 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 7 columns):
question         232 non-null object
id               232 non-null object
is_impossible    232 non-null bool
context          232 non-null object
answer_start     232 non-null int64
answer_text      232 non-null object
title            232 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 11.2+ KB
None 

Added 'Post

Added 'Vacuum' theme.
290 / 442   ------    65.61085972850678 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282 entries, 0 to 281
Data columns (total 7 columns):
question         282 non-null object
id               282 non-null object
is_impossible    282 non-null bool
context          282 non-null object
answer_start     282 non-null int64
answer_text      282 non-null object
title            282 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 13.6+ KB
None 

Added 'Materialism' theme.
291 / 442   ------    65.83710407239819 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203 entries, 0 to 202
Data columns (total 7 columns):
question         203 non-null object
id               203 non-null object
is_impossible    203 non-null bool
context          203 non-null object
answer_start     203 non-null int64
answer_text      203 non-null object
title            203 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 9.8+ KB
None 

Added 'Han dyn

Added 'Database' theme.
307 / 442   ------    69.45701357466064 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 7 columns):
question         272 non-null object
id               272 non-null object
is_impossible    272 non-null bool
context          272 non-null object
answer_start     272 non-null int64
answer_text      272 non-null object
title            272 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 13.1+ KB
None 

Added 'Tucson, Arizona' theme.
308 / 442   ------    69.68325791855203 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370 entries, 0 to 369
Data columns (total 7 columns):
question         370 non-null object
id               370 non-null object
is_impossible    370 non-null bool
context          370 non-null object
answer_start     370 non-null int64
answer_text      370 non-null object
title            370 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 17.8+ KB
None 

Added '

Added 'Oklahoma City' theme.
324 / 442   ------    73.30316742081448 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179 entries, 0 to 178
Data columns (total 7 columns):
question         179 non-null object
id               179 non-null object
is_impossible    179 non-null bool
context          179 non-null object
answer_start     179 non-null int64
answer_text      179 non-null object
title            179 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 8.6+ KB
None 

Added 'Alfred North Whitehead' theme.
325 / 442   ------    73.52941176470588 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 502 entries, 0 to 501
Data columns (total 7 columns):
question         502 non-null object
id               502 non-null object
is_impossible    502 non-null bool
context          502 non-null object
answer_start     502 non-null int64
answer_text      502 non-null object
title            502 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 24.1+ KB
Non

Added 'Light-emitting diode' theme.
341 / 442   ------    77.14932126696833 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 385 entries, 0 to 384
Data columns (total 7 columns):
question         385 non-null object
id               385 non-null object
is_impossible    385 non-null bool
context          385 non-null object
answer_start     385 non-null int64
answer_text      385 non-null object
title            385 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 18.5+ KB
None 

Added 'Great power' theme.
342 / 442   ------    77.37556561085974 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188 entries, 0 to 187
Data columns (total 7 columns):
question         188 non-null object
id               188 non-null object
is_impossible    188 non-null bool
context          188 non-null object
answer_start     188 non-null int64
answer_text      188 non-null object
title            188 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 9.1+ KB
None 



Added 'Russian Soviet Federative Socialist Republic' theme.
358 / 442   ------    80.99547511312217 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 345 entries, 0 to 344
Data columns (total 7 columns):
question         345 non-null object
id               345 non-null object
is_impossible    345 non-null bool
context          345 non-null object
answer_start     345 non-null int64
answer_text      345 non-null object
title            345 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 16.6+ KB
None 

Added 'Armenians' theme.
359 / 442   ------    81.22171945701358 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 7 columns):
question         243 non-null object
id               243 non-null object
is_impossible    243 non-null bool
context          243 non-null object
answer_start     243 non-null int64
answer_text      243 non-null object
title            243 non-null object
dtypes: bool(1), int64(1), object(5)
memory 

Added 'Punjab, Pakistan' theme.
375 / 442   ------    84.84162895927602 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 7 columns):
question         250 non-null object
id               250 non-null object
is_impossible    250 non-null bool
context          250 non-null object
answer_start     250 non-null int64
answer_text      250 non-null object
title            250 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 12.0+ KB
None 

Added 'Richmond, Virginia' theme.
376 / 442   ------    85.06787330316742 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291 entries, 0 to 290
Data columns (total 7 columns):
question         291 non-null object
id               291 non-null object
is_impossible    291 non-null bool
context          291 non-null object
answer_start     291 non-null int64
answer_text      291 non-null object
title            291 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 14.0+ KB
Non

Added 'Canon law' theme.
392 / 442   ------    88.68778280542986 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 7 columns):
question         195 non-null object
id               195 non-null object
is_impossible    195 non-null bool
context          195 non-null object
answer_start     195 non-null int64
answer_text      195 non-null object
title            195 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 9.4+ KB
None 

Added 'Communications in Somalia' theme.
393 / 442   ------    88.91402714932126 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 7 columns):
question         235 non-null object
id               235 non-null object
is_impossible    235 non-null bool
context          235 non-null object
answer_start     235 non-null int64
answer_text      235 non-null object
title            235 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 11.3+ KB
None

Added 'Bern' theme.
409 / 442   ------    92.53393665158372 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164 entries, 0 to 163
Data columns (total 7 columns):
question         164 non-null object
id               164 non-null object
is_impossible    164 non-null bool
context          164 non-null object
answer_start     164 non-null int64
answer_text      164 non-null object
title            164 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 7.9+ KB
None 

Added 'New York City' theme.
410 / 442   ------    92.76018099547511 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 817 entries, 0 to 816
Data columns (total 7 columns):
question         817 non-null object
id               817 non-null object
is_impossible    817 non-null bool
context          817 non-null object
answer_start     817 non-null int64
answer_text      817 non-null object
title            817 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 39.2+ KB
None 

Added 'Westmin

Added 'Brain' theme.
426 / 442   ------    96.38009049773756 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222 entries, 0 to 221
Data columns (total 7 columns):
question         222 non-null object
id               222 non-null object
is_impossible    222 non-null bool
context          222 non-null object
answer_start     222 non-null int64
answer_text      222 non-null object
title            222 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 10.7+ KB
None 

Added 'Near East' theme.
427 / 442   ------    96.60633484162896 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229 entries, 0 to 228
Data columns (total 7 columns):
question         229 non-null object
id               229 non-null object
is_impossible    229 non-null bool
context          229 non-null object
answer_start     229 non-null int64
answer_text      229 non-null object
title            229 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 11.0+ KB
None 

Added 'Zhejiang'

### Importing and Structuring the Development Set
Having constructed a dataframe for the training set, we will now do the same for the dev set.

In [4]:
# Download the development JSON and build one row per theme (article) with its paragraphs and title.
# The records of the downloaded "data" column are expanded directly into a dataframe. The previous
# approach wrote them to "dev_file.json" and re-read that file with orient="split" even though it
# has no "index"/"columns" keys, which only works while the parser tolerates the missing keys; it
# also bound `dev_file` to None, because to_json() returns None when given a path.
dev_raw=pd.read_json("https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json")
# "paragraphs" is placed first explicitly so that positional access (.iloc[row, 0]) yields the
# list of contexts regardless of the key order inside the JSON records.
dev_file2=pd.DataFrame(dev_raw["data"].tolist())[["paragraphs", "title"]]
dev_file2["title"]=dev_file2.loc[:,"title"].str.replace("_"," ")
dev_themes=dev_file2["title"].values

In [5]:
# Creating one dataframe (one row per question) for all themes within the development data set.
# Frames are collected in lists and concatenated once per theme and once overall: DataFrame.append
# was removed in pandas 2.0, and it re-copied the whole accumulated frame on every call.
dev_theme_frames=[]

# Looping over each theme in the JSON
for n_theme, theme in enumerate(dev_themes):
    # Looked up by column name so the result does not depend on the column order of dev_file2
    theme_contexts=dev_file2.loc[n_theme,"paragraphs"]
    context_frames=[]
    # Going over all 'contexts' (paragraphs of data) within theme
    for context in theme_contexts:
        context_df=pd.DataFrame(data=context["qas"]).loc[:,["question", "id", "is_impossible"]]
        context_df["context"]=context["context"]
        impossible_question=context_df["is_impossible"].values #boolean array to denote if given question of context is impossible
        context_df=context_df.join(pd.DataFrame(data=answer_data(context, impossible_question)))
        context_frames.append(context_df)

    # pd.concat raises on an empty list, so a theme without contexts yields an empty frame instead
    theme_df=pd.concat(context_frames, ignore_index=True) if context_frames else pd.DataFrame()
    theme_df["title"]=theme
    dev_theme_frames.append(theme_df)

    # Some debugging/progress prints
    print("Added '%s' theme." % theme)
    print(n_theme+1,"/",len(dev_themes), "  ------   ", (float(n_theme+1)/float(len(dev_themes)))*100, "%")
    theme_df.info() # info() prints its report itself and returns None, so it is not wrapped in print()
    print()

dev_df=pd.concat(dev_theme_frames, ignore_index=True) if dev_theme_frames else pd.DataFrame()
print(dev_df)
dev_df.info()

Added 'Normans' theme.
1 / 35   ------    2.857142857142857 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 7 columns):
question         208 non-null object
id               208 non-null object
is_impossible    208 non-null bool
context          208 non-null object
answer_start     208 non-null int64
answer_text      208 non-null object
title            208 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 10.0+ KB
None 

Added 'Computational complexity theory' theme.
2 / 35   ------    5.714285714285714 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
question         418 non-null object
id               418 non-null object
is_impossible    418 non-null bool
context          418 non-null object
answer_start     418 non-null int64
answer_text      418 non-null object
title            418 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 20.1+ KB
None 

Added 'Yuan dynasty' theme.
18 / 35   ------    51.42857142857142 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445 entries, 0 to 444
Data columns (total 7 columns):
question         445 non-null object
id               445 non-null object
is_impossible    445 non-null bool
context          445 non-null object
answer_start     445 non-null int64
answer_text      445 non-null object
title            445 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 21.4+ KB
None 

Added 'Immune system' theme.
19 / 35   ------    54.285714285714285 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 7 columns):
question         458 non-null object
id               458 non-null object
is_impossible    458 non-null bool
context          458 non-null object
answer_start     458 non-null int64
answer_text      458 non-null object
title            458 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 22.0+ KB
None 

Added 'I

Added 'European Union law' theme.
35 / 35   ------    100.0 %
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421 entries, 0 to 420
Data columns (total 7 columns):
question         421 non-null object
id               421 non-null object
is_impossible    421 non-null bool
context          421 non-null object
answer_start     421 non-null int64
answer_text      421 non-null object
title            421 non-null object
dtypes: bool(1), int64(1), object(5)
memory usage: 20.2+ KB
None 

                                                question  \
0                   In what country is Normandy located?   
1                     When were the Normans in Normandy?   
2          From which countries did the Norse originate?   
3                              Who was the Norse leader?   
4      What century did the Normans first gain their ...   
5      Who gave their name to Normandy in the 1000's ...   
6                            What is France a region of?   
7              Who did King Cha

### Exporting dataframes to .json files
Having completed the data wrangling stages, we will now export the dataframes as .json files, ready to be imported in the next stage of the project.

In [6]:
# Write each wrangled dataframe to its JSON file.
# NOTE(review): the destinations are absolute, user-specific Windows paths, so this cell only runs
# on a machine where that Desktop folder exists - consider a configurable output directory.
export_targets=(
    (train_df, "C:/Users/Lukas Buteliauskas/Desktop/training_data.json"),
    (dev_df, "C:/Users/Lukas Buteliauskas/Desktop/validation_data.json"),
)
for frame, destination in export_targets:
    frame.to_json(destination)