In [12]:
import pandas as pd
import numpy as np
import os

In [66]:
# Extract columns from JSON file to have the following format
# article - questions - answers - option1 - option2 - option3 - option4
def process_json(path):
    """Load one JSON file and flatten it to one row per question.

    The file is read with ``pd.read_json``; its ``options`` column (one list
    per question) is spread over four columns and the letter in ``answers``
    is replaced by its zero-based position in the alphabet ('A' -> 0).

    Returns a dataframe with the columns
    article, questions, answers, option1, option2, option3, option4.
    """
    raw = pd.read_json(path)

    # One column per answer option
    option_names = ['option1', 'option2', 'option3', 'option4']
    choices = pd.DataFrame(raw['options'].tolist(), columns=option_names)

    # Put the options next to the article / question / answer columns
    kept = raw[['article', 'questions', 'answers']]
    merged = pd.concat([kept, choices], axis=1, sort=False)

    # 'A' has code point 65, so subtracting it maps A, B, C, D -> 0, 1, 2, 3
    merged['answers'] = [ord(letter) - ord('A') for letter in merged['answers']]

    return merged

In [84]:
# Create a massive dataframe with the whole dataset 
# as processed by the above function 

def create_dataframe_from_raw_RACE_data(data_path, split, output_dir='Data/'):
    """Merge every raw file of one split into a single CSV.

    Walks ``<data_path>/<split>`` recursively, parses each file found with
    ``process_json`` and concatenates the resulting frames into one
    dataframe, which is written (without the index column) to
    ``<output_dir>/<split>_data.csv``.

    Parameters
    ----------
    data_path : str
        Folder containing the split sub-folder. A trailing slash is optional.
    split : str
        Name of the sub-folder to process; also the prefix of the CSV name.
    output_dir : str, optional
        Folder the CSV is written to; created when missing.
        Defaults to 'Data/'.

    Raises
    ------
    ValueError
        If no file is found under ``<data_path>/<split>``.
    """
    # os.path.join copes with data_path given with or without a trailing '/'
    split_dir = os.path.join(data_path, split)

    # Collect one dataframe per file, then concatenate once at the end
    frames = []

    # Iterate through all files recursively
    for root, subdirs, files in os.walk(split_dir):
        # os.walk yields entries in filesystem-dependent order; sorting the
        # sub-directories in place (which steers the walk) and the file names
        # makes the row order of the CSV reproducible
        subdirs.sort()
        for file in sorted(files):
            frames.append(process_json(os.path.join(root, file)))

    # Fail with a clear message instead of pandas' "No objects to concatenate"
    if not frames:
        raise ValueError('No files found under ' + split_dir)

    # Create one big dataframe with all articles, questions etc
    data_df = pd.concat(frames, axis=0, sort=False).reset_index(drop=True)
    data_df['answers'] = data_df['answers'].astype(int)

    # Save data_df to file, making sure the target folder exists first
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    data_df.to_csv(os.path.join(output_dir, split + '_data.csv'), index=False)

# This is the path of the data folder downloaded from CMU
data_path = 'Raw_Data/'

# Make sure the output folder exists. Unlike a shell `mkdir`, this does not
# complain when the cell is re-run and the folder is already there.
os.makedirs('Data', exist_ok=True)

# For each split create a single dataframe with all data
# (this takes a while)
for split in ['train', 'dev', 'test']:
    create_dataframe_from_raw_RACE_data(data_path, split)
    print('Created ' + split + ' dataframe')

Created train dataframe
Created dev dataframe
Created test dataframe


In [85]:
# Preview the test split; the CSVs are written to the Data/ folder by
# create_dataframe_from_raw_RACE_data, so read it back from there
pd.read_csv('Data/test_data.csv')

Unnamed: 0,article,questions,answers,option1,option2,option3,option4
0,My name is Nancy. I'm twelve years old. I have...,Where is Wendy from?,3,China.,England.,America.,Australia.
1,My name is Nancy. I'm twelve years old. I have...,What colours does Nancy like?,3,Red and blue.,Red and yellow.,Green and yellow.,Green and blue.
2,My name is Nancy. I'm twelve years old. I have...,What's Wendy's favourite sport?,0,Running.,Basketball.,Football.,Table tennis.
3,My name is Nancy. I'm twelve years old. I have...,Which is TRUE ?,2,Nancy and Wendy are 12 years old.,Wendy is a student and she is English.,Everyone in Class Four likes Wendy.,Nancy has a cat and Wendy has a dog.
4,June 5 is World Environment Day.This makes us ...,What do Wang Baoxuan and his schoolmates do wi...,1,Throw them away,Collect and sell them,Cut them into pieces,Give them to the students in Inner Mongolia
5,June 5 is World Environment Day.This makes us ...,What is the money from selling the paper used ...,2,Buying new exercise books,Helping poor students,Planting trees and grass,Being their pocket money
6,"John gets up early from Monday to Saturday, be...",How often does John need to get up early?,3,Every day.,Five days a week.,Only on Saturdays and Sundays.,Six days a week.
7,"John gets up early from Monday to Saturday, be...",What does John do on Sunday morning?,3,He goes to have lessons.,He goes to a club.,He goes to the bookshop.,He watches TV.
8,"John gets up early from Monday to Saturday, be...",When does John do his weekend homework?,0,On Sunday evening.,On Sunday morning.,On Saturday evening.,On Sunday afternoon.
9,"John gets up early from Monday to Saturday, be...",Which of these is not right?,1,John watches TV after dinner on Saturdays.,John exercises on Sundays.,John plays computer games on Sunday afternoon.,John finishes his homework very late on Sundays.
