## make train and test files

In [1]:
import json
import os
import re

import numpy as np
import pandas as pd

# Labels of the five answer choices, in the order they appear in an options string.
_OPTION_LABELS = ('a', 'b', 'c', 'd', 'e')

# One pre-compiled "label marker" pattern per label. A marker is the label
# followed by ")" that sits at the very start of the string or right after a
# comma (optionally preceded by whitespace, an opening bracket or a quote).
_OPTION_MARKERS = {
    label: re.compile(r'(?:^|,)[\s\[\'"]*' + re.escape(label) + r'\s*\)\s*')
    for label in _OPTION_LABELS
}


def _parse_option_string(options, labels=_OPTION_LABELS):
    """Split one options string such as ``'a ) 1 , b ) 2 , ...'`` into values.

    Each value runs from the end of its own label marker to the start of the
    next label marker (or to the end of the string), so commas that belong to
    the option text itself are kept. Surrounding whitespace is stripped.

    Parameters
    ----------
    options : str
        Raw options text. Any non-string value (e.g. NaN) is treated as empty.
    labels : sequence of str
        Labels to look for, in the order they are expected to appear.

    Returns
    -------
    list
        One entry per label, in ``labels`` order; ``None`` when the label
        could not be located.
    """
    text = options if isinstance(options, str) else ''

    # (label, marker_start, value_start), collected in text order.
    found = []
    cursor = 0
    for label in labels:
        # Note: with an explicit start position, "^" still only matches at the
        # real beginning of the string, so later labels must follow a comma.
        match = _OPTION_MARKERS[label].search(text, cursor)
        if match is not None:
            marker_start, value_start = match.start(), match.end()
        else:
            # Fall back to the plain substring lookup for strings whose
            # options are not comma separated.
            marker_start = text.find(label + ' ) ', cursor)
            if marker_start == -1:
                continue
            value_start = marker_start + len(label) + 3
        found.append((label, marker_start, value_start))
        # Search for the next label only after this one, so a label letter
        # occurring earlier in the text cannot be picked up by mistake.
        cursor = value_start

    values = dict.fromkeys(labels)
    for position, (label, _, value_start) in enumerate(found):
        if position + 1 < len(found):
            value_end = found[position + 1][1]
        else:
            value_end = len(text)
        values[label] = text[value_start:value_end].strip()
    return [values[label] for label in labels]


def get_data(filepath):
    """Load a MathQA-style JSON file and expand its options into columns.

    The JSON file must hold records with at least the fields ``Problem``,
    ``correct``, ``category`` and ``options``.

    Parameters
    ----------
    filepath : str
        Path to the JSON file (read as UTF-8).

    Returns
    -------
    pandas.DataFrame
        Columns ``Problem``, ``correct``, ``category`` followed by
        ``option_a`` ... ``option_e``, one row per record. An option whose
        label is missing from the options string is ``None``.

    Raises
    ------
    KeyError
        If one of the required fields is absent from the data.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(filepath, encoding='utf-8') as f:
        data = json.load(f)
    df = pd.DataFrame(data)

    # Iterate over the column directly instead of building a row Series per record.
    option_rows = [_parse_option_string(raw) for raw in df['options']]
    opts = pd.DataFrame(
        option_rows,
        columns=['option_' + label for label in _OPTION_LABELS],
        index=df.index,
    )

    new_data = df[['Problem', 'correct', 'category']].join(opts)
    return new_data


In [2]:
def generate_files(filepath, c, out_dir='dataset/MathQA4project'):
    """Convert one raw JSON split into a single combined CSV file.

    The data is loaded with ``get_data`` and written, with all categories
    together, to ``<out_dir>/00_<c>.csv``. The output path and a confirmation
    message are printed.

    Parameters
    ----------
    filepath : str
        Path of the source JSON file.
    c : str
        Split name (e.g. ``'train'`` or ``'test'``) used in the file name.
    out_dir : str, optional
        Directory that receives the CSV. It is created when missing, so the
        notebook also runs on a fresh checkout.
    """
    data = get_data(filepath)

    # to_csv does not create missing parent directories.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    fp = os.path.join(out_dir, '00_' + c + '.csv')
    print(fp)
    data.to_csv(fp)
    print('successfully generate files from %s' % filepath)
    return

In [3]:
# Convert both raw MathQA splits (train first, then test) into combined CSV files.
for filepath, split in [('dataset/MathQA/train.json', 'train'),
                        ('dataset/MathQA/test.json', 'test')]:
    generate_files(filepath, split)

dataset/MathQA4project/00_train.csv
successfully generate files from dataset/MathQA/train.json
dataset/MathQA4project/00_test.csv
successfully generate files from dataset/MathQA/test.json


In [4]:
# Sanity check: reload the combined training CSV written by generate_files
# (index_col=0 uses the first CSV column, i.e. the saved row index, as the
# index) and display the resulting dataframe.
filepath = 'dataset/MathQA4project/00_train.csv'
df = pd.read_csv(filepath, index_col=0)
df

Unnamed: 0,Problem,correct,category,option_a,option_b,option_c,option_d,option_e
0,the banker ' s gain of a certain sum due 3 yea...,a,gain,rs . 400,rs . 300,rs . 500,rs . 350,none of these
1,average age of students of an adult school is ...,d,general,1200,120,360,240,none of these
2,sophia finished 2 / 3 of a book . she calculat...,b,general,229,270,877,266,281
3,120 is what percent of 50 ?,b,gain,5 %,240 %,50 %,2 %,500 %
4,there are 10 girls and 20 boys in a classroom ...,a,other,1 / 2,1 / 3,1 / 5,10 / 30,2 / 5
...,...,...,...,...,...,...,...,...
29832,a man invests some money partly in 10 % stock ...,a,other,1 : 2,3 : 5,4 : 5,16 : 15,none
29833,"average of 10 matches is 32 , how many runs on...",d,general,a ) 70,b ) 76,c ) 78,d ) 98,e ) 88
29834,solve below question 2 x + 1 = - 23,c,general,- 8,- 9,- 12,- 4,12
29835,calculate 85184 ÷ ? = 352,b,general,241,242,244,247,240


In [5]:
# Sanity check: reload the combined test CSV written by generate_files
# (index_col=0 uses the first CSV column, i.e. the saved row index, as the
# index) and display the resulting dataframe. Note that this rebinds `df`.
filepath = 'dataset/MathQA4project/00_test.csv'
df = pd.read_csv(filepath, index_col=0)
df

Unnamed: 0,Problem,correct,category,option_a,option_b,option_c,option_d,option_e
0,a shopkeeper sold an article offering a discou...,a,gain,38,27.675,30,data inadequate,none of these
1,what will be the difference between simple and...,a,gain,129,130,124,133,145
2,there are 28 stations between hyderabad and ba...,c,physics,156,167,870,352,380
3,the present population of a town is 3888 . pop...,e,gain,2500,2100,3500,3600,2700
4,the triplicate ratio of 1 : 9 is ?,e,other,1 : 0,1 : 8,1 : 7,1 : 2,1 : 729
...,...,...,...,...,...,...,...,...
2980,find the area of a parallelogram with base 20 ...,c,geometry,100 cm 2,250 cm 2,800 cm 2,296 cm 2,456 cm 2
2981,"in a garden , there are 10 rows and 14 columns...",e,physics,20 m,22 m,24 m,26 m,28 m
2982,a can do a piece of work in 6 hours ; b and c ...,d,physics,8 hours,6 hours,14 hours,12 hours,5 hours
2983,a train 250 m long running at 72 kmph crosses ...,a,physics,350 m,200 m,250 m,270 m,300 m


## make files with categories

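`generate_cat_files(filepath, c)`: reads the CSV file at `filepath`, splits it into one subset per category, and saves each subset to its own file. `c` is the split name (`"train"` or `"test"`) and is used as the prefix of the output file names.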

In [6]:
def generate_cat_files(filepath, c, out_dir='dataset/MathQA4project'):
    """Split a combined CSV into one CSV file per problem category.

    For every known category the matching rows are selected, re-indexed from
    zero and written to ``<out_dir>/<c>_<category>.csv``. A category without
    any rows still produces a (header-only) file. A confirmation message is
    printed at the end.

    Parameters
    ----------
    filepath : str
        Path of the combined CSV; its first column is read as the index and
        it must contain a ``category`` column.
    c : str
        Split name (e.g. ``'train'`` or ``'test'``) used as file-name prefix.
    out_dir : str, optional
        Directory that receives the per-category files. It is created when
        missing.
    """
    data = pd.read_csv(filepath, index_col=0)

    # to_csv does not create missing parent directories.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Separate the data by category and save each subset as its own file.
    # Rows whose category is not listed here are not written anywhere.
    cat_label = ['gain', 'general', 'geometry', 'physics', 'probability', 'other']
    for category in cat_label:
        data_cat = data[data['category'] == category].reset_index(drop=True)
        fp = os.path.join(out_dir, c + '_' + category + '.csv')
        data_cat.to_csv(fp)

    print('successfully generate files in %s with categories'%filepath)

In [7]:
# Split the combined training CSV into one file per category
# (generate_cat_files names them train_<category>.csv).

filepath = 'dataset/MathQA4project/00_train.csv'
generate_cat_files(filepath, 'train')

successfully generate files in dataset/MathQA4project/00_train.csv with categories


In [8]:
# Split the combined test CSV into one file per category
# (generate_cat_files names them test_<category>.csv).

filepath = 'dataset/MathQA4project/00_test.csv'
generate_cat_files(filepath, 'test')

successfully generate files in dataset/MathQA4project/00_test.csv with categories
