## Debug for Scientific data
[PeerRead](https://github.com/allenai/PeerRead) (accept/reject)

In [193]:
import os
import logging
import numpy as np
import glob
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize

In [220]:
def load_peer_read(tokenize=True, 
                   lower=True, 
                   shuffle=True, 
                   random_state=42):
    """
    Details here
    """
    
    import json
    
    def get_parsed(path, data_mode, label_mode):
        """
        get the file names
        """
        parsed_data = {}
        parsed_label = {}

        for key,item in data_mode.items():
            parsed_data[key] = glob.glob1(os.path.join(path, item),
                                         '*')
        
        for key,item in label_mode.items():
            parsed_label[key] = glob.glob1(os.path.join(path, item),
                                          '*')
            
        meta = [(key, len(item)) for key,item in parsed_data.items()]

        return meta, parsed_data, parsed_label
    
    def get_data(meta, data, label, data_mode, label_mode, mode=None):
        """
        
        # Args
            mode: {'train', 'test', 'dev'}
        # returns
            x, y
        
        """
        if mode is not None:
            x = []
            y = []
            for k in data.keys():
                for i in range(len(data[k][mode])):
                    with open(os.path.join(path[k], data_mode[mode], data[k][mode][i])) as json_file:  
                        file = json.load(json_file) 
                        if file['metadata']['sections'] is not None:
                            temp = ' '
                            for key in file['metadata']['sections']:
                                if key['heading'] is not None:
                                    temp = ' '.join([temp, '{} {}'.format(key['heading'], key['text'].strip())])
                    x.append(temp.replace('\n', ' '))
                    
                    # TODO : check if both filename are the same
                    with open(os.path.join(path[k], label_mode[mode], label[k][mode][i])) as json_file:  
                        file = json.load(json_file) 
                        print(file['reviews'])
                        y.append(file['accepted'])
                    
            return x, y
        else:
            raise ValueError('mode doesn\'t exists')
            
    path = {}
    path['acl'] = './dataset/PeerRead/data/acl_2017'
    path['iclr'] = './dataset/PeerRead/data/iclr_2017'
    path['coNLL'] = './dataset/PeerRead/data/conll_2016'
    path['ai'] = './dataset/PeerRead/data/arxiv.cs.ai_2007-2017'
    path['cl'] = './dataset/PeerRead/data/arxiv.cs.cl_2007-2017'
    path['lg'] = './dataset/PeerRead/data/arxiv.cs.lg_2007-2017'
    data_mode = {'train' : 'train/parsed_pdfs/',
             'test' : 'test/parsed_pdfs/',
             'dev' : 'dev/parsed_pdfs/'}
    label_mode = {'train' : 'train/reviews/',
             'test' : 'test/reviews/',
             'dev' : 'dev/reviews/'}

    meta, data, label = {}, {}, {}
    meta['acl'], data['acl'], label['acl'] = get_parsed(path['acl'], data_mode, label_mode)
    meta['iclr'], data['iclr'], label['iclr'] = get_parsed(path['iclr'], data_mode, label_mode)
    meta['coNLL'], data['coNLL'], label['coNLL'] = get_parsed(path['coNLL'], data_mode, label_mode)
    meta['ai'], data['ai'], label['ai'] = get_parsed(path['ai'], data_mode, label_mode)
    meta['cl'], data['cl'], label['cl'] = get_parsed(path['cl'], data_mode, label_mode)
    meta['lg'], data['lg'], label['lg'] = get_parsed(path['lg'], data_mode, label_mode)
    
    X_train_corpus, y_train = get_data(meta, data, label, data_mode, label_mode, 'train')
    X_test_corpus, y_test = get_data(meta, data, label, data_mode, label_mode, 'test')
    
    if shuffle:
        np.random.seed(random_state)
        indices = np.random.permutation(len(y_train))       
        
        X_train_corpus = [X_train_corpus[i] for i in indices]
        y_train = y_train[indices]
        
        indices = np.random.permutation(len(y_test))
        
        X_test_corpus = [X_test_corpus[i] for i in indices]
        y_test = y_test[indices]
        logging.info('Shuffled.')
    
    if lower:
        X_train_corpus = [text.lower() for text in X_train_corpus]
        X_test_corpus = [text.lower() for text in X_test_corpus]
        logging.info('Lowered.')
        
    if tokenize:
        X_train_corpus = [word_tokenize(text) for text in X_train_corpus]
        X_test_corpus = [word_tokenize(text) for text in X_test_corpus]
        logging.info('Tokenized.')
    
    return X_train_corpus, X_test_corpus


In [221]:
x_train, x_test = load_peer_read()

[{'IMPACT': '3', 'SUBSTANCE': '4', 'APPROPRIATENESS': '5', 'MEANINGFUL_COMPARISON': '3', 'PRESENTATION_FORMAT': 'Poster', 'comments': '- Strengths:\n\nThe paper addresses a long standing problem concerning automatic evaluation of\nthe output of generation/translation systems.\n\nThe analysis of all the available metrics is thorough and comprehensive.\n\nThe authors demonstrate a new metric with a higher correlation with human\njudgements\n\nThe bibliography will help new entrants into the field.\n\n- Weaknesses:\n\nThe paper is written as a numerical analysis paper, with very little insights\nto linguistic issues in generation, the method of generation, the differences\nin the output from a different systems and human generated reference.\n\nIt is unclear if the crowd source generated references serve well in the\ncontext of an application that needs language generation.\n\n- General Discussion:\n\nOverall, the paper could use some linguistic examples (and a description of the\ndiffere

KeyError: 'accepted'

In [190]:
len(x_train), len(x_test)

(11090, 637)

In [None]:
dict_keys(['reviews', 'abstract', 'histories', 'id', 'title']