**Preprocessing notebook for the CNN dataset**


NOTE: The notebook contains two options for downloading and preprocessing the dataset.
Choose either Option 1, or Option 2.

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
# Names of the dataset splits. The cells below use them both as dictionary keys
# and as the names of the sub-folders under cnn/questions/.
keys = ['train', 'dev', 'test']

In [1]:
# OPTION 1 - Commented out, because dataset is not available at specified location

#!wget http://cs.stanford.edu/~danqi/data/cnn.tar.gz
#!tar -xvzf cnn.tar.gz

--2019-04-15 19:22:45--  http://cs.stanford.edu/~danqi/data/cnn.tar.gz
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cs.stanford.edu/~danqi/data/cnn.tar.gz [following]
--2019-04-15 19:22:45--  https://cs.stanford.edu/~danqi/data/cnn.tar.gz
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 571560380 (545M) [application/x-gzip]
Saving to: ‘cnn.tar.gz’


2019-04-15 19:23:35 (11.1 MB/s) - ‘cnn.tar.gz’ saved [571560380/571560380]

cnn/
cnn/dev.txt
cnn/train.txt
cnn/test.txt


In [4]:
# Preprocessing for Option 1

# Commented out because dataset is no longer available in this format

# This is the preprocessing code to use if all questions have been concatenated into three files, with
# respectively the train, dev and test questions.
# So run EITHER this cell, or the next.

#p, q, a = {}, {}, {}
#for k in keys :
#    file = open('cnn/questions/' + k + '.txt').read().strip().split('\n\n')
#    file = [x.split('\n') for x in file]
#    p[k] = [x[2] for x in file]
#    q[k] = [x[0] for x in file]
#    a[k] = [x[1] for x in file]

In [15]:
# Option 2

# Download the dataset from https://cs.nyu.edu/~kcho/DMQA/
# Unpack the cnn.tar.gz file
# The questions should be in a folder named 'questions', with subfolders for the train, dev and test datasets

# Then run the preprocessing code below

# Fail early, with download instructions, if the unpacked 'cnn/questions' folder
# is not present relative to the current working directory.
assert os.path.isdir('cnn/questions'), "Please download and unpack the dataset from https://cs.nyu.edu/~kcho/DMQA/"

In [18]:
# Preprocessing for option 2

# This is alternative preprocessing code, in case the dataset is in original form.
# Original form means each example (in train, dev and test set) is in a separate file,
# in distinct subfolders within the cnn folder.
# Each example has the following fields, separated by two newlines:
#  - source of the cnn article (url)
#  - the paragraph
#  - the question
#  - the answer
#  - list of entities e.g. @entity128:Royal Malewane, separated by a newline
# The preprocessing goes through all the files, and collects the paragraph,
# question and answer in p, q and a (each a dict: split name -> list of strings).

p, q, a = {}, {}, {}

for k in keys :
    p[k] = []
    q[k] = []
    a[k] = []

    # get all the file names; each file contains one example
    # os.listdir returns names in arbitrary order, so sort them to make the
    # order of the examples (and of the resulting CSV rows) reproducible
    split_dir = os.path.join('cnn/questions', k)
    files = sorted(os.listdir(split_dir))
    print("\n", k, ": contains ", len(files), " examples")

    # loop through all the files, and extract text (p), question (q) and answer (a)
    for c, file_name in enumerate(files):
        print("\r", c, end=' ')
        # 'with' closes each of the (hundreds of thousands of) file handles promptly;
        # the encoding is given explicitly so the result does not depend on the platform default
        with open(os.path.join(split_dir, file_name), encoding='utf-8') as example_file:
            fields = example_file.read().strip().split('\n\n')
        # fields[0] is the source url, which is not needed
        p[k].append(fields[1])
        q[k].append(fields[2])
        a[k].append(fields[3])

print("\ndone reading the files")


 train : contains  380298  examples
 11672 

KeyboardInterrupt: 

In [8]:
# Collect, per split, the set of distinct '@entity...' tokens that occur in the paragraphs.
entities = {}

# Loop over the keys in p (train, dev, test)
for k in p :
    # accumulate directly into a set, so duplicates are dropped as we go
    # instead of first building one huge list of every entity occurrence
    entities[k] = set()

    # loop over all paragraphs in this set
    for x in p[k] :

        # split the article in words, and keep all the words that start with '@entity'
        entities[k].update(y for y in x.split() if y.startswith('@entity'))

# Write the entities of the training set to file, one per line.
# They are written in sorted order so the file is identical between runs
# (set iteration order is not stable); 'with' guarantees the file is closed.
with open('entity_list.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sorted(entities['train'])))

In [11]:
def generate_possible_answers(p) :
    """Return the candidate answers found in paragraph p.

    A candidate answer is any whitespace-separated token of p that starts
    with '@entity'.

    Parameters:
        p: the paragraph text (a string).

    Returns:
        A string with the distinct candidate answers separated by semicolons,
        in order of first appearance; the empty string if there are none.
    """
    # dict.fromkeys removes duplicates while keeping first-occurrence order,
    # which makes the result deterministic (a plain set is not)
    possible_answers = dict.fromkeys(w for w in p.split() if w.startswith('@entity'))

    return ";".join(possible_answers)

In [12]:
import pandas as pd

# One flat list per output column; the splits are appended one after another
# in the order given by `keys`.
df_paragraphs, df_questions, df_answers = [], [], []
df_possible_answers, df_exp_splits = [], []

for split in keys :
    split_paragraphs = p[split]

    df_paragraphs.extend(split_paragraphs)
    df_questions.extend(q[split])
    df_answers.extend(a[split])

    # candidate answers are derived from each paragraph of this split
    df_possible_answers.extend([generate_possible_answers(text) for text in split_paragraphs])

    # tag every example with the split it came from
    df_exp_splits.extend([split] * len(split_paragraphs))

# Assemble the table; one row per example
df = pd.DataFrame({
    'paragraph' : df_paragraphs,
    'question' : df_questions,
    'answer' : df_answers,
    'exp_split' : df_exp_splits,
    'possible_answers' : df_possible_answers,
})

In [13]:
# Save the combined table to cnn_dataset.csv; index=False leaves the row index out of the file.
df.to_csv('cnn_dataset.csv', index=False)

In [1]:
# Run the preprocessing script from the parent directory on the files written above:
# cnn_dataset.csv is passed as the data file, entity_list.txt as the list of all answers,
# and ./vec_cnn.p as the output file.
%run "../preprocess_data_QA.py" --data_file cnn_dataset.csv --output_file ./vec_cnn.p --all_answers_file entity_list.txt \
--word_vectors_type fasttext.simple.300d --min_df 8 --add_answers_to_vocab

Vocabulary size :  70187
entity2index {'@entity0': 0, '@entity1': 1, '@entity10': 2, '@entity100': 3, '@entity101': 4, '@entity102': 5, '@entity103': 6, '@entity104': 7, '@entity105': 8, '@entity106': 9, '@entity107': 10, '@entity108': 11, '@entity109': 12, '@entity11': 13, '@entity110': 14, '@entity111': 15, '@entity112': 16, '@entity113': 17, '@entity114': 18, '@entity115': 19, '@entity116': 20, '@entity117': 21, '@entity118': 22, '@entity119': 23, '@entity12': 24, '@entity120': 25, '@entity121': 26, '@entity122': 27, '@entity123': 28, '@entity124': 29, '@entity125': 30, '@entity126': 31, '@entity127': 32, '@entity128': 33, '@entity129': 34, '@entity13': 35, '@entity130': 36, '@entity131': 37, '@entity132': 38, '@entity133': 39, '@entity134': 40, '@entity135': 41, '@entity136': 42, '@entity137': 43, '@entity138': 44, '@entity139': 45, '@entity14': 46, '@entity140': 47, '@entity141': 48, '@entity142': 49, '@entity143': 50, '@entity144': 51, '@entity145': 52, '@entity146': 53, '@entity

Found 35237 words in model out of 70190
