In [1]:
import re
import pandas as pd
import numpy as np
from collections import Counter
from matplotlib import pyplot as plt
import operator

In [2]:
# parse txt file to make pandas dataframe
def parse_file(filename):
    D = [] # Documents
    Q = [] # Queries
    C = [] # Candidates
    A = [] # Answers
    with open(filename, 'r') as f:
        curr_Doc = ''
        for line in f.readlines():
            if len(line.strip()) == 0:
                continue
            temp = list(filter(None, re.split('[\n \t \s]', line)))
            if not temp[0].isdigit():
                raise ValueError('Incorrect Input')
            if int(temp[0]) != 21:
                curr_Doc += ' ' + ' '.join(temp[1:])
            else:
                D.append(curr_Doc)
                curr_Doc = ''
                Q.append(' '.join(temp[1:-2]))
                A.append(temp[-2])
                C.append(temp[-1])
    Docs = pd.Series(D, dtype=str, name='Document')
    Quers = pd.Series(Q, dtype=str, name='Query')
    Cands = pd.Series(C, dtype=str, name='Candidates')
    Anss = pd.Series(A, dtype=str, name='Answer')
    return pd.concat([Docs, Quers, Cands, Anss], axis=1)

In [3]:
%%time
# make datasets for Common Nouns queries
raw_CN_train = parse_file('data/cbtest_CN_train.txt')
raw_CN_valid = parse_file('data/cbtest_CN_valid_2000ex.txt')
raw_CN_test = parse_file('data/cbtest_CN_test_2500ex.txt')

CPU times: user 29.6 s, sys: 1.07 s, total: 30.7 s
Wall time: 34 s


In [4]:
raw_CN_train.head()

Unnamed: 0,Document,Query,Candidates,Answer
0,With almost everything else to make them happ...,replied the XXXXX ; for the king 's aunts were...,ancestors|baby|boy|everyone|fairies|mother|por...,queen
1,With almost everything else to make them happ...,replied the queen ; for the XXXXX 's aunts wer...,aunts|baby|king|monarch|mother|occasions|princ...,king
2,With almost everything else to make them happ...,replied the queen ; for the king 's XXXXX were...,ancestors|aunts|books|breakfast|cats|children|...,aunts
3,This vexed the king even more than the queen ...,`` They are very kind old ladies in their way ...,aunts|boots|boy|cat|cats|child|grandmother|kin...,king
4,This vexed the king even more than the queen ...,`` They are very kind old ladies in their way ...,aunts|boots|boy|breakfast|cat|foot|lady|mother...,boy


Now we'll tokenize the sentences into the lists of words.

In [5]:
regex = re.compile('[^a-zA-Z , .]')
def tokenize_row(row):
    modified_row = regex.sub(' ', row)
    bag_of_words = list(filter(None, modified_row.lower().split(' ')))
    return bag_of_words
def tokenize_data(df):
    df['Document'] = df['Document'].apply(tokenize_row)
    df['Query'] = df['Query'].apply(tokenize_row)
    df['Candidates'] = df['Candidates'].apply(lambda row: row.lower().split('|'))
    df['Answer'] = df['Answer'].apply(lambda row: row.lower())
    return df

In [6]:
%%time
raw_CN_train = tokenize_data(raw_CN_train)
raw_CN_valid = tokenize_data(raw_CN_valid)
raw_CN_test = tokenize_data(raw_CN_test)

CPU times: user 13.4 s, sys: 608 ms, total: 14 s
Wall time: 14.1 s


In [7]:
raw_CN_train.head()

Unnamed: 0,Document,Query,Candidates,Answer
0,"[with, almost, everything, else, to, make, the...","[replied, the, xxxxx, for, the, king, s, aunts...","[ancestors, baby, boy, everyone, fairies, moth...",queen
1,"[with, almost, everything, else, to, make, the...","[replied, the, queen, for, the, xxxxx, s, aunt...","[aunts, baby, king, monarch, mother, occasions...",king
2,"[with, almost, everything, else, to, make, the...","[replied, the, queen, for, the, king, s, xxxxx...","[ancestors, aunts, books, breakfast, cats, chi...",aunts
3,"[this, vexed, the, king, even, more, than, the...","[they, are, very, kind, old, ladies, in, their...","[aunts, boots, boy, cat, cats, child, grandmot...",king
4,"[this, vexed, the, king, even, more, than, the...","[they, are, very, kind, old, ladies, in, their...","[aunts, boots, boy, breakfast, cat, foot, lady...",boy


Now let's get some statistics about dataset.

In [8]:
def show_statistics(train_df, valid_df):
    print('Number of samples:')
    print('\t Train:', train_df.shape[0])
    print('\t Valid:', valid_df.shape[0])
    train_words = train_df['Document'].tolist()
    print('Average number of words:')
    print('\t In Documents', 
          round(sum([len(sublist) for sublist in train_words]) / train_df.shape[0],1))
    train_words = train_df['Query'].tolist()
    print('\t In Queries',
         round(sum([len(sublist) for sublist in train_words]) / train_df.shape[0],1))
    train_words += train_df['Document'].tolist()
    train_words += train_df['Candidates'].tolist() + train_df['Answer'].tolist()
    # flatten words from different rows
    train_words = [item for sublist in train_words for item in sublist]
    train_words_freqs = dict(Counter(train_words))
    print('Words statistics:')
    print('Train:')
    print('\t Total number of words:', len(train_words_freqs))
    train_words_freqs = sorted(train_words_freqs.items(), 
                               key=operator.itemgetter(1),reverse=True)[:10]
    print('\t Ten most frequent words:', train_words_freqs)
    
    valid_words = valid_df['Document'].tolist() + valid_df['Query'].tolist()
    valid_words += valid_df['Candidates'].tolist() + valid_df['Answer'].tolist()
    valid_words = [item for sublist in valid_words for item in sublist]
    valid_words_freqs = dict(Counter(valid_words))
    print('Valid:')
    print('\t Total number of words:', len(valid_words_freqs))
    print('\t Number of new words:', len(set(valid_words) - set(train_words)))
    valid_words_freqs = sorted(valid_words_freqs.items(), 
                               key=operator.itemgetter(1),reverse=True)[:10]
    print('\t Ten most frequent words:', valid_words_freqs)

In [9]:
show_statistics(raw_CN_train, raw_CN_valid)

Number of samples:
	 Train: 120769
	 Valid: 2000
Average number of words:
	 In Documents 452.8
	 In Queries 29.6
Words statistics:
Train:
	 Total number of words: 42247
	 Ten most frequent words: [(',', 3965617), ('the', 3027817), ('.', 2168428), ('and', 2126483), ('to', 1368783), ('a', 1155806), ('of', 1072559), ('he', 935216), ('was', 752051), ('i', 733825)]
Valid:
	 Total number of words: 8983
	 Number of new words: 380
	 Ten most frequent words: [(',', 57575), ('the', 52315), ('.', 36092), ('and', 33627), ('to', 24447), ('he', 17570), ('a', 16601), ('of', 15130), ('was', 11801), ('i', 10777)]


In [10]:
# saves datasets to pdf. 
# Warning - during extraction, one should use literal_eval from ast for the first three columns.
raw_CN_train.to_csv('data/CBT_CN_train.csv',sep=';',index=False)
raw_CN_valid.to_csv('data/CBT_CN_valid.csv',sep=';',index=False)
raw_CN_test.to_csv('data/CBT_CN_test.csv',sep=';',index=False)
del raw_CN_train
del raw_CN_valid
del raw_CN_test

Now we'll do the same steps for other three classes.

In [12]:
%%time
# make datasets for Named Entities queries
raw_NE_train = parse_file('data/cbtest_NE_train.txt')
raw_NE_valid = parse_file('data/cbtest_NE_valid_2000ex.txt')
raw_NE_test = parse_file('data/cbtest_NE_test_2500ex.txt')

CPU times: user 30.5 s, sys: 3.44 s, total: 34 s
Wall time: 3min 22s


In [14]:
raw_NE_train = tokenize_data(raw_NE_train)
raw_NE_valid = tokenize_data(raw_NE_valid)
raw_NE_test = tokenize_data(raw_NE_test)

In [15]:
show_statistics(raw_NE_train, raw_NE_valid)

Number of samples:
	 Train: 108719
	 Valid: 2000
Average number of words:
	 In Documents 417.2
	 In Queries 27.1
Words statistics:
Train:
	 Total number of words: 47537
	 Ten most frequent words: [(',', 3152050), ('the', 2158358), ('.', 1955513), ('and', 1641299), ('to', 1116485), ('a', 958834), ('of', 867286), ('he', 726103), ('i', 680264), ('was', 622257)]
Valid:
	 Total number of words: 9321
	 Number of new words: 499
	 Ten most frequent words: [(',', 50315), ('the', 39156), ('.', 36248), ('and', 27244), ('to', 21299), ('he', 16616), ('a', 14993), ('of', 14726), ('was', 12083), ('i', 10404)]


In [16]:
# saves datasets to pdf. 
# Warning - during extraction, one should use literal_eval from ast for the first three columns.
raw_NE_train .to_csv('data/CBT_NE_train.csv',sep=';',index=False)
raw_NE_valid.to_csv('data/CBT_NE_valid.csv',sep=';',index=False)
raw_NE_test.to_csv('data/CBT_NE_test.csv',sep=';',index=False)
del raw_NE_train 
del raw_NE_valid
del raw_NE_test

In [17]:
%%time
# make datasets for Verbs queries
raw_V_train = parse_file('data/cbtest_V_train.txt')
raw_V_valid = parse_file('data/cbtest_V_valid_2000ex.txt')
raw_V_test = parse_file('data/cbtest_V_test_2500ex.txt')

CPU times: user 29.8 s, sys: 2.56 s, total: 32.4 s
Wall time: 1min


In [18]:
raw_V_train = tokenize_data(raw_V_train)
raw_V_valid = tokenize_data(raw_V_valid)
raw_V_test = tokenize_data(raw_V_test)
# saves datasets to pdf. 
# Warning - during extraction, one should use literal_eval from ast for the first three columns.
raw_V_train.to_csv('data/CBT_V_train.csv',sep=';',index=False)
raw_V_valid.to_csv('data/CBT_V_valid.csv',sep=';',index=False)
raw_V_test.to_csv('data/CBT_V_test.csv',sep=';',index=False)
del raw_V_train 
del raw_V_valid
del raw_V_test

In [None]:
%%time
# make datasets for Prepositions queries
raw_P_train = parse_file('data/cbtest_P_train.txt')

In [None]:
raw_P_valid = parse_file('data/cbtest_P_valid_2000ex.txt')
raw_P_test = parse_file('data/cbtest_P_test_2500ex.txt')

In [None]:
raw_P_train = tokenize_data(raw_P_train)
raw_P_valid = tokenize_data(raw_P_valid)
raw_P_test = tokenize_data(raw_P_test)
show_statistics(raw_P_train, raw_P_valid)
# saves datasets to pdf. 
# Warning - during extraction, one should use literal_eval from ast for the first three columns.
raw_P_train.to_csv('data/CBT_V_train.csv',sep=';',index=False)
raw_P_valid.to_csv('data/CBT_V_valid.csv',sep=';',index=False)
raw_P_test.to_csv('data/CBT_V_test.csv',sep=';',index=False)
del raw_P_train 
del raw_P_valid
del raw_P_test