In [1]:
import numpy as np
import pandas as pd
import pickle
from timeit import default_timer as timer
import matplotlib.pyplot as plt

import sklearn
from sklearn.model_selection import train_test_split
import nltk as nltk

import re

import os

In [2]:
# The notebook is assumed to live one level below the project root, so the
# datasets folder is resolved relative to the parent of the working directory.
dirname = os.path.dirname(os.getcwd())
# The trailing slash is intentional: later cells build file paths by plain
# string concatenation onto `dataset_path`.
dataset_path = os.path.join(dirname, 'datasets/')
print(dataset_path)

/media/kandy/hdd/master-thesis/constituency-parsing/datasets/


### Read the corpus and extract the trees

In [9]:
# Load the corpus file: one row per sentence, read as space-separated with no
# header row, then labelled as (sentence, bracketed tree string).
# NOTE(review): the rendered output saved with this cell shows an extra
# 'Unnamed: 0' column, which this code cannot produce - the output is probably
# stale; re-run on a fresh kernel and confirm the separator/header settings.
tree_csv_path = os.path.join(dataset_path, 'constituency-parsing-data-all-UNK.csv')
load_treeData = pd.read_csv(tree_csv_path, sep=' ', header=None)
load_treeData.columns = ['sentence', 'tree']

# Parse every bracketed string into an nltk.Tree so leaves can be extracted later.
parsed_trees = load_treeData['tree'].apply(nltk.Tree.fromstring)
load_treeData['tree'] = parsed_trees

load_treeData.head()

Unnamed: 0,sentence,tree
0,pierre </s> </s> </s> years old </s> will join...,"[[[(</s> (</s> pierre) (</s> vinken)), (</s> <..."
1,</s> </s> is chairman </s> </s> </s> </s> the ...,"[[[(</s> mr</s>), (</s> vinken)], [(</s> is), ..."
2,</s> </s> </s> </s> years old </s> former chai...,"[[[(</s> (</s> rudolph) (</s> agnew)), (</s> <..."
3,</s> form </s> asbestos once used * * </s> mak...,[[[(</s>\n (</s>\n (</s> (</s> a) (</s> fo...
4,the asbestos fiber </s> </s> </s> is unusually...,[[[(</s>\n (</s> (</s> the) (</s> asbestos) (...


### Extract the word tokens

In [10]:
# Build the set of distinct leaf tokens over all parsed trees.
# Casing is kept as-is; lower-casing the leaves gave 44377 distinct tokens instead.
trees = load_treeData['tree'].values.tolist()
word_tokens = {leaf for parsed in trees for leaf in parsed.leaves()}
print(len(word_tokens))


45162


### Get the Google word2vec vocab

In [11]:
# Load the pickled Google word2vec vocabulary. The pickle holds a 3-tuple that
# is unpacked into the corpus plus (presumably) word->index and index->word
# lookups - TODO confirm against the script that wrote the pickle.
# An earlier version read 'google-vocab.txt' with pd.read_csv (columns: word, index).
# NOTE: pickle.load executes arbitrary code; only load files you created yourself.
outfile = os.path.join(dataset_path, 'google_word_corpus.pic')

with open(outfile, 'rb') as pickle_file:
    google_payload = pickle.load(pickle_file)

googleCorpus, google_corpus_word_to_int, google_corpus_int_to_word = google_payload

# A set gives O(1) membership checks for the coverage comparison below.
google_vocab_set = set(googleCorpus)
print(len(google_vocab_set))

1965716


### Nearly 34% of words are missing. Casing does not make a substantial difference

In [12]:
# Corpus tokens that have no entry in the Google word2vec vocabulary.
diff_wj_google = word_tokens - google_vocab_set
google_missing_pct = len(diff_wj_google) * 100 / len(word_tokens)

print(len(diff_wj_google))
print(google_missing_pct, '% of words are missing!!!')
# For comparison: with lower-cased tokens the difference was 13278 tokens.

15195
33.64554271290022 % of words are missing!!!


### Get the GloVe vocab

In [13]:
# Load the pickled GloVe vocabulary. The pickle holds a 3-tuple that is
# unpacked into the corpus plus (presumably) word->index and index->word
# lookups - TODO confirm against the script that wrote the pickle.
# NOTE: pickle.load executes arbitrary code; only load files you created yourself.
outfile = os.path.join(dataset_path, 'glove_word_corpus.pic')

with open(outfile, 'rb') as pickle_file:
    glove_payload = pickle.load(pickle_file)

gloveCorpus, glove_corpus_word_to_int, glove_corpus_int_to_word = glove_payload

# `gloveCorpus` is rebound from the loaded value to a set here (the Google cell
# uses a separate `google_vocab_set` name instead); re-running this cell alone
# is still safe because the pickle is reloaded first.
gloveCorpus = set(gloveCorpus)
print(len(gloveCorpus))

1783088


### With GloVe, nearly 90% of words are missing, so it is better to go with Google word2vec

In [14]:
# Corpus tokens that have no entry in the GloVe vocabulary.
# NOTE(review): a miss rate this high is unusual for a vocabulary of this size;
# inspect a few entries of diff_wj_glove and gloveCorpus (casing, str vs bytes,
# tokenisation) before treating the figure as conclusive.
diff_wj_glove = word_tokens - gloveCorpus
glove_missing_pct = len(diff_wj_glove) * 100 / len(word_tokens)

print(len(diff_wj_glove))
print(glove_missing_pct, '% of words are missing!!!')

39480
87.4186262787299 % of words are missing!!!
