In [48]:
import gzip
import numpy as np
import random
import os

from collections import Counter, defaultdict, namedtuple
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

In [37]:
# Syntactic Features

# We use the shortest path between an entity mention and the question word in the dependency tree
# as input to the first channel.

"""
parser example: 'Who did shaq play for'

[(('play', 'VB'), 'aux', ('did', 'VBD')),
 (('play', 'VB'), 'nsubj', ('shaq', 'NNP')),
 (('play', 'VB'), 'prep', ('for', 'IN')),
 (('for', 'IN'), 'pobj', ('Who', 'WP'))]
 
"""
import nltk
from nltk.parse.stanford import StanfordDependencyParser

def set_parser(path_to_jar, path_to_models_jar):
    """
    builds the Stanford dependency parser so that one instance can be reused
    for every sentence parsed afterwards

    path_to_jar = location of the Stanford parser jar
    path_to_models_jar = location of the matching models jar

    returns the StanfordDependencyParser instance
    """
    return StanfordDependencyParser(
        path_to_jar=path_to_jar,
        path_to_models_jar=path_to_models_jar,
    )
    
    
def get_tree(dependency_parser, sentence):
    """
    parses one sentence with the parser built by set_parser and returns the
    dependency triples of the first parse as a list

    each triple is ((head word, head tag), label, (dependent word, dependent tag))

    parser example: 'Who did shaq play for'

    [(('play', 'VB'), 'aux', ('did', 'VBD')),
     (('play', 'VB'), 'nsubj', ('shaq', 'NNP')),
     (('play', 'VB'), 'prep', ('for', 'IN')),
     (('for', 'IN'), 'pobj', ('Who', 'WP'))]

    the sentence being parsed is printed as a progress message
    """
    print("parsing sentence: ", sentence)
    parses = dependency_parser.raw_parse(sentence)
    # raw_parse hands back an iterator; only its first item is used
    first_parse = next(parses)
    return [triple for triple in first_parse.triples()]


def get_shortest_path(tree, mention, question_word, verbose = False):
    """
    finds the shortest path between two words in a parsed dependency tree

    tree = list of triples as returned by get_tree, e.g. for 'Who did shaq play for'

    [(('play', 'VB'), 'aux', ('did', 'VBD')),
     (('play', 'VB'), 'nsubj', ('shaq', 'NNP')),
     (('play', 'VB'), 'prep', ('for', 'IN')),
     (('for', 'IN'), 'pobj', ('Who', 'WP'))]

    mention = entity word the path starts from
    question_word = word the path ends at (Who/Where/Why/How)
    verbose = print every word that is expanded and every edge that is followed

    words are matched by their surface string, so repeated occurrences of the
    same word in a sentence are treated as one node

    returns word_q, label_path, edge_path where
        word_q     = words on the path, from mention to question_word
        label_path = dependency label of each edge on the path,
                     len(label_path) == len(word_q) - 1
        edge_path  = direction of each edge on the path: 'left' when the edge is
                     walked from head to dependent, 'right' when it is walked
                     from dependent to head

    edges that were explored but do not lie on the path are NOT reported, so the
    three lists always describe the same path

    raises ValueError when mention and question_word are not connected in tree

    example of using this function:

    tree = get_tree(dependency_parser, 'Who did shaq play for')
    word_q, label_path, edge_path = get_shortest_path(tree, 'shaq', 'Who')

    word_q:     ['shaq', 'play', 'for', 'Who']
    label_path: ['nsubj', 'prep', 'pobj']
    edge_path:  ['right', 'left', 'left']

    """
    if mention == question_word:
        return [mention], [], []

    # undirected adjacency list: word -> [(neighbour, label, direction, triple), ...]
    neighbours = {}
    for triple in tree:
        head, label, dependent = triple[0][0], triple[1], triple[2][0]
        neighbours.setdefault(head, []).append((dependent, label, 'left', triple))
        neighbours.setdefault(dependent, []).append((head, label, 'right', triple))

    # breadth-first search, one level at a time; came_from remembers the edge
    # through which each word was first reached so the path can be rebuilt
    came_from = {mention: None}
    frontier = [mention]
    while frontier and question_word not in came_from:
        next_frontier = []
        for word in frontier:
            if verbose: print("looking for word: ", word)
            for neighbour, label, direction, triple in neighbours.get(word, ()):
                if neighbour in came_from:
                    continue
                if verbose: print('found word here: ', triple)
                came_from[neighbour] = (word, label, direction)
                next_frontier.append(neighbour)
        frontier = next_frontier

    if question_word not in came_from:
        raise ValueError(
            "no dependency path between %r and %r" % (mention, question_word))

    # walk back from the question word to the mention, then flip the lists
    word_q = [question_word]
    label_path = []
    edge_path = []
    step = came_from[question_word]
    while step is not None:
        previous_word, label, direction = step
        word_q.append(previous_word)
        label_path.append(label)
        edge_path.append(direction)
        step = came_from[previous_word]
    word_q.reverse()
    label_path.reverse()
    edge_path.reverse()

    return word_q, label_path, edge_path



def extract_feature(word_q, label_path, edge_path, mention1, mention2):
    """
    turns a dependency path into the token list that is later embedded with GloVe
    and fed to the convolution layer

    word_q = words on the path (first and last are expected to be the two mentions)
    label_path = dependency label of each edge on the path
    edge_path = edge directions; accepted for interface compatibility, not used
    mention1, mention2 = the two words whose path was extracted

    the token list starts with 'left' and ends with 'right'; every word on the
    path is followed by 'middle' except the last one; a word that is one of the
    mentions is replaced by the label of the edge next to it (the first label
    for the first word, the last label otherwise)

    example:

    word_q: ['shaq', 'play', 'for', 'Who']
    label_path: ['nsubj', 'prep', 'pobj']
    edge_path: ['right', 'left', 'left']

    output: ['left', 'nsubj', 'middle', 'play', 'middle', 'for', 'middle', 'pobj', 'right']

    raises ValueError when a mention has to be replaced but label_path is empty
    raises AssertionError when the result does not have 2 * len(word_q) + 1 tokens
    (e.g. a mention in the middle of the path, or a path not ending in a mention)

    """
    entities = {mention1, mention2}
    path = ['left']
    for idx, val in enumerate(word_q):
        if val in entities:
            if not label_path:
                raise ValueError("label_path is empty: the path has no edges")
            # index from the matching end of label_path rather than by idx, so a
            # label_path that is one shorter than word_q is handled correctly
            path.append(label_path[0] if idx == 0 else label_path[-1])
            if idx == 0:
                path.append('middle')
        else:
            path.append(val)
            path.append('middle')
    path.append('right')
    assert len(path) == len(word_q) * 2 + 1, \
        "path must start and end with a mention and contain none in between"
    return path
    
    
    
def load_GloVe(gloveFile):
    """
    loads pre_trained glove vectors from a text file with one entry per line:
    the word followed by its whitespace-separated vector components

    example file name: 
    /Users/hujiayu/Documents/GitHub/cs224u/vsmdata/glove.6B/glove.6B.50d.txt
    
    can use one of the following dimensions:
    
    glove.6B.50d.txt
    glove.6B.100d.txt
    glove.6B.200d.txt
    glove.6B.300d.txt
    
    blank lines are skipped; progress is reported with print

    returns a dictionary with 
    key = word, and value = vector representation (np array of floats)

    """
    print("Loading GloVe vectors..")
    model = {}
    # read as UTF-8 explicitly so loading does not depend on the platform's
    # default encoding, and use `with` so the handle is always closed
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print("Done.",len(model)," words loaded!")
    return model
    
    
    
def get_word_repre(features, glove_model, dim, mapping=None):
    """
    This function takes in a list of words and represents them using 
    word vector representations from GloVe
    
    features = list of tokens, e.g.
    ['left', 'nsubj', 'middle', 'play', 'middle', 'for', 'middle', 'pobj', 'right']
    glove_model = dictionary word -> vector
    dim = length of the vectors in glove_model
    mapping = optional dictionary from tokens missing in GloVe (such as
              dependency labels) to a more general word; when omitted, a
              notebook-level variable named `mapping` is used if one exists

    for a token that is not in glove_model:
      - if mapping has a replacement that is in glove_model, its vector is used
      - otherwise a random normal vector of length dim is used

    returns np array matrix of dimension len(features) x dim
    
    """
    if mapping is None:
        # earlier callers relied on a global `mapping`; keep that working
        mapping = globals().get('mapping') or {}

    res = []
    for w in features:
        try:
            res.append(glove_model[w])
            continue
        except KeyError:
            print('word %s is not in GloVe' % w)

        general = convert_words(mapping, w) if w in mapping else None
        if general is not None and general in glove_model:
            print('adding word representation for %s instead '% general)
            res.append(glove_model[general])
        else:
            # size=dim gives a vector of length dim; passing the shape
            # positionally would be read as the mean of the distribution
            res.append(np.random.normal(size=dim))

    if not res:
        return np.empty((0, dim))
    return np.array(res)


def convert_words(mapping, word):
    """
    looks up the more general form of a syntax word that GloVe does not contain,
    for example nsubj -> subject and pobj -> object

    mapping = dictionary from syntax word to general word
    word = the syntax word to convert; it must be a key of mapping
    """
    general_form = mapping[word]
    return general_form
    

In [39]:
# This cell uses the functions defined above and calls for syntactic features

# The default locations below are specific to one machine; set the environment
# variables STANFORD_PARSER_JAR, STANFORD_PARSER_MODELS_JAR and GLOVE_FILE to
# point somewhere else without editing the notebook.
path_to_jar = os.environ.get(
    'STANFORD_PARSER_JAR',
    '/Users/hujiayu/Downloads/stanford-parser-full-2014-08-27/stanford-parser.jar')
path_to_models_jar = os.environ.get(
    'STANFORD_PARSER_MODELS_JAR',
    '/Users/hujiayu/Downloads/stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models.jar')

# dependency labels that are replaced by a more general word before the GloVe lookup
mapping = {'nsubj':'subject', 'pobj':'object', 'prep':'preposition', 'aux':'auxiliary'}

gloveFile = os.environ.get(
    'GLOVE_FILE',
    '/Users/hujiayu/Documents/GitHub/cs224u/vsmdata/glove.6B/glove.6B.50d.txt')
# must match the dimension of the vectors stored in gloveFile
dim = 50

sentence = 'Who did shaq first play for?'
mention1 = 'shaq'
mention2 = 'Who'

if __name__ == '__main__':

    dependency_parser = set_parser(path_to_jar, path_to_models_jar)

    tree = get_tree(dependency_parser, sentence)

    word_q, label_path, edge_path = get_shortest_path(tree, mention1, mention2, verbose = False)

    features = extract_feature(word_q, label_path, edge_path, mention1, mention2)

    glove_model = load_GloVe(gloveFile)
    syntactic_feature = get_word_repre(features, glove_model, dim)
    # report the shape of the array computed just above
    print('syntactic feature vector is of shape: ', syntactic_feature.shape)

parsing sentence:  Who did shaq first play for?
Loading GloVe vectors..
Done. 400000  words loaded!
word nsubj is not in GloVe
adding word representation for subject instead 
word pobj is not in GloVe
adding word representation for object instead 
syntactic feature vector is of shape:  (9, 50)


In [42]:
# Sentential Features
# According to the paper, this channel takes the words in the sentence as input, excluding the question
# word and the entity mention. For the example above, "who did shaq first play for ?",
# the vectors for did, first, play and for are fed into this channel.

def sentential_feature(sentence, glove_model, mention1, mention2):
    """
    this function takes in the raw sentence, pre_trained glove model, and two mention words
    and outputs the glove word representation of all words in the sentence except the
    mention words

    sentence = whitespace-separated sentence such as 'Who did shaq first play for';
               one trailing '?' is removed, an empty sentence is allowed
    glove_model = dictionary word -> vector
    mention1, mention2 = words that are left out (entity mention and question word)

    a word that is not in glove_model as written is looked up in lowercase

    returns np array with one row per remaining word (shape (0,) when no word remains);
    the shape is also printed

    raises KeyError when a word is in glove_model neither as written nor in lowercase
    """
    res = []
    entities = set([mention1, mention2])
    # endswith is safe on an empty string, unlike indexing the last character
    if sentence.endswith('?'): sentence = sentence[:-1]
    for w in sentence.split():
        if w in entities:
            continue
        # NOTE(review): the glove.6B vocabulary is presumably all lowercase, so
        # a capitalised word falls back to its lowercase form - confirm
        key = w if w in glove_model else w.lower()
        res.append(glove_model[key])
    res = np.array(res)
    print('sentential feature is of shape: ', res.shape)
    return res

# NOTE(review): this rebinds the name `sentential_feature` from the function to the
# array it returns, so the function cannot be called again under this name until its
# def is re-executed. The array is read under this same name further down.
sentential_feature = sentential_feature(sentence, glove_model, mention1, mention2)

sentential feature is of shape:  (4, 50)


In [3]:
# Usage example for the dependency parser: show the triples of one question.
# The bracketed words are the question word and the entity mention:
#   [Who] did [shaq] first play for ?
# Another sentence to try: 'I shot an elephant in my sleep'

result = dependency_parser.raw_parse('Who did shaq first play for')
# only the first parse produced by the parser is inspected
dep = next(result)
list(dep.triples())

[(('play', 'VB'), 'aux', ('did', 'VBD')),
 (('play', 'VB'), 'nsubj', ('shaq', 'NNP')),
 (('play', 'VB'), 'prep', ('for', 'IN')),
 (('for', 'IN'), 'pobj', ('Who', 'WP'))]

In [47]:
# MCCNNs for relation classification
# Both feature channels are combined here and will be fed into a convolutional network.

# According to the paper, the convolution layer takes an input of varying length and returns a
# fixed-length vector (using max pooling) for each channel. These fixed-length vectors are
# concatenated and fed into a softmax classifier whose output dimension equals the number of
# predefined relation types; each output value is the confidence score of that relation.

# stack the rows of the two channels on top of each other
combined_feature = np.concatenate([sentential_feature, syntactic_feature], axis=0)
combined_feature.shape
# TODO: the two channels should be combined after the convolutional layer, not before it;
# to be reworked.

(13, 50)