In [None]:
import os
from nltk.parse import CoreNLPParser
from nltk.tree import ParentedTree, Tree
from conllu.conllu import parse_single

In [161]:
class ExtractTriples(object):
    """Pull a rough (subject, predicate, object) triple out of a parse tree.

    The ``find_*`` methods walk a constituency tree that exposes the
    ``nltk.tree.ParentedTree`` API (``label()``, ``parent()``, ``subtrees()``,
    ``flatten()`` and list-style child access).  Each finder returns either
    ``None`` (nothing suitable found) or a ``(word, attributes)`` pair where
    ``attributes`` is a list of modifier strings collected by ``find_attrs``.
    """

    # Penn Treebank tag families; kept for callers, the finders themselves
    # match on the 'VB' / 'NN' / 'JJ' label prefixes.
    VERBS = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    NOUNS = ['NN', 'NNS', 'NNP', 'NNPS']

    # Sibling labels that count as attributes of a noun.
    _NOUN_MODIFIER_LABELS = ['DT', 'PRP$', 'POS', 'JJ', 'CD', 'ADJP', 'QP', 'NP', 'NNP']

    def __init__(self, filepath_to_conll=None):
        """Optionally load a CoNLL file.

        :param filepath_to_conll: path of a CoNLL file, or a falsy value to
            skip loading (all three attributes then stay ``None``).

        When a path is given, the file is parsed with ``parse_single``, the
        first entry is converted with ``to_tree()`` and the resulting tree is
        printed.
        """
        self.filepath_to_conll = None
        self.tokenlist = None
        self.tokenTree = None
        if filepath_to_conll:
            self.filepath_to_conll = filepath_to_conll
            # Context manager so the handle is closed even if parsing fails
            # (the handle used to be left open).  Both uses of the parsed data
            # stay inside the block in case parsing reads the file lazily.
            with open(filepath_to_conll, "r") as data_file:
                self.tokenlist = parse_single(data_file)
                self.tokenTree = self.tokenlist[0].to_tree()
            print(self.tokenTree)

    @staticmethod
    def _is_tree(child):
        """Return True for subtree children, False for plain-string leaves."""
        return callable(getattr(child, 'label', None))

    @staticmethod
    def _flat_text(subtree):
        """Join all leaf words below ``subtree`` with single spaces."""
        return ' '.join(subtree.flatten())

    def find_subject(self, t):
        """Return ``(word, attrs)`` for the first NN* node inside an NP.

        NP subtrees are visited in ``t.subtrees()`` order; the first one that
        contains a node whose label starts with 'NN' decides the result.
        Returns ``None`` when no such node exists.
        """
        for phrase in t.subtrees(lambda st: st.label() == 'NP'):
            for noun in phrase.subtrees(lambda st: st.label().startswith('NN')):
                return (noun[0], self.find_attrs(noun))
        return None

    def find_predicate(self, t):
        """Return ``(word, attrs)`` for the last VB* node of a VP.

        VP subtrees are visited in ``t.subtrees()`` order; within the first
        one that contains any node whose label starts with 'VB', the last
        such node in traversal order is used.  Returns ``None`` when the tree
        has no VP or no VP contains a verb (previously a verb-less VP raised
        ``TypeError``).
        """
        for phrase in t.subtrees(lambda st: st.label() == 'VP'):
            verb = None
            for candidate in phrase.subtrees(lambda st: st.label().startswith('VB')):
                verb = candidate
            if verb is not None:
                return (verb[0], self.find_attrs(verb))
        return None

    def find_object(self, t):
        """Return ``(word, attrs)`` for the first object-like word in a VP.

        Inside each VP, every NP / PP / ADJP subtree is inspected in traversal
        order: NP and PP contribute their first NN* node, ADJP its first JJ*
        node.  The first hit wins; ``None`` is returned when there is none.
        """
        for phrase in t.subtrees(lambda st: st.label() == 'VP'):
            for part in phrase.subtrees(lambda st: st.label() in ['NP', 'PP', 'ADJP']):
                wanted = 'NN' if part.label() in ['NP', 'PP'] else 'JJ'
                for word in part.subtrees(lambda st: st.label().startswith(wanted)):
                    return (word[0], self.find_attrs(word))
        return None

    def find_attrs(self, node):
        """Collect modifier strings for a JJ*, NN* or VB* node.

        Siblings (children of the node's parent):
          * JJ*: the word of every 'RB' sibling
          * NN*: flattened text of every sibling labelled as a noun modifier
          * VB*: flattened text of every 'ADVP' sibling
        Uncles (children of the grandparent other than the parent):
          * JJ* / NN*: flattened text of every 'PP'
          * VB*: the word of every VB* node

        :param node: a tree node providing ``label()`` and ``parent()``.
        :returns: list of strings; empty for other labels or a parentless node.
        """
        attrs = []
        label = node.label()
        parent = node.parent()
        if parent is None:
            # The node is the root: there are no siblings or uncles to look at.
            return attrs

        # Search siblings (string leaves carry no label and are skipped).
        siblings = [child for child in parent if self._is_tree(child)]
        if label.startswith('JJ'):
            attrs.extend(s[0] for s in siblings if s.label() == 'RB')
        elif label.startswith('NN'):
            attrs.extend(self._flat_text(s) for s in siblings
                         if s.label() in self._NOUN_MODIFIER_LABELS)
        elif label.startswith('VB'):
            attrs.extend(self._flat_text(s) for s in siblings
                         if s.label() == 'ADVP')

        # Search uncles.
        grandparent = parent.parent()
        if grandparent is None:
            # Parent is the root (previously this raised TypeError).
            return attrs
        # Identity test: only the parent itself is excluded, not an uncle
        # that merely has the same structure and words.
        uncles = [child for child in grandparent
                  if child is not parent and self._is_tree(child)]
        if label.startswith('JJ') or label.startswith('NN'):
            attrs.extend(self._flat_text(u) for u in uncles if u.label() == 'PP')
        elif label.startswith('VB'):
            attrs.extend(u[0] for u in uncles if u.label().startswith('VB'))

        return attrs

    def main(self, sentence):
        """Return the ``(subject, predicate, object)`` triple for one parse tree.

        Each element is whatever the matching ``find_*`` method returns, i.e.
        a ``(word, attrs)`` pair or ``None``.
        """
        subject = self.find_subject(sentence)
        predicate = self.find_predicate(sentence)
        object_ = self.find_object(sentence)
        return (subject, predicate, object_)
        
    

In [162]:
# Instantiate without a CoNLL path: __init__ then skips all file loading and
# leaves filepath_to_conll, tokenlist and tokenTree set to None.
textraction = ExtractTriples()

In [163]:
#sentences=['Barack Hussein Obama II (US /bəˈrɑːk huːˈseɪn oʊˈbɑːmə/; born August 4, 1961) is an American politician who is the 44th and current President of the United States. He is the first African American to hold the office and the first president born outside the continental United States. Born in Honolulu, Hawaii, Obama is a graduate of Columbia University and Harvard Law School, where he was president of the Harvard Law Review. He was a community organizer in Chicago before earning his law degree. He worked as a civil rights attorney and taught constitutional law at the University of Chicago Law School between 1992 and 2004. While serving three terms representing the 13th District in the Illinois Senate from 1997 to 2004, he ran unsuccessfully in the Democratic primary for the United States Hou']
from nltk.tokenize import sent_tokenize
text = "Barack Hussein Obama II is an American politician who is the 44th and current President of the United States. He is the first African American to hold the office and the first president born outside the continental United States. Born in Honolulu, Hawaii, Obama is a graduate of Columbia University and Harvard Law School, where he was president of the Harvard Law Review. He was a community organizer in Chicago before earning his law degree. He worked as a civil rights attorney and taught constitutional law at the University of Chicago Law School between 1992 and 2004. While serving three terms representing the 13th District in the Illinois Senate from 1997 to 2004, he ran unsuccessfully in the Democratic primary for the United States Hou"

# CoreNLPParser() is built with its default settings.
# NOTE(review): this presumably needs a reachable CoreNLP server - confirm the URL.
parser = CoreNLPParser()

# For each sentence: parse it, print the sentence, then print the extracted
# (subject, predicate, object) triple.
for sent in sent_tokenize(text):
    # Keep only the first tree returned for the sentence.
    tree = list(parser.raw_parse(sent))[0]
    print(sent)
    # find_attrs needs parent() links, so convert to a ParentedTree - once.
    # The conversion used to be done twice, with this result left unused.
    ptree = ParentedTree.convert(tree)
    triple = textraction.main(ptree)
    print(triple)

Barack Hussein Obama II is an American politician who is the 44th and current President of the United States.
((u'Barack', [u'Barack', u'Hussein', u'Obama', u'II']), (u'is', []), (u'politician', [u'an', u'American']))
He is the first African American to hold the office and the first president born outside the continental United States.
((u'office', [u'the']), (u'born', []), (u'office', [u'the']))
Born in Honolulu, Hawaii, Obama is a graduate of Columbia University and Harvard Law School, where he was president of the Harvard Law Review.
((u'Honolulu', [u'Honolulu', u'Hawaii']), (u'Born', []), (u'Honolulu', [u'Honolulu', u'Hawaii']))
He was a community organizer in Chicago before earning his law degree.
((u'community', [u'a', u'in Chicago']), (u'earning', []), (u'community', [u'a', u'in Chicago']))
He worked as a civil rights attorney and taught constitutional law at the University of Chicago Law School between 1992 and 2004.
((u'rights', [u'a', u'civil']), (u'taught', []), (u'rights', 

In [164]:
# Parse the first example sentence on its own and print its constituency tree
# so the structure the extractor walks can be inspected.
# Note: this rebinds the name `tree`, which the sentence loop in the earlier
# cell also assigns.
tree = list(parser.raw_parse("Barack Hussein Obama II is an American politician who is the 44th and current President of the United States."))[0]
# Alternative left from development: parse whatever `sent` was last bound to.
#tree = list(parser.raw_parse(sent))[0]
print(tree)

(ROOT
  (S
    (NP (NNP Barack) (NNP Hussein) (NNP Obama) (NNP II))
    (VP
      (VBZ is)
      (NP
        (NP (DT an) (JJ American) (NN politician))
        (SBAR
          (WHNP (WP who))
          (S
            (VP
              (VBZ is)
              (NP
                (NP
                  (DT the)
                  (JJ 44th)
                  (CC and)
                  (JJ current)
                  (NN President))
                (PP (IN of) (NP (DT the) (NNP United) (NNPS States)))))))))
    (. .)))
