In [1]:
# Penn Treebank sample corpus, accessed through NLTK.
import nltk
import re
# Fetch the 'treebank' package into the local nltk_data directory;
# the value returned by nltk.download is kept in `out`.
out = nltk.download('treebank')

from nltk.corpus import treebank

[nltk_data] Downloading package treebank to
[nltk_data]     /home/hyohyeongjang/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


In [22]:
class Node:
    """One vertex of a constituency tree.

    Public fields:
        parent    -- the node directly above this one (None until linked)
        value     -- payload stored on the node (None until set)
        daughters -- child nodes, in insertion order
        command   -- starts as None; this class never modifies it
    """

    def __init__(self):
        self.parent = None
        self.value = None
        self.daughters = list()
        self.command = None

    def setParent(self, node):
        """Store `node` as this node's parent."""
        self.parent = node

    def setValue(self, node):
        """Store the argument as this node's value."""
        self.value = node

    def setdaughters(self, node):
        """Replace the whole `daughters` attribute with the argument."""
        self.daughters = node
    
    
class ConstituencyTree():
    """Container for constituency trees built from bracket/token sequences.

    Attributes:
        nodes:     every node that received a value token, in input order.
                   The list accumulates across all makeTree calls.
        sentences: the root node created by each makeTree call.
        head:      root of the most recently built tree (None before the
                   first makeTree call).
    """

    def __init__(self):
        self.nodes = []
        self.sentences = []
        # Previously this method appended `self.head` to `sentences` before
        # the attribute was ever assigned, so constructing the class raised
        # AttributeError. Roots are registered by makeTree instead.
        self.head = None

    def makeTree(self, string_list):
        """Build one tree from a flat token sequence and register its root.

        Args:
            string_list: iterable of tokens. "(" opens a new daughter of the
                current node, ")" moves back up to the parent, and any other
                token is stored as the value of the current node.

        The new root is appended to `self.sentences` and stored in
        `self.head`; every node that receives a value is appended to
        `self.nodes`. Returns None.
        """
        curnode = Node()
        self.head = curnode
        self.sentences.append(curnode)

        for token in string_list:
            if token == "(":
                child = Node()
                child.parent = curnode
                curnode.daughters.append(child)
                curnode = child
            elif token == ")":
                # NOTE: an unmatched ")" at root level leaves curnode as None
                # (the root has no parent); input is expected to be balanced.
                curnode = curnode.parent
            else:
                curnode.value = token
                self.nodes.append(curnode)

    def decode(self, node):
        """Print the value of `node` and of every descendant, depth-first."""
        print(node.value)
        for daughtersNode in node.daughters:
            self.decode(daughtersNode)

    def getSubordinateNodes(self, node):
        """Return the valued/leaf nodes found under `node` as a nested list.

        A node without daughters, or a node registered in `self.nodes`,
        yields `[node]`. Any other node yields a list holding one such result
        per daughter, so the nesting mirrors the tree shape.
        """
        # Both terminating cases are checked before descending; the earlier
        # code recursed through the whole subtree first and then discarded
        # the result whenever the node itself was in self.nodes.
        if len(node.daughters) == 0 or node in self.nodes:
            return [node]
        return [self.getSubordinateNodes(child) for child in node.daughters]

    def getparser(self):
        """Fill in `command` for every node in `self.nodes`.

        `command` becomes a list with one getSubordinateNodes result for each
        daughter of the node's parent (the node itself included).
        """
        for node in self.nodes:
            # A valued node without a parent (a value token seen before any
            # "(") has no siblings; treat it as its own only sibling instead
            # of failing on `None.daughters`.
            siblings = node.parent.daughters if node.parent is not None else [node]
            node.command = [self.getSubordinateNodes(i) for i in siblings]
            
            
def flattenList(nested_list):
    """Return a new flat list with the items of `nested_list` in order.

    Every element that is itself a `list` is expanded recursively; elements
    of any other type (tuples included) are kept as single items.
    """
    collected = []
    for item in nested_list:
        collected += flattenList(item) if isinstance(item, list) else [item]
    return collected


In [50]:
class ConstituencyParser():
    """Stateless helpers that turn Penn Treebank bracketed text into trees.

    All methods are static and can be called on the class itself.
    """

    def __init__(self):
        pass

    @staticmethod
    def preprocessing(raw_string=None):
        """Strip labels from bracketed treebank text and split it by sentence.

        Args:
            raw_string: bracketed treebank text. When omitted (None), the text
                is read from `treebank.raw()`.

        Returns:
            A list of strings, one per top-level balanced bracket group, with
            newlines, spaces and constituent/POS labels removed, e.g.
            '((((Mr.)(Vinken))((is))(.)))'. Text after the last balanced
            group is dropped.
        """
        if raw_string is None:
            # Resolved at call time. A `treebank.raw()` default would be
            # evaluated once, when the class body is executed, loading the
            # whole corpus even if the caller always passes its own text.
            raw_string = treebank.raw()

        x = re.sub("\n", "", raw_string)
        # "(LABEL " -> "(": labels such as S, NP-SBJ, NP-SBJ-1, PRP$.
        # The second letter class used to read [a-zA-z], which also accepted
        # the characters [ \ ] ^ _ and backtick.
        x = re.sub(r'\([a-zA-Z]+-{0,1}[a-zA-Z]{0,}-{0,1}\d{0,}\$? ', "(", x)
        # Drop one punctuation character directly after "(" (punctuation tags
        # like "(. .)", and the leading "-" of "(-NONE- ...").
        x = re.sub(r"\([-.,:\"\`\"\'?@!`]", "(", x)
        x = re.sub(" ", "", x)
        # Remove bracketed trace tokens such as "(*-1)".
        x = re.sub(r"\(\*[a-zA-Z]{0,1}\*{0,1}[a-zA-Z]{0,1}-[0-9]+\)", "", x)
        # Collapse what is left of a "NONE..." element up to its closing ")".
        x = re.sub(r"NONE[^\)]*\)", ")", x)

        # Cut the text every time the bracket depth returns to zero.
        sentences = []
        depth = 0
        start = 0
        for pos, char in enumerate(x):
            if char == "(":
                depth += 1
            elif char == ")":
                depth -= 1

            if depth == 0:
                sentences.append(x[start:pos + 1])
                start = pos + 1

        return sentences

    @staticmethod
    def tokenize(preprocessed):
        """Return the word tokens of each preprocessed sentence string.

        Args:
            preprocessed: iterable of bracketed strings as returned by
                `preprocessing`.

        Returns:
            A list of token lists, one per input string, with all brackets
            removed.
        """
        # Whitespace split() yields no empty strings. The previous
        # split(" ") followed by [1:] removed the leading empty token but
        # left a trailing '' at the end of every sentence.
        return [re.sub(r"[\(\)]+", " ", sentence).split() for sentence in preprocessed]

    @staticmethod
    def parse(preprocessed, tree, idx=None, sentence_index=1):
        """Build one sentence into `tree`, print each node's command list, return `tree`.

        Args:
            preprocessed: list of bracketed sentence strings as returned by
                `preprocessing`.
            tree: object providing makeTree(tokens), getparser() and a
                `nodes` list whose items have `value` and `command`.
            idx: if given, only `preprocessed[:idx]` is considered; None
                means the whole list. (Previously None iterated over the
                characters of the first sentence.)
            sentence_index: position, within the considered slice, of the
                sentence to build. The default of 1 keeps the index this
                method has always used.

        Raises:
            IndexError: if the considered slice has no item at
                `sentence_index`.

        Relies on a module-level `flattenList` to flatten `node.command`.
        """
        candidates = preprocessed if idx is None else preprocessed[:idx]
        sentence = candidates[sentence_index]

        # Only the selected sentence is tokenized: put spaces around every
        # bracket, squeeze whitespace, then split into "(", ")" and words.
        spaced = re.sub(r"([\(\)])", r" \1 ", sentence)
        tokens = re.sub(r"\s+", " ", spaced).strip().split(" ")

        tree.makeTree(tokens)
        tree.getparser()

        for node in tree.nodes:
            print(node.value, [i.value for i in flattenList(node.command)])
        return tree

In [51]:
# Strip labels from the raw treebank text and split it into one bracketed string per sentence.
preprocessed = ConstituencyParser.preprocessing(treebank.raw())
# Word tokens for at most the first 10000 sentence strings.
tokenized = ConstituencyParser.tokenize(preprocessed[:10000])
tree = ConstituencyTree()
# parse() looks at preprocessed[:3], builds the entry at index 1 into `tree`,
# prints every valued node with its flattened command list, and returns `tree`.
parsed = ConstituencyParser.parse(preprocessed, tree, 3)


Mr. ['Mr.', 'Vinken']
Vinken ['Mr.', 'Vinken']
is ['is', 'chairman', 'of', 'Elsevier', 'N.V.', ',', 'the', 'Dutch', 'publishing', 'group']
chairman ['chairman']
of ['of', 'Elsevier', 'N.V.', ',', 'the', 'Dutch', 'publishing', 'group']
Elsevier ['Elsevier', 'N.V.']
N.V. ['Elsevier', 'N.V.']
, ['Elsevier', 'N.V.', ',', 'the', 'Dutch', 'publishing', 'group']
the ['the', 'Dutch', 'publishing', 'group']
Dutch ['the', 'Dutch', 'publishing', 'group']
publishing ['the', 'Dutch', 'publishing', 'group']
group ['the', 'Dutch', 'publishing', 'group']
. ['Mr.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N.V.', ',', 'the', 'Dutch', 'publishing', 'group', '.']


In [23]:
# NOTE(review): Node stores its children in `daughters`, not `daughter`, so this
# cell and its recorded output appear to come from an earlier version of the
# classes — confirm before re-running.
tree.head.daughter[0].daughter[0].daughter[0].daughter[0].value

'Vinken'

In [98]:
# NOTE(review): no cell shown here assigns a module-level `lst`; the recorded
# output looks like one sentence string in the format returned by
# ConstituencyParser.preprocessing — confirm where `lst` comes from.
lst[0]

'(((((Pierre)(Vinken))(,)(((61)(years))(old))(,))((will)((join)((the)(board))((as)((a)(nonexecutive)(director)))((Nov.)(29))))(.)))'

In [12]:
# NOTE(review): the only visible assignment of `lst1` is in the In [9] cell,
# whose recorded run ended in KeyboardInterrupt — confirm how this output was produced.
lst1[0]

['(',
 '(',
 '(',
 '(',
 '( Pierre )',
 '( Vinken )',
 ')',
 '( , )',
 '(',
 '(',
 '( 61 )',
 '( years )',
 ')',
 '( old )',
 ')',
 '( , )',
 ')',
 '(',
 '( will )',
 '(',
 '( join )',
 '(',
 '( the )',
 '( board )',
 ')',
 '(',
 '( as )',
 '(',
 '( a )',
 '( nonexecutive )',
 '( director )',
 ')',
 ')',
 '(',
 '( Nov. )',
 '( 29 )',
 ')',
 ')',
 ')',
 '( . )',
 ')',
 ') ']

In [45]:
# NOTE(review): uses `daughter`, while Node defines `daughters`; this cell
# presumably dates from an earlier version of the classes — confirm before re-running.
tree.head.daughter[0].value

'((((Pierre'

19

In [9]:
lst1 = []
# NOTE(review): `lst` is not assigned at module level in any cell shown here — confirm its source.
for x in lst:

    # The per-item transformation below is commented out, so `x` is never used
    # and the same `out` object is appended once per element of `lst`.
    # NOTE(review): the pattern in the commented line opens a character class
    # with "[" and never closes it — fix before re-enabling.
#     out = re.sub("[\-.,:`\"\'?@!`", "", out)
    
    lst1.append(out)
    

KeyboardInterrupt: 

In [None]:
# Displays the entire list; a slice such as lst1[:5] keeps the output short.
lst1

In [107]:
# Sanity check: a bare trace token such as "(*-1)" is removed entirely.
# Raw string, so "\(" and "\*" reach the regex engine without being treated
# as (invalid) string escape sequences.
re.sub(r"\(\*[a-zA-Z]{0,1}\*{0,1}[a-zA-Z]{0,1}-[0-9]+\)", "", "(*-1)")

''