In [117]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import collections
import random
import sys
from operator import itemgetter

In [79]:
import os

path = os.path.join("data","en-ud-train.conllu")

def read(path):
    '''Constructs a list(corpus) of lists(sentences) of dicts(relevant information per word)'''
    with open(path, 'r', encoding = 'utf8') as f:
        
        data =[]
        
        # Needed to select sentences
        first_sent = True  
        previous_line_hashtag = False
        
        #linenumber for debugging
        linenumber = 0
        
        for line in f:
            
            #empty linedict
            linedict = {}
            linenumber +=1
            
            # Skip newdoc id and skip sentence id. Skip whitelines
            if line[0] in ['#','r\n','\n']:
                if line[0] in ['#']:
                    previous_line_hashtag = True                    
                continue
                
            
            # empty list for sentence
            if (line[0] == '1' and previous_line_hashtag == True):
                previous_line_hashtag = False
                if first_sent == True:
                    sentence = []
                    first_sent = False
                else:
                    data.append(sentence)
                    sentence = []
            
            # strip tabs
            parts = line.strip().split("\t") 
            
            # Make sure that all lines have the same structure 
            if len(parts) != 10:
                print(linenumber)
                print(parts)
                print('not same structure')
                break
            
            # fill linedict
            for i, part in enumerate(parts):
                
                # only save the relevant parts
                if i not in [0,1,3,6,7]:
                    continue
                linedict[i] = part
            
            sentence.append(linedict)

        # Check if it parsed all lines
        print('lines parsed: ',linenumber, 'of the 242774')
        return data

eng_train = read(path)
print('Number of sentences:', len(eng_train),'\n')
print('Example of datastructure: \n\n', eng_train[0:2])

lines parsed:  242774 of the 242774
Number of sentences: 12542 

Example of datastructure: 

 [[{0: '1', 1: 'Al', 3: 'PROPN', 6: '0', 7: 'root'}, {0: '2', 1: '-', 3: 'PUNCT', 6: '1', 7: 'punct'}, {0: '3', 1: 'Zaman', 3: 'PROPN', 6: '1', 7: 'flat'}, {0: '4', 1: ':', 3: 'PUNCT', 6: '1', 7: 'punct'}, {0: '5', 1: 'American', 3: 'ADJ', 6: '6', 7: 'amod'}, {0: '6', 1: 'forces', 3: 'NOUN', 6: '7', 7: 'nsubj'}, {0: '7', 1: 'killed', 3: 'VERB', 6: '1', 7: 'parataxis'}, {0: '8', 1: 'Shaikh', 3: 'PROPN', 6: '7', 7: 'obj'}, {0: '9', 1: 'Abdullah', 3: 'PROPN', 6: '8', 7: 'flat'}, {0: '10', 1: 'al', 3: 'PROPN', 6: '8', 7: 'flat'}, {0: '11', 1: '-', 3: 'PUNCT', 6: '8', 7: 'punct'}, {0: '12', 1: 'Ani', 3: 'PROPN', 6: '8', 7: 'flat'}, {0: '13', 1: ',', 3: 'PUNCT', 6: '8', 7: 'punct'}, {0: '14', 1: 'the', 3: 'DET', 6: '15', 7: 'det'}, {0: '15', 1: 'preacher', 3: 'NOUN', 6: '8', 7: 'appos'}, {0: '16', 1: 'at', 3: 'ADP', 6: '18', 7: 'case'}, {0: '17', 1: 'the', 3: 'DET', 6: '18', 7: 'det'}, {0: '18', 1: '

In [54]:
# Construct sentences from the training data, default (for development purposes) is 1 sentence
def construct_sentences(corpus, n=1):
    '''Constructs a list of dicts(sentences) with words and indices of the words (and the <root> symbol)'''
    
    sentences = []
    n=3

    
    for sent in corpus[0:n]:
        # create mappings from index to words for faster computability
        i2words = {}
        
        for i, word in enumerate(sent):
#             Append 'root word'
            if word[0]=='1':
                i2words[i] = '<root>'
                
            i2words[i+1] = word[1]
            
        sentences.append(i2words)
    return sentences
        
        
sentences = construct_sentences(eng_train)
print(sentences)
    

[{0: '<root>', 1: 'Al', 2: '-', 3: 'Zaman', 4: ':', 5: 'American', 6: 'forces', 7: 'killed', 8: 'Shaikh', 9: 'Abdullah', 10: 'al', 11: '-', 12: 'Ani', 13: ',', 14: 'the', 15: 'preacher', 16: 'at', 17: 'the', 18: 'mosque', 19: 'in', 20: 'the', 21: 'town', 22: 'of', 23: 'Qaim', 24: ',', 25: 'near', 26: 'the', 27: 'Syrian', 28: 'border', 29: '.'}, {0: '<root>', 1: '[', 2: 'This', 3: 'killing', 4: 'of', 5: 'a', 6: 'respected', 7: 'cleric', 8: 'will', 9: 'be', 10: 'causing', 11: 'us', 12: 'trouble', 13: 'for', 14: 'years', 15: 'to', 16: 'come', 17: '.', 18: ']'}, {0: '<root>', 1: 'DPA', 2: ':', 3: 'Iraqi', 4: 'authorities', 5: 'announced', 6: 'that', 7: 'they', 8: 'had', 9: 'busted', 10: 'up', 11: '3', 12: 'terrorist', 13: 'cells', 14: 'operating', 15: 'in', 16: 'Baghdad', 17: '.'}]


In [118]:

def construct_graph(sentences, n=1):
    '''Constructs a list of dicts(sentences) where each sentence is represented as a graph,
    inspired by the Python documentation recommended structure (https://www.python.org/doc/essays/graphs/). 
    
    graphs = [graph_of_sentence_1, graph_of_sentnce_2,...]
    
    Where each graph has the structure:
        
    graph_i = {to (index of a word within a sentence): [(from, weight), (from, weight)],
             2: [(1, 0.5323), (3, 0.3452345),....,(n, weight)],
             ...}
    
             
    '''
    
    graphs = []
    graph = {}
    n=2
    
    # Create (fully conected) graph 
    # TODO: (with random weights at the moment, should change that to the weights given by the Neural Net)
    for sent in sentences[0:n]:
        
        length = len(sent)  
        for to_index, word in sent.items():
            
            # Make sure there is nog connection from a node to itself
            #TODO: Volgens mij hoeven we maar 1 keer de probabilities van elke arc er in te stoppen (dit wordt niet geupdate)
            # Daarom gebruik ik hieronder tuples, maar anders moeten we misschien iets anders gebruiken
            from_index = [i for i in range(length) if i != to_index]
            graph[to_index] = [(i: random.random()) for i in from_index]
            
        graphs.append(graph)
        
    return graphs
             
graphs = construct_graph(sentences)
print(graphs[:1])

SyntaxError: invalid syntax (<ipython-input-118-366fa06800b3>, line 31)

In [112]:
def MST(graphs, n=1):
    # n=1 for development purposes
    '''Takes a list of graphs and returns the Maximum Spanning Tree per graph by using the Chu Liu Edmonds Algorithm'''

    for sentence in graphs[0:n]:
        # find maximum incomming arcs
        for word_index, incomming_arcs in sentence.items():
            
            break
        
        # find all cycles
        

        
        
MST(graphs)


NameError: name 'dictionary' is not defined