# Part II

### a) imports

In [32]:
import glob
import os
from string import punctuation

### b) globals

In [33]:
# Base directory for inputs; read_and_store_input() looks in its "output_of_first/" subfolder.
# Paths are built by plain string concatenation, so the trailing slash is required.
input_path = "./in/"
# Prefix that export() prepends to every output file name (trailing slash required as well).
output_path = "./out/"

### c) read input

In [34]:
def read_and_store_input(input_path):
    """Load the non-empty lines of every ``*.txt_prep.`` file below ``input_path``.

    Files are looked up in ``input_path + 'output_of_first/'``. The prefix is
    joined by plain string concatenation, so ``input_path`` must end with a
    path separator.

    Parameters
    ----------
    input_path : str
        Directory prefix that contains the ``output_of_first`` folder.

    Returns
    -------
    dict[str, list[str]]
        Maps each file's base name to its lines (line endings are kept),
        skipping lines that are empty or contain only whitespace.
    """
    data = {}

    # sorted() gives a reproducible key order; glob itself promises none
    input_files = sorted(glob.glob(input_path + 'output_of_first/*.txt_prep.'))

    for file_path in input_files:
        # basename copes with '/' as well as the platform separator glob may insert
        file_name = os.path.basename(file_path)
        # the context manager closes the handle even when reading fails
        with open(file_path, 'r') as content:
            data[file_name] = [line for line in content if line.strip() != ""]
    return data


In [35]:
# load every prepared text file into a {file name: [lines]} dictionary
data = read_and_store_input(input_path)

In [24]:
# inspect the loaded data (this echoes the complete dictionary)
data

{'03.txt_prep.': ['Plato or a was an Athenian philosopher during the Classical period in Ancient Greece founder of the Platonist school of thought and the Academy the first institution of higher learning in the Western world \n',
  'He is widely considered the pivotal figure in the history of Ancient Greek and Western philosophy along with his teacher Socrates and his most famous student aristotle \n',
  'Plato has also often been cited as one of the founders of Western religion and spirituality The socalled Neoplatonism of philosophers like Plotinus and Porphyry greatly influenced Christianity through Church Fathers such as augustine Alfred North Whitehead once noted the safest general characterization of the European philosophical tradition is that it consists of a series of footnotes to plato Plato was the innovator of the written dialogue and dialectic forms in philosophy Plato is also considered the founder of Western political philosophy His most famous contribution is the theory

### d) import and load the third-party library (spaCy) for dependency parsing

In [25]:
# import sys
import spacy
# !{sys.executable} -m spacy download en

In [26]:
# ask spaCy to run on a GPU when one is available (see the spacy.prefer_gpu docs)
spacy.prefer_gpu()
# shared pipeline object; dependency_parsing() calls it through the global name `nlp`
nlp = spacy.load("en_core_web_sm")

In [36]:
def tokenize(line):
    """Return the whitespace-separated words of ``line`` with punctuation removed.

    Every character listed in ``string.punctuation`` is dropped before the
    split, so an empty or punctuation-only line yields an empty list.
    """
    kept_chars = []
    for char in line:
        # skip anything listed in string.punctuation
        if char in punctuation:
            continue
        kept_chars.append(char)
    # split() without arguments treats any run of whitespace as one boundary
    return ''.join(kept_chars).split()

In [37]:
def tokenizer(data):
    """Tokenize every line of every file in ``data``, in place.

    Parameters
    ----------
    data : dict[str, list]
        Maps file names to lists of lines. Each string line is replaced by
        the word list returned by ``tokenize``.

    Returns
    -------
    None
        ``data`` is modified in place.

    Lines that are not strings are left untouched, so running the cell a
    second time does not corrupt data that was tokenized before.
    """
    for content in data.values():
        for index, line in enumerate(content):
            # a non-string line was tokenized by an earlier call; passing it to
            # tokenize() again would glue its words together into one token
            if not isinstance(line, str):
                continue
            content[index] = tokenize(line)

In [38]:
# replace each line of `data` with its list of words (modifies `data` in place)
tokenizer(data)

In [39]:
def dependency_parsing(data):
    """Annotate every word of ``data`` with spaCy tag information.

    Each word is passed on its own to the module-level ``nlp`` pipeline and
    rendered as ``SURFACE_FORM: <word>`` followed by one
    `` ANALYSIS: <tag> <pos> <tag description>`` segment per token, each
    segment ending with a newline. The pieces belonging to one input line
    are concatenated into a single string.

    Parameters
    ----------
    data : dict[str, list[list[str]]]
        File name -> lines -> words.

    Returns
    -------
    dict[str, list[str]]
        File name -> one annotated string per input line that produced output.

    ``data`` itself is not modified, so the function can be called again on
    the same input.

    NOTE(review): despite the name, only ``token.tag_`` / ``token.pos_`` are
    emitted (no dependency labels), and each word is analysed without the
    rest of its sentence.
    """
    new_data = {}
    for file, content in data.items():
        annotated_lines = []
        for line in content:
            # collect the pieces and join once instead of repeated string +=
            parts = []
            for word in line:
                parts.append("SURFACE_FORM: " + word)
                doc = nlp(word)
                if len(doc) == 0:
                    # no tokens came back: mark the word and keep the record
                    # newline-terminated like every analysed one
                    parts.append(" (%s[NOT_ANALYZED])\n" % (word))
                    continue
                for token in doc:
                    # spacy.explain() returns None for tags missing from its glossary
                    description = spacy.explain(token.tag_) or "(no description)"
                    parts.append(" " + "ANALYSIS: " + token.tag_ + " " + token.pos_ + " " + description + "\n")
            new_line = "".join(parts)
            if new_line.strip() != "":
                annotated_lines.append(new_line)
        new_data[file] = annotated_lines
    return new_data


In [40]:
# annotate the tokenized data; merge() and export() below read this `new_data` global
new_data = (dependency_parsing(data))
# dump the whole result for inspection
print(new_data)

{'03.txt_prep.': ['SURFACE_FORM: Plato ANALYSIS: NNP PROPN noun, proper singular\nSURFACE_FORM: or ANALYSIS: CC CCONJ conjunction, coordinating\nSURFACE_FORM: a ANALYSIS: LS X list item marker\nSURFACE_FORM: was ANALYSIS: VBD AUX verb, past tense\nSURFACE_FORM: an ANALYSIS: LS X list item marker\nSURFACE_FORM: Athenian ANALYSIS: NNP PROPN noun, proper singular\nSURFACE_FORM: philosopher ANALYSIS: NN NOUN noun, singular or mass\nSURFACE_FORM: during ANALYSIS: IN ADP conjunction, subordinating or preposition\nSURFACE_FORM: the ANALYSIS: DT DET determiner\nSURFACE_FORM: Classical ANALYSIS: JJ ADJ adjective\nSURFACE_FORM: period ANALYSIS: NN NOUN noun, singular or mass\nSURFACE_FORM: in ANALYSIS: IN ADP conjunction, subordinating or preposition\nSURFACE_FORM: Ancient ANALYSIS: JJ ADJ adjective\nSURFACE_FORM: Greece ANALYSIS: NNP PROPN noun, proper singular\nSURFACE_FORM: founder ANALYSIS: NN NOUN noun, singular or mass\nSURFACE_FORM: of ANALYSIS: IN ADP conjunction, subordinating or prepos

### e) merge the instances (i.e. words into lines)

In [41]:
def merge(data):
    """Collapse each entry of the global ``new_data`` into one string, in place.

    NOTE(review): the ``data`` argument is never read; the function iterates
    the module-level ``new_data`` instead. Confirm which one was intended.

    Returns None.
    """
    for lines in new_data.values():
        for position in range(len(lines)):
            # overwrite the slot with the concatenation of its pieces
            lines[position] = ''.join(lines[position])

In [42]:
# merge() operates on `new_data`, so pass that object rather than the tokenized input `data`
merge(new_data)

### f) export the results

In [43]:
def export(data):
    """Write each entry of the global ``new_data`` to its own output file.

    For every file name in ``new_data`` the file
    ``output_path + file_name + '_dep_pars.'`` is created or overwritten, and
    each element of the entry is written followed by an extra newline.

    NOTE(review): the ``data`` argument is never read; the function iterates
    the module-level ``new_data`` instead. Confirm which one was intended.

    Returns None.
    """
    for file_name, content in new_data.items():
        # the context manager closes (and flushes) the file even if a write fails
        with open(output_path + file_name + '_dep_pars.', 'w') as output_file:
            for line in content:
                output_file.write(line + '\n')

In [44]:
# export() writes `new_data`, so pass that object rather than the tokenized input `data`
export(new_data)