# Tutorial 

## Part-II

In this live session we will **(a)** tokenize the input into words, **(b)** morphologically analyze each instance (i.e. each word), **(c)** merge the analyzed instances back into the scope of the original data points (i.e. lines) and **(d)** export the results accordingly. The input will be the output of Part-I (Turkish).


### * imports


In [None]:
# for reading file names in a directory
import glob
from string import punctuation
import subprocess

### ** globals

In [None]:
# directory that is scanned for the input '*.txt' files
# (the trailing slash is required: paths are built by plain string concatenation)
input_path = "./in/"
# directory the '<name>_ma.<ext>' result files are written to
# (the trailing slash is required here as well, for the same reason)
output_path = "./out/"

### *** read input

In [None]:
def read_and_store_input(input_path):
    """Read every '*.txt' file under ``input_path`` into a dictionary.

    Args:
        input_path: directory prefix that is concatenated with '*.txt', so it
            has to end with a path separator (e.g. "./in/").

    Returns:
        A dict that maps each file name (without its directory) to the list of
        its non-blank lines. Lines are stored as read, i.e. including their
        trailing newline.
    """
    # local import so that the function does not depend on other cells
    import os

    # initialize dictionary
    data = {}
    # read file names from the input directory; sorted, because glob gives no
    # ordering guarantee and a stable order makes runs reproducible
    input_files = sorted(glob.glob(input_path + '*.txt'))
    for file_path in input_files:
        # key is the bare file name, independent of the platform's separator
        file_name = os.path.basename(file_path)
        # the context manager closes the file even if reading fails
        with open(file_path, 'r') as content:
            # keep only the lines that have content
            data[file_name] = [line for line in content if line.strip() != ""]
    return data

In [None]:
# load the non-blank lines of every '*.txt' file under input_path, keyed by file name
data = read_and_store_input(input_path)

### (a) tokenize each line into words

In [None]:
def tokenize(line):
    """Split a line of text into words.

    Every character listed in ``string.punctuation`` is dropped first, then the
    remainder is split on runs of white space. Returns the list of words (an
    empty list when nothing is left).
    """
    # note: dropping punctuation also drops the sentence boundary information;
    # what could be a better solution?
    without_punctuation = line.translate(str.maketrans('', '', punctuation))
    # every white space is assumed to be a word boundary
    return without_punctuation.split()

def tokenizer(data):
    """Tokenize every stored line of every file.

    ``data`` maps a file name to a list of lines. Each list is updated in situ
    so that every line is replaced by the list of words ``tokenize`` returns
    for it. Nothing is returned.
    """
    for lines in data.values():
        # slice assignment keeps the very same list object, i.e. updates in situ
        lines[:] = [tokenize(text) for text in lines]


In [None]:
# replace every stored line with its list of word tokens (updates data in place)
tokenizer(data)

### (b) morphological analysis

For this part, we use the two-level [morphological analyzer](https://github.com/google-research/turkish-morphology) from google-research.

In [None]:
def analyzer(word, directory="/home/gorgo/Tools/turkish-morphology-master/"):
    """Run the third party morphological analyzer (MA) on a single word.

    Args:
        word: the surface form to analyze.
        directory: root directory of the third party MA; the command is
            executed with this directory as its working directory.

    Returns:
        The second line of what the command prints to standard output, or an
        empty string when it printed fewer than two lines (callers treat an
        empty result as "no analysis").
        NOTE(review): presumably the first output line is a header and the
        second one is the first analysis -- confirm against the MA's output.
    """
    # the command that calls the MA on the target word; built as a list so the
    # word always stays one single argument
    command = [
        "bazel", "run", "-c", "opt", "scripts:print_analyses",
        "--", "--word=%s" % (word),
    ]
    completed = subprocess.run(command, capture_output=True, cwd=directory)
    output_lines = completed.stdout.decode().split('\n')
    # no second line (e.g. the command failed and printed nothing): report an
    # empty analysis instead of raising IndexError
    if len(output_lines) < 2:
        return ""
    return output_lines[1]

def analyze(data):
    """Replace every word in ``data`` with its morphological analysis.

    ``data`` maps a file name to a list of lines, each line being a list of
    words. Every word is passed to ``analyzer``; when that returns nothing the
    word is marked as "(<word>[NON_WORD])". The surface form and the analysis
    are printed, and the word lists are updated in situ.
    """
    for lines in data.values():
        for words in lines:
            for position, surface_form in enumerate(words):
                print("SURFACE_FORM: %s" %(surface_form))
                # fall back to the NON_WORD marker when the analysis is empty
                result = analyzer(surface_form) or "(%s[NON_WORD])" %(surface_form)
                print("ANALYSIS: %s\n" %(result))
                # update in situ
                words[position] = result

In [None]:
# replace every token with its analysis (updates data in place);
# the external analyzer is called once per token
analyze(data)

In [None]:
# let's see what we have: one row per word, with 1-based line and word numbers
for content in data.values():
    for line_no, line in enumerate(content, start=1):
        for word_no, word in enumerate(line, start=1):
            print("LINE:%s WORD:%s --> %s\n" %(line_no, word_no, word))


### (c) merge the instances (i.e. words into lines)

In [None]:
def merge(data):
    """Join the words of every line back into one string.

    ``data`` maps a file name to a list of lines, each line being a list of
    strings. Each line is replaced by its items joined with a single space.
    The lists are updated in situ; nothing is returned.
    """
    for lines in data.values():
        # slice assignment keeps the very same list object, i.e. updates in situ
        lines[:] = [' '.join(items) for items in lines]

In [None]:
# join the analyses of each line back into one space-separated string (updates data in place)
merge(data)

### (d) export the results

In [None]:
def export(data, out_dir=None):
    """Write the content of every file in ``data`` to its own output file.

    For an input named '<name>.<ext>' the output file is '<name>_ma.<ext>';
    only the last dot separates the extension, and a name without any dot
    becomes '<name>_ma'. Every stored line is written followed by a newline.

    Args:
        data: dict that maps a file name to a list of strings (lines).
        out_dir: directory prefix the output file name is appended to, so it
            has to end with a path separator. Defaults to the module level
            ``output_path``.
    """
    if out_dir is None:
        out_dir = output_path
    for file_name, content in data.items():
        # split at the LAST dot so that names such as 'a.b.txt' work as well
        stem, dot, ext = file_name.rpartition('.')
        if not dot:
            # no extension at all: rpartition puts the whole name into `ext`
            stem, ext = file_name, ''
        # one output file per input file; the context manager closes it even
        # if writing fails
        with open(out_dir + stem + '_ma' + dot + ext, 'w') as output_file:
            output_file.writelines(line + '\n' for line in content)

In [None]:
# write the merged analyses to '<name>_ma.<ext>' under output_path
export(data)