# Tutorial 

## Part-III

In this live session we will **(a)** reorganize the input by tokenizing it into analyses, **(b)** parse each analysis into its root and a stream of affixes (another level of tokenization), **(c)** parse the stream into separate morphemes (yet another level), **(d)** extract a set of morphemes and **(e)** export the results in *csv* format.

### * imports

In [None]:
import glob
from pprint import pprint

### ** globals

In [None]:
# Directory prefixes for reading input and writing output. The code in this
# file builds paths by plain string concatenation (e.g. input_path+'*.txt'),
# so both values need their trailing '/'.
input_path = "./in/"
output_path = "./out/"

### *** read input

In [None]:
def read_and_store_input(input_path):
    """Load the non-blank lines of every ``*.txt`` file under *input_path*.

    Args:
        input_path: Directory prefix that is concatenated with ``'*.txt'`` to
            build the glob pattern, so it is expected to end with a path
            separator (e.g. ``"./in/"``).

    Returns:
        A dict mapping each file's base name to the list of its lines whose
        stripped form is non-empty. Lines are stored unmodified, i.e. they
        keep their trailing newline.
    """
    import os  # local import: only ``glob`` is imported at file level

    data = {}
    for file_path in glob.glob(input_path + '*.txt'):
        # basename() honours the platform separator; splitting on '/' alone
        # leaves the directory in the key when glob returns backslash paths.
        file_name = os.path.basename(file_path)
        # The context manager closes the handle even if reading fails.
        with open(file_path, 'r') as content:
            data[file_name] = [line for line in content if line.strip() != ""]
    return data

In [None]:
# Load the non-blank lines of every input file, keyed by file name.
data = read_and_store_input(input_path)

## database

**current**: 
```
    {   
        file_1:[line_1, ..., line_n],
        ...,
        file_m:[line_1, ..., line_k]
    }
```

**envisioned**:
```
    {
        file_name: {
                        word_index: {
                                        analysis_index: {
                                                            'root'      :root,
                                                            'affixes'   :[affixes],
                                                            'pos'       :pos_tag,
                                                            'is_proper' :bool
                                                        }
                                    }
                    }      
    }
```

### (a-1) tokenize lines into streams of analyses 
Note that each stream corresponds to a word in the original data; the associated **delimiter** is `whitespace` (the code calls `str.split()` without an argument, which splits on any run of whitespace, not only a single `' '`).


In [None]:
def tokenizer_l2w(data):
    """Replace each file's list of lines with an ``index -> word`` mapping.

    'l' stands for line, 'w' for word. Words are obtained with
    ``str.split()`` (any run of whitespace separates two words) and are
    numbered consecutively from 0 across all lines of a file.

    Args:
        data: dict mapping a file name to an iterable of text lines.

    Returns:
        None. ``data`` is updated in place.
    """
    for file_name in data:
        # Flatten all lines of the file into a single stream of words.
        tokens = (token for line in data[file_name] for token in line.split())
        # Re-assigning an existing key is safe while iterating the dict.
        data[file_name] = dict(enumerate(tokens))

In [None]:
# Turn each file's lines into an index -> word mapping (updates data in place).
tokenizer_l2w(data)

### (a-2) tokenize streams into individual analyses
associated **delimiter** is a `forward slash`, or, `'/'`.

In [None]:
def tokenizer_w2a(data):
    """Split every word string into an ``index -> analysis`` mapping.

    'w' stands for word, 'a' for analyses. The alternative analyses of a
    word are separated by a forward slash.

    Args:
        data: dict of ``file name -> {word index -> word string}``.

    Returns:
        None. Each word string in ``data`` is replaced in place.
    """
    for words in data.values():
        for position in words:
            # Overwriting the value of an existing key does not resize the
            # dict, so it is safe during iteration.
            words[position] = dict(enumerate(words[position].split('/')))

In [None]:
# Split each word into its '/'-separated analyses (updates data in place).
tokenizer_w2a(data)

In [None]:
# Peek at a single entry of the first file only (hence the break): the
# analysis at index 1 of the word at index 4. Both indices are hard-coded, so
# a file with fewer words or analyses raises KeyError.
for key, values in data.items():
    print(values[4][1])
    break

### (b-1) tokenize each analysis into a tuple: (morphology, is_proper)
The associated **delimiter** is a `complex custom string`, in this case `'+[Proper='`. For the reason behind this, please refer to the manual of the morphological analyzer used in tutorial part II ([here](https://github.com/google-research/turkish-morphology)).

In [None]:
def tokenizer_a2t(data):
    """Turn every analysis string into a ``(morphology, is_proper)`` tuple.

    'a' stands for analysis, 't' for tuple. The string is split on the
    marker ``'+[Proper='``; the text after it, with ``']'`` stripped, must be
    ``'True'`` or ``'False'`` and is converted to the matching bool.

    Args:
        data: dict of ``file name -> {word index -> {analysis index -> str}}``.

    Returns:
        None. Each analysis string in ``data`` is replaced in place.

    Raises:
        ValueError: if the marker does not occur exactly once (unpacking
            fails) or the flag text is neither ``'True'`` nor ``'False'``.
    """
    flags = {'False': False, 'True': True}
    for words in data.values():
        for analyses in words.values():
            for position in analyses:
                morphology, flag_text = analyses[position].split('+[Proper=')
                flag_text = flag_text.strip(']')
                if flag_text not in flags:
                    raise ValueError('unexpected value for var=is_proper')
                # Overwrite in place; the key set is unchanged.
                analyses[position] = (morphology, flags[flag_text])

In [None]:
# Replace each analysis string with a (morphology, is_proper) tuple, in place.
tokenizer_a2t(data)

In [None]:
# Same peek as before, now showing the (morphology, is_proper) tuple stored
# for word 4 / analysis 1 of the first file. The indices are hard-coded and
# raise KeyError if that entry does not exist.
for key, values in data.items():
    print(values[4][1])
    break

### (b-2, c) tokenize morphology into roots, affixes and PoS tags
The associated **delimiters** are the `plus`, `minus` and `opening square bracket` characters, or, `'+'`, `'-'` and `'['` respectively.

In [None]:
def init_values():
    """Return a fresh analysis record with every field set to ``None``.

    The keys are, in order, ``'root'``, ``'affixes'``, ``'pos'`` and
    ``'is_proper'``.
    """
    return dict.fromkeys(('root', 'affixes', 'pos', 'is_proper'))

def tokenizer_m2c(data):
    """Replace every ``(morphology, is_proper)`` tuple with a record dict.

    'm' stands for morphology, 'c' for category. The morphology string is
    stripped of parentheses and split on ``'+'`` and ``'-'``. The first piece
    holds the root and, inside square brackets, its PoS tag; any remaining
    pieces are kept as the list of affixes (``None`` when there are none).

    Args:
        data: dict of ``file name -> {word index -> {analysis index ->
            (morphology, is_proper)}}``.

    Returns:
        None. Each tuple in ``data`` is replaced in place by a dict with the
        keys produced by ``init_values``.

    Raises:
        IndexError: if the first piece contains no ``'['``.
    """
    for words in data.values():
        for analyses in words.values():
            for position in analyses:
                morphology, is_proper = analyses[position]
                # Dropping the parentheses discards the grouping information.
                flat = morphology.replace('(', '').replace(')', '')
                # '+' and '-' both separate morphemes: unify them, split once.
                head, *tail = flat.replace('-', '+').split('+')
                pieces = head.split('[')
                record = init_values()
                record.update(
                    root=pieces[0],
                    affixes=tail or None,
                    pos=pieces[1].strip(']'),
                    is_proper=is_proper,
                )
                # Overwrite in place; the key set is unchanged.
                analyses[position] = record

In [None]:
# Replace each (morphology, is_proper) tuple with a root/affixes/pos/is_proper
# dict, in place.
tokenizer_m2c(data)

In [None]:
# Pretty-print the record stored for word 4 / analysis 1 of the first file
# (the break stops after that file), then its 'affixes' entry on its own.
# The indices are hard-coded and raise KeyError if that entry does not exist.
for key, values in data.items():
    pprint(values[4][1])
    print('\n')
    pprint(values[4][1]['affixes'])
    break


In [None]:
# Display the complete nested structure for every file.
pprint(data)

### (d) extract a set of roots and affixes

In [None]:
def catalog(data):
    """Collect the distinct roots and affixes found in *data*.

    Args:
        data: dict of ``file name -> {word index -> {analysis index ->
            record}}`` where each record has a ``'root'`` entry and an
            ``'affixes'`` entry (a list, or ``None``/empty when the analysis
            yielded no affixes).

    Returns:
        ``{'roots': [...], 'affixes': [...]}`` with each list free of
        duplicates and ordered by first occurrence.
    """
    # Dict keys give O(1) membership tests while keeping insertion order;
    # the previous ``x not in list`` check rescanned the list for every item.
    roots = {}
    affixes = {}
    for words in data.values():
        for word in words.values():
            for analysis in word.values():
                roots.setdefault(analysis['root'])
                # ``or ()`` covers analyses without any affixes.
                for affix in analysis['affixes'] or ():
                    affixes.setdefault(affix)
    return {'roots': list(roots), 'affixes': list(affixes)}

In [None]:
# Build the lists of distinct roots and affixes found across all files.
cat = catalog(data)

In [None]:
# Display the collected roots and affixes.
pprint(cat)

### (e-1) export the data in *csv* format
Let the fields be `'file_index'`, `'word_index'`, `'analysis_index'`, `'root'`, `'pos'`, `'is_prop'`, followed by one `'afx:<affix>'` column for each affix in the catalog.

In [None]:
# Column names for the CSV export: six fixed columns followed by one
# 'afx:<affix>' column per affix in the catalog.
fields = ['file_index', 'word_index', 'analysis_index', 'root', 'pos', 'is_prop'] + ['afx:'+affix for affix in cat['affixes']]

In [None]:
# Display the column names.
pprint(fields)

In [None]:
def to_csv(data, fields, affix_cats, output_path):
    """Write one CSV row per analysis to ``<output_path>data.csv``.

    Each row holds the file index (the part of the file name before the first
    ``'_'``), the word index, the analysis index, the root, the PoS tag, the
    proper-noun flag as 0/1, and then one 0/1 indicator per entry of
    *affix_cats* telling whether the analysis contains that affix.

    Args:
        data: dict of ``file name -> {word index -> {analysis index ->
            record}}`` with ``'root'``, ``'pos'``, ``'is_proper'`` and
            ``'affixes'`` entries in each record.
        fields: column names, written once as the header row.
        affix_cats: affixes that each get an indicator column, in order.
        output_path: directory prefix, concatenated with ``'data.csv'``.

    Returns:
        None.
    """
    import csv  # local import: the csv module is not imported at file level

    # newline='' is what the csv module expects of its file object; the
    # context manager closes the file even if a row fails to serialize.
    with open(output_path + 'data.csv', 'w', newline='') as handle:
        writer = csv.writer(handle, lineterminator='\n')
        # A single header for the whole file: the old code repeated it for
        # every input file, leaving header rows in the middle of the data.
        writer.writerow(fields)
        for file_name, content in data.items():
            f_index = file_name.split('_')[0]
            for w_index, word in content.items():
                for a_index, analysis in word.items():
                    # ``or ()`` covers analyses without any affixes.
                    present = set(analysis['affixes'] or ())
                    row = [
                        f_index,
                        w_index,
                        a_index,
                        analysis['root'],
                        analysis['pos'],
                        int(analysis['is_proper']),
                    ]
                    row.extend(int(affix in present) for affix in affix_cats)
                    # writerow() adds no trailing comma, so every row has
                    # exactly as many columns as the header.
                    writer.writerow(row)

In [None]:
# Export every analysis as a row of data.csv under output_path.
to_csv(data, fields, cat['affixes'], output_path)

### (e-2) export the catalog into two files 

In [None]:
def to_catalogs(catalog, output_path):
    """Write the roots and the affixes of *catalog* to two text files.

    ``catalog['roots']`` goes to ``<output_path>root_catalog.txt`` and
    ``catalog['affixes']`` to ``<output_path>affix_catalog.txt``, one entry
    per line.

    Args:
        catalog: dict with ``'roots'`` and ``'affixes'`` lists of strings.
        output_path: directory prefix, concatenated with the file names.

    Returns:
        None.
    """
    targets = (('root_catalog.txt', 'roots'), ('affix_catalog.txt', 'affixes'))
    for file_name, key in targets:
        # The context manager closes the file even if a write raises; the
        # explicit close() calls were skipped in that case.
        with open(output_path + file_name, 'w') as handle:
            handle.writelines(entry + '\n' for entry in catalog[key])

In [None]:
# Write the root and affix catalogs to text files under output_path.
to_catalogs(cat, output_path)