In [13]:
### This program is a very simple lemmatizer, which learns a
### lemmatization function from an annotated corpus. The function is
### so basic I wouldn't even consider it machine learning: it's
### basically just a big lookup table, which maps every word form
### attested in the training data to the most common lemma associated
### with that form. At test time, the program checks if a form is in
### the lookup table, and if so, it gives the associated lemma; if the
### form is not in the lookup table, it gives the form itself as the
### lemma (identity mapping).

### The program performs training and testing in one run: it reads the
### training data, learns the lookup table and keeps it in memory,
### then reads the test data, runs the testing, and reports the
### results.

### The program takes two command line arguments, which are the paths
### to the training and test files. Both files are assumed to be
### already tokenized, in Universal Dependencies format, that is: each
### token on a separate line, each line consisting of fields separated
### by tab characters, with word form in the second field, and lemma
### in the third field. Tab characters are assumed to occur only in
### lines corresponding to tokens; other lines are ignored.

import sys
import re

### Global variables

# Paths for data are read from command line
#train_file = sys.argv[1]
#test_file = sys.argv[2]
train_file = './data/hi_hdtb-ud-train.conllu'
test_file = './data/hi_hdtb-ud-test.conllu'

# Counters for lemmas in the training data: word form -> lemma -> count
lemma_count = {}
duplicated_lemma_list, lemma_list = {}, {}
duplicated_form_list, form_list = {}, {}

# Lookup table learned from the training data: word form -> lemma
lemma_max = {}

# Variables for reporting results
training_stats = ['Wordform types' , 'Wordform tokens' , 'Unambiguous types' , 'Unambiguous tokens' , 'Ambiguous types' , 'Ambiguous tokens' , 'Ambiguous most common tokens' , 'Identity tokens']
training_counts = dict.fromkeys(training_stats , 0)

test_outcomes = ['Total test items' , 'Found in lookup table' , 'Lookup match' , 'Lookup mismatch' , 'Not found in lookup table' , 'Identity match' , 'Identity mismatch']
test_counts = dict.fromkeys(test_outcomes , 0)

accuracies = {}

### Training: read training data and populate lemma counters

train_data = open (train_file , 'r')
#test_data = open (test_file , 'r')

for line in train_data:
    
    # Tab character identifies lines containing tokens
    if re.search ('\t' , line):

        # Tokens represented as tab-separated fields
        field = line.strip().split('\t')

        # Word form in second field, lemma in third field
        form = field[1]
        lemma = field[2]
        
        # Form -> lemma
        if form not in duplicated_form_list: # Add new form
            duplicated_form_list[form] = [lemma]
        else:
            duplicated_form_list[form].append(lemma)
        
        # Lemma -> form
        if lemma not in duplicated_lemma_list:  # Add new lemma
            duplicated_lemma_list[lemma] = [form]
        else:
            duplicated_lemma_list[lemma].append(form)
            
        # Form -> lemma (no duplicates)
        if form not in form_list: # Add new form
            form_list[form] = [lemma]
        else:
            if lemma not in form_list[form]: # For a lemma if form -> lemma exist do not add
                form_list[form].append(lemma)
        
        # Lemma -> form (no duplicates)
        if lemma not in lemma_list: # Add new lemma
            lemma_list[lemma] = [form]
        else:
            if form not in lemma_list[lemma]: # For a form if lemma -> form exist do not add
                lemma_list[lemma].append(form)
            
plural_types = [key for key, value in form_list.items() if len(value) > 1] # Get forms which have plural tokens
singular_types = [key for key, value in form_list.items() if len(value) == 1] # Get forms which have singular tokens

all_tokens = 0
for value in duplicated_lemma_list.values():
    all_tokens = all_tokens + len(value)

singular_tokens = 0
for types in singular_types:
    if types in duplicated_form_list:
        singular_tokens = singular_tokens + len(duplicated_form_list[types])

plural_tokens = 0
duplicated_ambiguous_tokens = {}
ambiguous_tokens = {}
for types in plural_types:
    if types in duplicated_form_list:
        plural_tokens = plural_tokens + len(duplicated_form_list[types])
        duplicated_ambiguous_tokens[types] = duplicated_form_list[types]
        ambiguous_tokens[types] = form_list[types]

most_common_tokens = {}
for key, values in duplicated_form_list.items():
    token_frequency = {}
    for value in values:
        if value not in token_frequency:
            token_frequency[value] = 0
        else:
            token_frequency[value] += 1
    most_common_tokens[key] = max(token_frequency, key = token_frequency.get)

ambiguous_most_common = {}
common_token_count = 0
for key, value in most_common_tokens.items():
    for keys in duplicated_ambiguous_tokens.keys():
        if key == keys:
            values = duplicated_ambiguous_tokens[keys]
            #print()
            for i in range(len(values)):
                if value == values[i]:
                    #print(str(len(values)) + '중 ' + str(i + 1) + '번째 비교 : '+ key + ' : ' + value + ' / ' + values[i] + 'MATCH!!!!   token : ' + value)
                    ambiguous_most_common[key] = value
                    common_token_count = common_token_count + 1
                    #print(ambiguous_most_common)
                #print(str(len(values)) + '중 ' + str(i + 1) + '번째 비교 : '+ key + ' : ' + value + ' / ' + values[i])

training_counts['Wordform types'] = len(form_list.keys())
training_counts['Wordform tokens'] = all_tokens
training_counts['Unambiguous types'] = len(form_list.keys()) - len(plural_types)
training_counts['Unambiguous tokens'] = singular_tokens
training_counts['Ambiguous types'] = len(plural_types)
training_counts['Ambiguous tokens'] = plural_tokens
training_counts['Ambiguous most common tokens'] = common_token_count


        ######################################################
        ### Insert code for populating the lemma counts    ###
        ######################################################

{'Wordform types': 16879, 'Wordform tokens': 281057, 'Unambiguous types': 16465, 'Unambiguous tokens': 196204, 'Ambiguous types': 414, 'Ambiguous tokens': 84853, 'Ambiguous most common tokens': 75667, 'Identity tokens': 0}


In [None]:

rev_dict = {}

for key, value in lemma_count.items():
    if value not in rev_dict:
        rev_dict[value] = [key]
    else:
        rev_dict[value].append(key)

print(len(rev_dict))

result = [key for key, values in rev_dict.items() if len(values) > 1]
print(len(result))

In [None]:
a = {'a' : 10, 'b' : [20, 30]}
print(len(a))
print(len(a.keys()))
print(len(a.values()))

In [None]:
a = {'a' : 10, 'b' : 20, 'c' : 30, 'd' : 40}
b = {'a' : [10, 20, 30]}

for key, value in a.items():
    for keys in b.keys():
        if key == keys:
            for i in range(len(b[keys])):
                if value == values[i]:
                    print(value)