In [8]:
### This program is a very simple lemmatizer, which learns a
### lemmatization function from an annotated corpus. The function is
### so basic I wouldn't even consider it machine learning: it's
### basically just a big lookup table, which maps every word form
### attested in the training data to the most common lemma associated
### with that form. At test time, the program checks if a form is in
### the lookup table, and if so, it gives the associated lemma; if the
### form is not in the lookup table, it gives the form itself as the
### lemma (identity mapping).

### The program performs training and testing in one run: it reads the
### training data, learns the lookup table and keeps it in memory,
### then reads the test data, runs the testing, and reports the
### results.

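### The program was written to take two command line arguments, the
### paths to the training and test files; in this version the
### command-line reads are commented out and the two paths are
### hard-coded in the global variables. Both files are assumed to be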
### already tokenized, in Universal Dependencies format, that is: each
### token on a separate line, each line consisting of fields separated
### by tab characters, with word form in the second field, and lemma
### in the third field. Tab characters are assumed to occur only in
### lines corresponding to tokens; other lines are ignored.

import operator
import sys
import re

### Global variables

# Input corpora. The command-line variant is kept for reference; the
# paths actually used are the fixed ones below.
#train_file = sys.argv[1]
#test_file = sys.argv[2]
train_file = './data/hi_hdtb-ud-train.conllu'
test_file = './data/hi_hdtb-ud-test.conllu'

# Lemma frequencies from the training data: word form -> lemma -> count
lemma_count = dict()

# Lookup table derived from those frequencies: word form -> lemma
lemma_max = dict()

# Running totals accumulated while processing the training data
wordform_tokens = 0
unambiguous_types = 0
unambiguous_tokens = 0
ambiguous_types = 0
ambiguous_tokens = 0
ambiguous_most_common_tokens = 0
identity_tokens = 0

# Labels of the training statistics, in reporting order, each paired
# with a counter that starts at zero
training_stats = [
    'Wordform types',
    'Wordform tokens',
    'Unambiguous types',
    'Unambiguous tokens',
    'Ambiguous types',
    'Ambiguous tokens',
    'Ambiguous most common tokens',
    'Identity tokens',
]
training_counts = {label: 0 for label in training_stats}

# Labels of the test outcomes, in reporting order, each paired with a
# counter that starts at zero
test_outcomes = [
    'Total test items',
    'Found in lookup table',
    'Lookup match',
    'Lookup mismatch',
    'Not found in lookup table',
    'Identity match',
    'Identity mismatch',
]
test_counts = {label: 0 for label in test_outcomes}

# Accuracy figures, keyed by model name; filled in later
accuracies = dict()

### Training: read training data and populate lemma counters

# CoNLL-U files are UTF-8 by specification, so decode explicitly instead
# of relying on the platform default. The with-block guarantees the file
# is closed as soon as the counts have been collected.
with open(train_file, 'r', encoding='utf-8') as train_data:

    for line in train_data:

        # Tab character identifies lines containing tokens; skip the rest
        if '\t' not in line:
            continue

        # Tokens represented as tab-separated fields
        field = line.strip().split('\t')

        # Word form in second field, lemma in third field
        form = field[1]
        lemma = field[2]
        wordform_tokens += 1

        # Count this form/lemma pairing. The inner dictionary is created
        # the first time a form is seen, and lemmas keep their order of
        # first appearance.
        form_lemmas = lemma_count.setdefault(form, {})
        form_lemmas[lemma] = form_lemmas.get(lemma, 0) + 1

### Model building and training statistics

# For every word form seen in training, choose its most frequent lemma
# for the lookup table and update the training statistics.
for form, counts in lemma_count.items():

    # max() returns the first maximal entry, so a tie goes to the lemma
    # that was inserted into the counter first.
    best_lemma = max(counts.items(), key=operator.itemgetter(1))[0]
    lemma_max[form] = best_lemma

    # Number of training tokens that had this word form
    form_total = sum(counts.values())

    if len(counts) == 1:
        # Only one lemma was ever seen with this form
        unambiguous_types += 1
        unambiguous_tokens += form_total
    else:
        # Several lemmas compete; only the tokens carrying the most
        # common one would be lemmatized correctly by the lookup table.
        ambiguous_types += 1
        ambiguous_tokens += form_total
        ambiguous_most_common_tokens += counts[best_lemma]

    # Tokens whose lemma is identical to the word form itself
    identity_tokens += counts.get(form, 0)

# Fill out the data in training_counts Dict.
training_counts.update({
    'Wordform types': len(lemma_count),
    'Wordform tokens': wordform_tokens,
    'Unambiguous types': unambiguous_types,
    'Unambiguous tokens': unambiguous_tokens,
    'Ambiguous types': ambiguous_types,
    'Ambiguous tokens': ambiguous_tokens,
    'Ambiguous most common tokens': ambiguous_most_common_tokens,
    'Identity tokens': identity_tokens,
})

# Expected accuracies on the training data itself. With no training
# tokens the ratios are undefined, so report NaN instead of raising
# ZeroDivisionError.
accuracies['Expected lookup'] = (
    (unambiguous_tokens + ambiguous_most_common_tokens) / wordform_tokens
    if wordform_tokens else float('nan')
)
accuracies['Expected identity'] = (
    identity_tokens / wordform_tokens
    if wordform_tokens else float('nan')
)

### Testing: read test data, and compare lemmatizer output to actual lemma

# Counters for this pass. They use their own names so that the totals
# gathered during training are left intact.
test_tokens = 0
lookup_match, lookup_mismatch = 0, 0
identity_match, identity_mismatch = 0, 0

# CoNLL-U files are UTF-8 by specification, so decode explicitly; the
# with-block closes the file once the pass is finished.
with open(test_file, 'r', encoding='utf-8') as test_data:

    for line in test_data:

        # Tab character identifies lines containing tokens; skip the rest
        if '\t' not in line:
            continue

        # Tokens represented as tab-separated fields
        field = line.strip().split('\t')

        # Word form in second field, lemma in third field
        form = field[1]
        lemma = field[2]
        test_tokens += 1

        if form in lemma_max:
            # Known form: the prediction is the lemma in the lookup table
            if lemma == lemma_max[form]:
                lookup_match += 1
            else:
                lookup_mismatch += 1
        elif form == lemma:
            # Unknown form: the prediction is the form itself
            identity_match += 1
        else:
            identity_mismatch += 1

# Tokens handled by each of the two strategies
lookup_total = lookup_match + lookup_mismatch
identity_total = identity_match + identity_mismatch

test_counts['Total test items'] = test_tokens
test_counts['Found in lookup table'] = lookup_total
test_counts['Lookup match'] = lookup_match
test_counts['Lookup mismatch'] = lookup_mismatch
test_counts['Not found in lookup table'] = identity_total
test_counts['Identity match'] = identity_match
test_counts['Identity mismatch'] = identity_mismatch

# An accuracy is undefined when its strategy handled no tokens (for
# example, every test form was seen in training), so report NaN in that
# case instead of raising ZeroDivisionError.
accuracies['Lookup'] = (
    lookup_match / lookup_total if lookup_total else float('nan')
)
accuracies['Identity'] = (
    identity_match / identity_total if identity_total else float('nan')
)
accuracies['Overall'] = (
    (lookup_match + identity_match) / (lookup_total + identity_total)
    if (lookup_total + identity_total) else float('nan')
)

### Report training statistics and test results

# Write the report inside a with-block so the file is closed, and its
# contents flushed to disk, as soon as the report is complete. Everything
# written is ASCII, so the explicit encoding does not change the bytes.
with open('lookup-output.txt', 'w', encoding='utf-8') as output:

    output.write('Training statistics\n')

    for stat in training_stats:
        output.write(stat + ': ' + str(training_counts[stat]) + '\n')

    for model in ['Expected lookup', 'Expected identity']:
        output.write(model + ' accuracy: ' + str(accuracies[model]) + '\n')

    output.write('Test results\n')

    for outcome in test_outcomes:
        output.write(outcome + ': ' + str(test_counts[outcome]) + '\n')

    for model in ['Lookup', 'Identity', 'Overall']:
        output.write(model + ' accuracy: ' + str(accuracies[model]) + '\n')

# Echo the same figures to standard output
print(training_counts)
print(test_counts)

for model in ['Expected lookup', 'Expected identity', 'Lookup', 'Identity', 'Overall']:
    print(model + ' accuracy: ' + str(accuracies[model]))

{'Wordform types': 16879, 'Wordform tokens': 281057, 'Unambiguous types': 16465, 'Unambiguous tokens': 196204, 'Ambiguous types': 414, 'Ambiguous tokens': 84853, 'Ambiguous most common tokens': 75667, 'Identity tokens': 201485}
{'Total test items': 35430, 'Found in lookup table': 35430, 'Lookup match': 33855, 'Lookup mismatch': 1575, 'Not found in lookup table': 0, 'Identity match': 1227, 'Identity mismatch': 354}
Expected lookup accuracy: 0.9673162383431119
Expected identity accuracy: 0.7168830521922599
Lookup accuracy: 0.9639280333244704
Identity accuracy: 0.7760910815939279
Overall accuracy: 0.9555461473327689


In [None]:
# Show the training statistics dictionary (training_counts must already
# have been defined by an earlier cell).
print(training_counts)

In [None]:
# Scratch cell: add up every value stored in the inner dictionaries of a
# nested dictionary, printing each value on the way and the total last.
people = {1: {'name': 22, 'age': 27, 'sex': 21}}

ages = 0
for record in people.values():
    for amount in record.values():
        print(amount)
        ages += amount

print(ages)

In [None]:
# Scratch cell: print every inner dictionary that has exactly one entry,
# followed by a marker line.
people = {
    'John': {'name': 'John'},
    'James': {'name': 'Marie', 'temp': 'temporary'},
    'Smith': {'name': 'Luna'},
}

for record in people.values():
    if len(record) != 1:
        continue
    print(record)
    print('len = 1')

In [None]:
# Scratch cell: print the records stored under the integer keys produced
# by range(1, 3), i.e. keys 1 and 2, when they are present.
people = {
    1: {'name': 'John', 'age': '27', 'sex': 'Male'},
    2: {'name': 'Marie', 'age': '22', 'sex': 'Female'},
    3: {'name': 'Luna', 'age': '24', 'sex': 'Female', 'married': 'No'},
}

for person_id in range(1, 3):
    if person_id not in people:
        continue
    print(people[person_id])

In [None]:
# Scratch cell: membership test on the outer keys of a nested dictionary.
people = {
    'John': {'name': 'John'},
    'James': {'name': 'Marie', 'temp': 'temporary'},
    'Smith': {'name': 'Luna'},
}

# The `in` operator checks the dictionary's keys
if not ('Smith' not in people):
    print('Smith')