# Kaggle Competition Notebook

### Import necessary libraries

In [10]:
import math
import pandas as pd
import numpy as np
import re
import time
import unicodedata as ud

### Prep data for N Gram Model

In [11]:
def prepare_data(filename, N):
    """Read a whitespace-tokenised corpus and pad every line for an N-gram model.

    Each line of *filename* is stripped and split on whitespace, then
    ``N - 1`` ``<s>`` markers are prepended and ``N - 1`` ``</s>`` markers are
    appended (no padding at all when ``N <= 1``).

    Args:
        filename: Path of the text file to read, one sentence per line.
        N: Order of the N-gram model the data is being prepared for.

    Returns:
        A list with one token list per input line (blank lines yield a list
        that contains only the padding markers).
    """
    pre_string = "<s>"
    post_string = "</s>"
    pad = max(N - 1, 0)
    train_dat = []

    # The corpus contains accented characters, so the encoding is pinned
    # instead of relying on the platform default.
    with open(filename, encoding="utf-8") as file:
        for line in file:
            tokens = line.strip().split()
            train_dat.append([pre_string] * pad + tokens + [post_string] * pad)

    return train_dat

In [12]:
# Tokenise the training corpus; N=3 pads each line with two <s> and two </s> markers.
train_data = prepare_data('train.txt',3)

In [65]:
def train_model(train_data, N):
    """Count every n-gram of order 1..N found in the tokenised training lines.

    Args:
        train_data: Iterable of token lists (each token a string).
        N: Highest n-gram order to count.

    Returns:
        A dict mapping each n-gram, written as its tokens joined by single
        spaces, to the number of times it occurs in *train_data*.
    """
    model = {}
    for line in train_data:
        for order in range(1, N + 1):
            # A line of length L holds L - order + 1 n-grams of this order;
            # the "+ 1" keeps the final n-gram of the line from being dropped.
            # Lines shorter than the order give an empty range.
            for start in range(len(line) - order + 1):
                ngram = ' '.join(line[start:start + order])
                model[ngram] = model.get(ngram, 0) + 1
    return model

In [66]:
# Count the n-grams of order 1 and 2 over the padded training lines.
# NOTE(review): train_data was padded for N=3 while the model is built with N=2 — confirm this mismatch is intended.
model = train_model(train_data,2)

### Import test data and examine

In [102]:
def prepare_test_data(filename, N):
    """Extract the two candidate N-grams from every row of a test file.

    Each row is expected to contain a ``{first|second}`` placeholder. The
    ``N - 1`` space-separated tokens immediately before the placeholder are
    used as context (rows are left-padded with ``<s>`` markers so a
    placeholder at the start of a row still has context), and each option is
    appended to that context.

    Args:
        filename: Path of the test file, one row per line.
        N: Order of the N-grams to build; ``N == 1`` yields the bare options.

    Returns:
        A list with one entry per row: a ``(candidate_1, candidate_2)`` tuple
        of space-joined N-grams, or the string ``'error'`` (after printing a
        message) when the row holds no placeholder.
    """
    context_size = max(N - 1, 0)
    # Raw strings: "\{", "\|" and "\}" are regex escapes, not Python string
    # escapes. The option groups are greedy, so a row is assumed to carry a
    # single placeholder.
    reg_exp = re.compile(r'([^ ]*) ' * context_size + r'\{(.*)\|(.*)\}')
    pre_string = '<s> ' * context_size
    post_string = ' </s>' * context_size

    choices = []
    with open(filename, encoding='utf-8') as file:
        for row in file:
            # Collapse runs of spaces so each context group captures one token.
            row = pre_string + re.sub(' +', ' ', row) + post_string
            match = reg_exp.search(row)
            if match:
                groups = match.groups()
                context = list(groups[:context_size])
                first, second = groups[context_size], groups[context_size + 1]
                # Joining context + option also covers N == 1, where there is
                # no context and the option must stand alone.
                choice = (' '.join(context + [first]),
                          ' '.join(context + [second]))
            else:
                print('error, no value')
                choice = 'error'
            choices.append(choice)
    return choices

In [103]:
# Build the (option_1, option_2) bigram candidates for every test row, then display them.
choices = prepare_test_data('test.txt',2)
choices

[('seo a', 'seo á'),
 ('Timpeallachta a', 'Timpeallachta á'),
 ('luath, a', 'luath, á'),
 ('seandálaíochta a', 'seandálaíochta á'),
 ('difriúla a', 'difriúla á'),
 ('vóta a', 'vóta á'),
 ('is a', 'is á'),
 ('poiblí a', 'poiblí á'),
 ('gearán a', 'gearán á'),
 ('e a', 'e á'),
 ('Chontae a', 'Chontae á'),
 ('Veda a', 'Veda á'),
 ('dualgais a', 'dualgais á'),
 ('deabhal," a', 'deabhal," á'),
 ('deas a', 'deas á'),
 ('seo a', 'seo á'),
 ('1883 a', '1883 á'),
 ('atá a', 'atá á'),
 ('Tuairisceoir a', 'Tuairisceoir á'),
 ('fhorbairt a', 'fhorbairt á'),
 ('dom a', 'dom á'),
 ('t-urlár a', 't-urlár á'),
 ('olltoghcháin a', 'olltoghcháin á'),
 ('Níl a', 'Níl á'),
 ('níl a', 'níl á'),
 ('seo a', 'seo á'),
 ('post a', 'post á'),
 ('chaoi a', 'chaoi á'),
 ("ea?' a", "ea?' á"),
 ('scoileanna a', 'scoileanna á'),
 ('seo a', 'seo á'),
 ('seo a', 'seo á'),
 ('fearr a', 'fearr á'),
 ('measúnaithe a', 'measúnaithe á'),
 ('tromchúiseacha a', 'tromchúiseacha á'),
 ('Gaeilge a', 'Gaeilge á'),
 ('fear a', 'f

In [109]:
def basic_probability(option_1,option_2,model):
    """Estimate the relative probability of two competing n-grams.

    The counts of both options are looked up in *model*. When neither option
    has been seen, the leading token of each is dropped and the shorter
    n-grams are tried instead (back-off). When only one option is unseen its
    count is treated as 1 so it keeps a small non-zero probability.

    Args:
        option_1: First candidate n-gram, tokens separated by spaces.
        option_2: Second candidate n-gram, tokens separated by spaces.
        model: Mapping from n-gram string to its count.

    Returns:
        A ``(prob_1, prob_2)`` tuple that sums to 1. If neither option is
        found even after backing off as far as possible, ``(0.5, 0.5)`` is
        returned because the model holds no evidence either way.
    """
    while True:
        count_1 = model.get(option_1, 0)
        count_2 = model.get(option_2, 0)
        if count_1 != 0 or count_2 != 0:
            break

        tokens_1 = option_1.split()
        tokens_2 = option_2.split()
        if len(tokens_1) <= 1 or len(tokens_2) <= 1:
            # Nothing left to back off to; avoid dividing zero by zero.
            return 0.5, 0.5

        # Back off: retry with the first context token removed.
        option_1 = " ".join(tokens_1[1:])
        option_2 = " ".join(tokens_2[1:])

    # At most one of the counts is zero here; give the unseen option a count
    # of 1 so its probability is not exactly zero.
    if count_1 == 0:
        count_1 = 1
    elif count_2 == 0:
        count_2 = 1

    total = count_1 + count_2
    return count_1 / total, count_2 / total

In [110]:
# Time a single probability lookup. perf_counter() is a monotonic,
# high-resolution clock, which suits an interval this short better than the
# wall-clock time.time().
start = time.perf_counter()
prob = basic_probability(choices[0][0],choices[0][1],model)
end = time.perf_counter()
print(end-start, prob)

9.1552734375e-05 (0.9551042710135689, 0.044895728986431106)


In [111]:
# Extrapolate the single-call timing to 20000 calls (presumably the number of test rows — TODO confirm).
expected_runtime = (end-start)*20000
# Same estimate expressed in minutes.
expected_runtime/60

0.030517578125

### Try Bigram Model

In [112]:
def evaluate_model(model, choices):
    """Score every pair of candidates and build the submission table.

    Args:
        model: Mapping from n-gram string to count, passed through to
            ``basic_probability``.
        choices: Sequence whose items expose the two candidate n-grams at
            index 0 and index 1.

    Returns:
        A list of rows starting with the header ``['Id', 'Expected']``,
        followed by one ``[row_id, probability_of_first_candidate]`` row per
        choice, with row ids starting at 1.
    """
    header = ['Id', 'Expected']
    rows = []
    for row_id, pair in enumerate(choices, start=1):
        # Only the probability of the first candidate is reported.
        first_prob, _second_prob = basic_probability(pair[0], pair[1], model)
        rows.append([row_id, first_prob])
    return [header] + rows

In [113]:
# Score every test choice with the model trained above.
results = evaluate_model(model,choices)

In [114]:
# Display the results table: a header row followed by [Id, probability] rows.
results

[['Id', 'Expected'],
 [1, 0.9551042710135689],
 [2, 0.5],
 [3, 0.875],
 [4, 0.9393939393939394],
 [5, 0.9545454545454546],
 [6, 0.9955156950672646],
 [7, 0.9814333457111029],
 [8, 0.9560439560439561],
 [9, 0.9671232876712329],
 [10, 0.9770114942528736],
 [11, 0.9833333333333333],
 [12, 0.9667296380930382],
 [13, 0.9852941176470589],
 [14, 0.9667296380930382],
 [15, 0.9883720930232558],
 [16, 0.9551042710135689],
 [17, 0.6666666666666666],
 [18, 0.12635692574902302],
 [19, 0.9667296380930382],
 [20, 0.9894736842105263],
 [21, 0.9938837920489296],
 [22, 0.8333333333333334],
 [23, 0.75],
 [24, 0.9818181818181818],
 [25, 0.9968847352024922],
 [26, 0.9551042710135689],
 [27, 0.9767441860465116],
 [28, 0.9982905982905983],
 [29, 0.5],
 [30, 0.9798387096774194],
 [31, 0.9551042710135689],
 [32, 0.9551042710135689],
 [33, 0.9899569583931134],
 [34, 0.9705882352941176],
 [35, 0.9166666666666666],
 [36, 0.9355378819589787],
 [37, 0.9848066298342542],
 [38, 0.9965753424657534],
 [39, 0.8235294117

In [115]:
def write_output(filename,results):
    """Write result rows to *filename* as comma-separated lines.

    Args:
        filename: Path of the file to create or overwrite.
        results: Iterable of rows; the first two items of each row are
            converted with ``str`` and written as ``first,second``.
    """
    # The context manager closes the file even if a write fails part-way.
    with open(filename, 'w') as out_file:
        out_file.writelines(
            str(row[0]) + "," + str(row[1]) + "\n" for row in results
        )
    

In [116]:
def full_pipeline(output_name,train,test,N):
    """Train an N-gram model, score the test file and write the predictions.

    Args:
        output_name: Path of the CSV file to write.
        train: Path of the training corpus.
        test: Path of the test file.
        N: N-gram order used for data preparation, training and test
            extraction.
    """
    ngram_counts = train_model(prepare_data(train, N), N)
    candidates = prepare_test_data(test, N)
    write_output(output_name, evaluate_model(ngram_counts, candidates))
    
    

In [117]:
# Run the whole pipeline with N=3; predictions are written to trigram_model_adaptive.csv.
full_pipeline("trigram_model_adaptive.csv","train.txt","test.txt",3)