# Kaggle Competition Notebook

### Import necessary libraries

In [33]:
import math
import pandas as pd
import numpy as np
import re
import time
import unicodedata as ud

### Prep data for N Gram Model

In [34]:
def prepare_data(filename, N):
    pre_string = "<s>"
    post_string = "</s>"
    train_dat = []
        
    with open(filename) as file:
        for line in file:
            line = re.sub(' +',' ',line)
            line = re.sub('[,\."“”]','',line)
            split_line = line.strip().split()
            for gram in range(1,N):
                split_line.insert(0,pre_string)
                split_line.append(post_string)
            train_dat.append(split_line)
    
    return train_dat

In [35]:
train_data = prepare_data('train.txt',3)

In [36]:
train_data[0:10]

[['<s>',
  '<s>',
  'Nuair',
  'atá',
  'cúiseanna',
  'á',
  'gcur',
  'ar',
  'an',
  'taifead',
  'poiblí',
  'nó',
  'le',
  'linn',
  'ráiteas',
  'poiblí',
  'a',
  'dhéanamh',
  'ní',
  'mór',
  'don',
  'ionchúisitheoir',
  'cúram',
  'a',
  'ghlacadh',
  'gan',
  'náire',
  'a',
  'chur',
  'ar',
  'an',
  'gcúisí',
  'ná',
  'ar',
  'fhinnéithe',
  'trí',
  'eolas',
  'a',
  'nochtadh',
  'nach',
  'gcuirfear',
  'ar',
  'fáil',
  'go',
  'poiblí',
  'murach',
  'sin',
  '</s>',
  '</s>'],
 ['<s>',
  '<s>',
  'Tá',
  'cinneadh',
  'déanta',
  'ag',
  'Killoran',
  'agus',
  'ag',
  'a',
  'comhghleacaithe',
  'an',
  'bhéim',
  'a',
  'chur',
  'ar',
  'an',
  'drámaíocht',
  'agus',
  'airgead',
  'Chiste',
  'Craoltóireachta',
  'na',
  'Gaeilge',
  'á',
  'dháileadh',
  '</s>',
  '</s>'],
 ['<s>',
  '<s>',
  'Thug',
  'Luiz',
  'Henrique',
  'Uaquim',
  'uachtarán',
  'chomharchumann',
  'fheirmeoirí',
  'an',
  'réigiúin',
  'bithiúnaigh',
  'gléasta',
  'suas',
  'mar',


In [37]:
def train_model(train_data, N):
    model = {}
    for line in train_data:
        for each_N in range(1,N+1):
            #for each line, generate all ngrams
            for index in range(0,len(line)-each_N):
                ngram = line[index]
                for n_forward in range(1,each_N):
                    ngram += ' ' + line[index+n_forward]
                if ngram in model:
                    model[ngram] += 1
                else:
                    model[ngram] = 1
    return model            

In [38]:
model = train_model(train_data,3)

### Import Training data and examine

In [39]:
def prepare_test_data(filename, N):
    choices=[]
    reg_exp_str = '\{(.*)\|(.*)\}'
    pre_string = ''
    post_string = ''
    for gram in range(1,N):
        pre_string += '<s> '
        post_string += ' </s>'
        if gram % 2 == 0:
            reg_exp_str = reg_exp_str + ' ([^ ]*)'
        else:
            reg_exp_str = '([^ ]*) ' + reg_exp_str
    reg_exp = re.compile(reg_exp_str)
    with open(filename) as file:
        for row in file:
            row = re.sub(' +',' ',row)
            row = re.sub('[,\."“”]','',row)
            row = pre_string + row + post_string
            #extract options
            match = reg_exp.search(row)
            if match:
            #THIS NEEDS TO BE FIXED
                midpoint = math.ceil(N/2)
                choice_1 = match.group(midpoint)
                choice_2 = match.group(midpoint+1)
                for match_group in range(1,N+2):
                    if match_group < midpoint:
                        choice_1 = match.group(match_group) + ' ' + choice_1
                        choice_2 = match.group(match_group) + ' ' + choice_2
                    elif match_group > midpoint+1:
                        choice_1 += ' ' + match.group(match_group)
                        choice_2 += ' ' + match.group(match_group)
                choice = (choice_1, choice_2)
            else:
                print('error, no value')
                choice = 'error'
            choices.append(choice)
    return choices

In [40]:
choices = prepare_test_data('test.txt',3)
choices

[('seo a dhéanann', 'seo á dhéanann'),
 ('Timpeallachta a dhéanamh', 'Timpeallachta á dhéanamh'),
 ('luath a dúirt', 'luath á dúirt'),
 ('seandálaíochta a dhéanamh', 'seandálaíochta á dhéanamh'),
 ('difriúla a aithint', 'difriúla á aithint'),
 ('vóta a shaothraigh', 'vóta á shaothraigh'),
 ('is a cuirtear', 'is á cuirtear'),
 ('poiblí a bhí', 'poiblí á bhí'),
 ('gearán a thabhairt', 'gearán á thabhairt'),
 ('e a chur', 'e á chur'),
 ('Chontae a threisiú\n', 'Chontae á threisiú\n'),
 ('Veda a bheidh', 'Veda á bheidh'),
 ('dualgais a shanntar', 'dualgais á shanntar'),
 ('deabhal a deir', 'deabhal á deir'),
 ('deas a tabhairt', 'deas á tabhairt'),
 ('seo a leanas', 'seo á leanas'),
 ("1883 a d'fhoilsigh", "1883 á d'fhoilsigh"),
 ('atá a dhéanamh', 'atá á dhéanamh'),
 ('Tuairisceoir a bronnadh', 'Tuairisceoir á bronnadh'),
 ('fhorbairt a dhéanamh', 'fhorbairt á dhéanamh'),
 ('dom a chur', 'dom á chur'),
 ('t-urlár a ghlanadh', 't-urlár á ghlanadh'),
 ('olltoghcháin a thuar', 'olltoghcháin 

In [63]:
def basic_probability(option_1,option_2,model):
    if option_1 in model:
        count_1 = model[option_1]
    else:
        count_1 = 0
        
    if option_2 in model:
        count_2 = model[option_2]
    else:
        count_2 = 0
        
    #if count_1 < 4 and count_2 < 4:
    if count_1 == 0 and count_2 == 0:
        split_1 = option_1.split()
        split_2 = option_2.split()
        if len(split_1) > 1 and len(split_2) > 1:
            if len(split_1) % 2 ==0:
                split_1.pop(0)
                split_2.pop(0)
            else:
                split_1.pop()
                split_2.pop()
            new_option_1 = " ".join(split_1)
            new_option_2 = " ".join(split_2)
            return basic_probability(new_option_1,new_option_2,model)
            
    elif count_1 == 0:
        count_1 = .1
    elif count_2 == 0:
        count_2 = .1
        
    prob_1 = count_1/(count_1+count_2)
    prob_2 = count_2/(count_1+count_2)
    return prob_1, prob_2

In [64]:
start = time.time()
prob = basic_probability(choices[0][0],choices[0][1],model)
end = time.time()
print(end-start, prob)

8.988380432128906e-05 (0.9937888198757763, 0.006211180124223602)


In [65]:
expected_runtime = (end-start)*20000
expected_runtime/60

0.029961268107096355

### Try Bigram Model

In [66]:
def evaluate_model(model, choices):
    results = [['Id','Expected']]
    for index, choice in enumerate(choices):
        c1,c2 = basic_probability(choice[0],choice[1],model)
        results.append([index+1,c1])
    return results

In [67]:
results = evaluate_model(model,choices)

In [68]:
results

[['Id', 'Expected'],
 [1, 0.9937888198757763],
 [2, 0.9090909090909091],
 [3, 0.9090909090909091],
 [4, 0.9428571428571428],
 [5, 0.9552238805970149],
 [6, 0.9995497523638002],
 [7, 0.9090909090909091],
 [8, 0.9958506224066389],
 [9, 0.989010989010989],
 [10, 0.9090909090909091],
 [11, 0.9838709677419355],
 [12, 0.9667087094513829],
 [13, 0.9090909090909091],
 [14, 0.9667087094513829],
 [15, 0.9888888888888889],
 [16, 0.9999549772635181],
 [17, 0.9756097560975611],
 [18, 0.05063291139240506],
 [19, 0.9667087094513829],
 [20, 0.9941520467836257],
 [21, 0.9803921568627452],
 [22, 0.9803921568627452],
 [23, 0.9677419354838709],
 [24, 0.999674373168349],
 [25, 0.9923664122137404],
 [26, 0.4444444444444444],
 [27, 0.9768518518518519],
 [28, 0.9677419354838709],
 [29, 0.9090909090909091],
 [30, 0.9941520467836257],
 [31, 0.9999549772635181],
 [32, 0.9032258064516129],
 [33, 0.9523809523809523],
 [34, 0.9973753280839894],
 [35, 0.9923664122137404],
 [36, 0.9837133550488599],
 [37, 0.999237223

In [59]:
def write_output(filename,results):
    out_file = open(filename,'w')
    count =0
    for line in results:
        output = str(line[0]) + "," + str(line[1]) + "\n"
        out_file.write(output)
    out_file.close()
    

In [60]:
def full_pipeline(output_name,train,test,N):
    train_data = prepare_data(train,N)
    model = train_model(train_data,N)
    choices = prepare_test_data(test,N)
    results = evaluate_model(model,choices)
    write_output(output_name,results)
    
    

In [69]:
full_pipeline("a_very_bold_trigram.csv","train.txt","test.txt",3)