# Kaggle Competition Notebook

### Import necessary libraries

In [1]:
import math
import pandas as pd
import numpy as np
import re
import time
import unicodedata as ud

### Prep data for N Gram Model

In [2]:
def prepare_data(filename, N):
    pre_string = "<s>"
    post_string = "</s>"
    train_dat = []
        
    with open(filename) as file:
        for line in file:
            line = re.sub(' +',' ',line)
            line = re.sub('[,\."“”]','',line)
            split_line = line.strip().split()
            for gram in range(1,N):
                split_line.insert(0,pre_string)
                split_line.append(post_string)
            train_dat.append(split_line)
    
    return train_dat

In [35]:
#train_data = prepare_data('train.txt',3)

In [3]:
#train_data[0:10]

In [4]:
def train_model(train_data, N):
    model = {}
    for line in train_data:
        for each_N in range(1,N+1):
            #for each line, generate all ngrams
            for index in range(0,len(line)-each_N):
                ngram = line[index]
                for n_forward in range(1,each_N):
                    ngram += ' ' + line[index+n_forward]
                if ngram in model:
                    model[ngram] += 1
                else:
                    model[ngram] = 1
    return model            

In [5]:
#model = train_model(train_data,3)

### Import Training data and examine

In [6]:
def prepare_test_data(filename, N):
    choices=[]
    reg_exp_str = '\{(.*)\|(.*)\}'
    pre_string = ''
    post_string = ''
    for gram in range(1,N):
        pre_string += '<s> '
        post_string += ' </s>'
        if gram % 2 == 0:
            reg_exp_str = reg_exp_str + ' ([^ ]*)'
        else:
            reg_exp_str = '([^ ]*) ' + reg_exp_str
    reg_exp = re.compile(reg_exp_str)
    with open(filename) as file:
        for row in file:
            row = re.sub(' +',' ',row)
            row = re.sub('[,\."“”]','',row)
            row = pre_string + row + post_string
            #extract options
            match = reg_exp.search(row)
            if match:
            #THIS NEEDS TO BE FIXED
                midpoint = math.ceil(N/2)
                choice_1 = match.group(midpoint)
                choice_2 = match.group(midpoint+1)
                for match_group in range(1,N+2):
                    if match_group < midpoint:
                        choice_1 = match.group(match_group) + ' ' + choice_1
                        choice_2 = match.group(match_group) + ' ' + choice_2
                    elif match_group > midpoint+1:
                        choice_1 += ' ' + match.group(match_group)
                        choice_2 += ' ' + match.group(match_group)
                choice = (choice_1, choice_2)
            else:
                print('error, no value')
                choice = 'error'
            choices.append(choice)
    return choices

In [41]:
choices = prepare_test_data('test.txt',3)
choices

[('seo a dhéanann', 'seo á dhéanann'),
 ('Timpeallachta a dhéanamh', 'Timpeallachta á dhéanamh'),
 ('luath a dúirt', 'luath á dúirt'),
 ('seandálaíochta a dhéanamh', 'seandálaíochta á dhéanamh'),
 ('difriúla a aithint', 'difriúla á aithint'),
 ('vóta a shaothraigh', 'vóta á shaothraigh'),
 ('is a cuirtear', 'is á cuirtear'),
 ('poiblí a bhí', 'poiblí á bhí'),
 ('gearán a thabhairt', 'gearán á thabhairt'),
 ('e a chur', 'e á chur'),
 ('Chontae a threisiú\n', 'Chontae á threisiú\n'),
 ('Veda a bheidh', 'Veda á bheidh'),
 ('dualgais a shanntar', 'dualgais á shanntar'),
 ('deabhal a deir', 'deabhal á deir'),
 ('deas a tabhairt', 'deas á tabhairt'),
 ('seo a leanas', 'seo á leanas'),
 ("1883 a d'fhoilsigh", "1883 á d'fhoilsigh"),
 ('atá a dhéanamh', 'atá á dhéanamh'),
 ('Tuairisceoir a bronnadh', 'Tuairisceoir á bronnadh'),
 ('fhorbairt a dhéanamh', 'fhorbairt á dhéanamh'),
 ('dom a chur', 'dom á chur'),
 ('t-urlár a ghlanadh', 't-urlár á ghlanadh'),
 ('olltoghcháin a thuar', 'olltoghcháin 

In [56]:
def basic_probability(option_1,option_2,model):
    if option_1 in model:
        count_1 = model[option_1]
    else:
        count_1 = 0
        
    if option_2 in model:
        count_2 = model[option_2]
    else:
        count_2 = 0
    
    #if abs(count_1 - count_2) < 2:
    #if count_1 < 2 and count_2 < 2:
    if count_1 == 0 and count_2 == 0:
        split_1 = option_1.split()
        split_2 = option_2.split()
        if len(split_1) > 1 and len(split_2) > 1:
            if len(split_1) % 2 ==0:
                split_1.pop(0)
                split_2.pop(0)
            else:
                split_1.pop()
                split_2.pop()
            new_option_1 = " ".join(split_1)
            new_option_2 = " ".join(split_2)
            return basic_probability(new_option_1,new_option_2,model)
            
    elif count_1 == 0:
        count_1 = .1
    elif count_2 == 0:
        count_2 = .1
        
    prob_1 = count_1/(count_1+count_2)
    prob_2 = count_2/(count_1+count_2)
    
    if prob_1 < 0.47:
        prob_1 = 0.01
    if prob_1 > 0.53:
        prob_1 = 0.99
    return prob_1, prob_2

In [47]:
def interpolate_probability(option_1,option_2,model):
    split_1 = option_1.split()
    split_2 = option_2.split()
    len_1 = len(split_1)
    len_2 = len(split_2)
    new_prob_1 = 0
    new_prob_2 = 0
    
    if option_1 in model:
        count_1 = model[option_1]
    else:
        count_1 = 0
        
    if option_2 in model:
        count_2 = model[option_2]
    else:
        count_2 = 0
    
    if len_1 > 1 and len_2 > 1:
            if len(split_1) % 2 ==0:
                split_1.pop(0)
                split_2.pop(0)
            else:
                split_1.pop()
                split_2.pop()
            new_option_1 = " ".join(split_1)
            new_option_2 = " ".join(split_2)
            new_prob_1, new_prob_2 = interpolate_probability(new_option_1,new_option_2,model)
        
    if count_1 == 0 and count_2 ==0:
        count_1 = 1
        count_2 = 1
    elif count_1 == 0:
        count_1 = .1
    elif count_2 == 0:
        count_2 = .1
        
    prob_1 = count_1/(count_1+count_2)
    prob_2 = count_2/(count_1+count_2)
    
    #if new_prob_1 != 0 and new_prob_2 != 0:
        #prob_1 = 0.5*prob_1 + 0.5*new_prob_1
        #prob_2 = 0.5*prob_2 + 0.5*new_prob_2
    
    return prob_1, prob_2

In [44]:
start = time.time()
prob = interpolate_probability(choices[0][0],choices[0][1],model)
end = time.time()
print(end-start, prob)

NameError: name 'model' is not defined

In [18]:
#expected_runtime = (end-start)*20000
#expected_runtime/60

### Try Bigram Model

In [54]:
def evaluate_model(model, choices):
    results = [['Id','Expected']]
    for index, choice in enumerate(choices):
        c1,c2 = basic_probability(choice[0],choice[1],model)
        results.append([index+1,c1])
    return results

In [28]:
def evaluate_interp_model(model, choices):
    results = [['Id','Expected']]
    for index, choice in enumerate(choices):
        c1,c2 = interpolate_probability(choice[0],choice[1],model)
        results.append([index+1,c1])
    return results

In [29]:
#results = evaluate_model(model,choices)

In [30]:
#results

In [49]:
def write_output(filename,results):
    out_file = open(filename,'w')
    count =0
    for line in results:
        output = str(line[0]) + "," + str(line[1]) + "\n"
        out_file.write(output)
    out_file.close()
    

In [50]:
def full_pipeline(output_name,train,test,N):
    train_data = prepare_data(train,N)
    model = train_model(train_data,N)
    choices = prepare_test_data(test,N)
    results = evaluate_model(model,choices)
    write_output(output_name,results)

In [33]:
def full_interp_pipeline(output_name,train,test,N):
    train_data = prepare_data(train,N)
    model = train_model(train_data,N)
    choices = prepare_test_data(test,N)
    results = evaluate_interp_model(model,choices)
    write_output(output_name,results)

In [57]:
full_pipeline("trigram_with_bold_reassignment.csv","train.txt","test.txt",3)

In [45]:
full_interp_pipeline("interpreted_trigram.csv","train.txt","test.txt",3)