# Kaggle Competition Development Notebook

### Import necessary libraries

In [107]:
import math
import pandas as pd
import numpy as np
import re
import time
import unicodedata as ud
import csv
import random
from sklearn.metrics import log_loss

# Import the vocab file

In [208]:
def create_vocab():
    with open('vocab.csv') as file:
        reader = csv.reader(file)
        vocab = list(reader)
    return vocab

### Prep training data for N Gram Model

In [84]:
def prepare_data(filename, N):
    pre_string = "<s>"
    post_string = "</s>"
    train_dat = []
        
    with open(filename, encoding="utf8") as file:
        for line in file:
            line = ud.normalize("NFC",line)
            line = re.sub(' +',' ',line)
            line = re.sub('[,\.\?"“”]','',line)
            split_line = line.strip().split()
            for gram in range(1,N):
                split_line.insert(0,pre_string)
                split_line.append(post_string)
            train_dat.append(split_line)
    
    return train_dat

## Create a test example from a training example

In [209]:
def create_test_example(training_line,word):
    if " " + word[0] + " " in training_line:
        new_str = ' {'+word[0]+"|"+word[1] + '} '
        test_line = re.sub(" " + word[0] + " ",new_str, training_line,1)
        test_line = re.sub("<s>","",test_line)
        test_line = re.sub("</s>","",test_line)
        prob = 1
        return test_line.strip(),prob
    elif " " + word[1] + " " in training_line:
        new_str = ' {'+word[0]+"|"+word[1] + '} '
        test_line = re.sub(" " + word[1] + " ",new_str, training_line,1)
        test_line = re.sub("<s>","",test_line)
        test_line = re.sub("</s>","",test_line)
        prob = 0
        return test_line.strip(),prob
    else:
        print("Error, word not found")

## Determine which vocab word corresponds to location in the training data

In [210]:
def which_vocab(index):
    if index < 24200:
        return 0, 24200
    elif index < 48400:
        return 1, 24200
    elif index < 51290:
        return 2, 2890
    elif index < 75490:
        return 3, 24200
    elif index < 99690:
        return 4, 24200
    elif index < 123890:
        return 5, 24200
    elif index < 131159:
        return 6, 7269
    elif index < 155359:
        return 7, 24200
    elif index < 179559:
        return 8, 24200
    elif index < 203759:
        return 9, 24200
    elif index < 227959:
        return 10, 24200
    elif index < 252159:
        return 11, 24200
    elif index < 258227:
        return 12, 6068
    elif index < 282427:
        return 13, 24200
    elif index < 306627:
        return 14, 24200
    elif index < 310023:
        return 15, 3396
    elif index < 334223:
        return 16, 24200
    elif index < 358423:
        return 17, 24200
    elif index < 382623:
        return 18, 24200
    elif index < 406823:
        return 19, 24200
    elif index < 418928:
        return 20, 12105
    elif index < 430425:
        return 21, 11497
    elif index < 446988:
        return 22, 16563
    elif index < 452037:
        return 23, 5049
    elif index < 456571:
        return 24, 4534

## Split the data into training and development
#### Dev is saved to file, along with dev answers, new training array is returned

In [211]:
def train_dev_split(train_array, percent_split, vocab_df, dev_filename, answers_filename):
    dev_file = open(dev_filename,'w',encoding='utf8')
    dev_answers = open(answers_filename,'w',encoding='utf8')
    dev_answers.write("Id,Expected\n")
    new_train = []

    count = 1
    vocab_word = 0
    for index,training_line in enumerate(train_array):
        vocab_word, vocab_count = which_vocab(index)  
        if random.random() < percent_split/100:
            test_line, prob = create_test_example(" ".join(training_line),vocab_df[vocab_word])
            dev_answers.write(str(count) + "," + str(prob) + "\n")
            dev_file.write(test_line + "\n")
            count += 1
        else:
            new_train.append(training_line)
        
    dev_file.close()
    dev_answers.close()
    
    return new_train

### Actually create the dictionary with all n-grams

In [212]:
def train_model(train_data, N):
    model = {}
    for line in train_data:
        for each_N in range(1,N+1):
            #for each line, generate all ngrams
            for index in range(0,len(line)-each_N):
                ngram = line[index]
                for n_forward in range(1,each_N):
                    ngram += ' ' + line[index+n_forward]
                if ngram in model:
                    model[ngram] += 1
                else:
                    model[ngram] = 1
    return model            

### Extract N-gram choices from the test data

In [213]:
def prepare_test_data(filename, N):
    choices=[]
    reg_exp_str = '\{(.*)\|(.*)\}'
    pre_string = ''
    post_string = ''
    for gram in range(1,N):
        pre_string += '<s> '
        post_string += ' </s>'
        if gram % 2 == 0:
            reg_exp_str = reg_exp_str + ' ([^ ]*)'
        else:
            reg_exp_str = '([^ ]*) ' + reg_exp_str
    
    reg_exp = re.compile(reg_exp_str)
    with open(filename, encoding="utf8") as file:
        for row in file:
            row = ud.normalize("NFC",row)
            row = re.sub(' +',' ',row)
            row = re.sub('[,\.\?"“”]','',row)
            row = pre_string + row.strip() + post_string
            #extract options
            match = reg_exp.search(row)
            if match:
                midpoint = math.ceil(N/2)
                choice_1 = match.group(midpoint)
                choice_2 = match.group(midpoint+1)
                for match_group in range(1,N+2):
                    if match_group < midpoint:
                        choice_1 = match.group(match_group) + ' ' + choice_1
                        choice_2 = match.group(match_group) + ' ' + choice_2
                    elif match_group > midpoint+1:
                        choice_1 += ' ' + match.group(match_group)
                        choice_2 += ' ' + match.group(match_group)
                choice = (choice_1, choice_2)
            else:
                print('error, no value')
                choice = 'error'
            choices.append(choice)
    return choices

## Probability function to determine likelihood of option_1 and option_2

In [215]:
def backoff_probability(option_1,option_2,model):
    if option_1 in model:
        count_1 = model[option_1]
    else:
        count_1 = 0
        
    if option_2 in model:
        count_2 = model[option_2]
    else:
        count_2 = 0
    
    #if abs(count_1 - count_2) < 2:
    #if count_1 < 2 and count_2 < 2:
    if count_1 == 0 and count_2 == 0:
        split_1 = option_1.split()
        split_2 = option_2.split()
        if len(split_1) > 1 and len(split_2) > 1:
            if len(split_1) % 2 ==0:
                split_1.pop(0)
                split_2.pop(0)
            else:
                split_1.pop()
                split_2.pop()
            new_option_1 = " ".join(split_1)
            new_option_2 = " ".join(split_2)
            return basic_probability(new_option_1,new_option_2,model)
            
    elif count_1 == 0:
        count_1 = .1
    elif count_2 == 0:
        count_2 = .1
        
    prob_1 = count_1/(count_1+count_2)
    prob_2 = count_2/(count_1+count_2)
    
    return prob_1, prob_2

In [216]:
def basic_probability(option_1,option_2,model):
    if option_1 in model:
        count_1 = model[option_1]
    else:
        count_1 = 0
        
    if option_2 in model:
        count_2 = model[option_2]
    else:
        count_2 = 0
        
    if count_1 == 0 and count_2 == 0:
        count_1 = 1
        count_2 = 1    
    elif count_1 == 0:
        count_1 = .1
    elif count_2 == 0:
        count_2 = .1
        
    prob_1 = count_1/(count_1+count_2)
    prob_2 = count_2/(count_1+count_2)
    
    return prob_1, prob_2

In [259]:
def interpolate_probability(option_1,option_2,model,weights):
    prob_1_arr=[]
    prob_2_arr=[]
    n_option_1 = option_1
    n_option_2 = option_2
    
    while True:
        # get basic probabilty and add to overall
        c_prob_1, c_prob_2 = basic_probability(n_option_1,n_option_2,model)
        prob_1_arr.append(c_prob_1)
        prob_2_arr.append(c_prob_2)
        
        # split the options
        split_1 = n_option_1.split()
        split_2 = n_option_2.split()
        
        #get option lengths
        len_1 = len(split_1)
        len_2 = len(split_2)
        
        if len_1 <= 1 or len_2 <= 1:
            break
        
        if len(split_1) % 2 ==0:
            split_1.pop(0)
            split_2.pop(0)
        else:
            split_1.pop()
            split_2.pop()
        n_option_1 = " ".join(split_1)
        n_option_2 = " ".join(split_2)   
    
    #multiply probabilities by weights
    prob_1 = np.dot(weights,prob_1_arr)
    prob_2 = np.dot(weights,prob_2_arr)
    
    return prob_1, prob_2

In [271]:
start = time.time()
prob = interpolate_probability(choices[1][0],choices[1][1],model,[1,0,0])
end = time.time()
print(end-start, prob)

0.00043773651123046875 (0.5, 0.5)


In [268]:
choices

[('ag a comhghleacaithe', 'ag á comhghleacaithe'),
 ('Achta a thabhairt', 'Achta á thabhairt'),
 ('Idirnáisiúnta a chomhlíonadh', 'Idirnáisiúnta á chomhlíonadh'),
 ('Cé a sholáthródh', 'Cé á sholáthródh'),
 ('seo a dhaingniú', 'seo á dhaingniú'),
 ('agat a chur', 'agat á chur'),
 ('airgeadais a thabhairt', 'airgeadais á thabhairt'),
 ('rang a trí', 'rang á trí'),
 ('ball a dhéanann', 'ball á dhéanann'),
 ('aghaidh a thabhairt', 'aghaidh á thabhairt'),
 ('aitheasc a thabhairt', 'aitheasc á thabhairt'),
 ('nuair a bhí', 'nuair á bhí'),
 ('éard a bhí', 'éard á bhí'),
 ('euro a ghearradh', 'euro á ghearradh'),
 ('scoláiriúil a thiomsaigh', 'scoláiriúil á thiomsaigh'),
 ('cuairt a thabhairt', 'cuairt á thabhairt'),
 ('agus a bhfuil', 'agus á bhfuil'),
 ('fáth a mbíonn', 'fáth á mbíonn'),
 ('speisialtóra a úsáid', 'speisialtóra á úsáid'),
 ('athbhreithniú a dhéanamh', 'athbhreithniú á dhéanamh'),
 ('haird a thabhairt', 'haird á thabhairt'),
 ('é a bheith', 'é á bheith'),
 ('nuachta a thaispe

In [18]:
#expected_runtime = (end-start)*20000
#expected_runtime/60

### Try Bigram Model

In [99]:
def evaluate_model(model, choices):
    results = [['Id','Expected']]
    for index, choice in enumerate(choices):
        c1,c2 = basic_probability(choice[0],choice[1],model)
        results.append([index+1,c1])
    return results

In [221]:
def evaluate_interp_model(model, choices, weights):
    results = [['Id','Expected']]
    for index, choice in enumerate(choices):
        c1,c2 = interpolate_probability(choice[0],choice[1],model, weights)
        results.append([index+1,c1])
    return results

In [101]:
results = evaluate_model(model,choices)

In [103]:
#results

In [200]:
def write_output(filename,results):
    out_file = open(filename,'w')
    count =0
    for line in results:
        output = str(line[0]) + "," + str(line[1]) + "\n"
        out_file.write(output)
    out_file.close()
    return 1
    

In [183]:
def evaluate_results(prediction_file,actual_file):
    with open(prediction_file) as file:
        reader = csv.reader(file)
        predictions = list(reader)
    
    with open(actual_file) as file:
        reader = csv.reader(file)
        actual = list(reader)
    
    if len(actual) != len(predictions):
        print("Error: Files not the same length")
        return
    
    actual = np.array(actual)
    actual = actual[1:,1].astype(np.float64)
    predictions = np.array(predictions)
    predictions = predictions[1:,1].astype(np.float64)
    
    log_loss_score = log_loss(actual, predictions, eps=1e-15)
        
    return log_loss_score


In [199]:
evaluate_results("First_Dev_Test.csv","dev_answers.csv")

0.8597526024197777

In [105]:
def full_pipeline(output_name,train,test,N):
    train_data = prepare_data(train,N)
    model = train_model(train_data,N)
    choices = prepare_test_data(test,N)
    results = evaluate_model(model,choices)
    write_output(output_name,results)

In [222]:
def full_interp_pipeline(output_name,train,test,N,weights):
    train_data = prepare_data(train,N)
    model = train_model(train_data,N)
    choices = prepare_test_data(test,N)
    results = evaluate_interp_model(model,choices,weights)
    write_output(output_name,results)

In [57]:
#full_pipeline("trigram_with_bold_reassignment.csv","train.txt","test.txt",3)

In [45]:
#full_interp_pipeline("interpreted_trigram.csv","train.txt","test.txt",3)

In [223]:
def full_dev_pipeline(output_name,train,dev_test,dev_answers,N,percent_split,weights):
    train_data = prepare_data(train,N)
    vocab = create_vocab()
    new_train = train_dev_split(train_data, percent_split, vocab, dev_test, dev_answers)
    model = train_model(new_train,N)
    choices = prepare_test_data(dev_test,N)
    results = evaluate_interp_model(model,choices,weights)
    success = write_output(output_name,results)
    score = evaluate_results(output_name,dev_answers)
    return score

In [226]:
score = full_dev_pipeline("First_Dev_Test.csv","train.txt","dev_test.txt","dev_answers.csv",3,10,[0.2,0.7,0.1])
score

0.29523988463921635

In [None]:
for x in np.arange(0,1.1,0.1):
    if x == 1:
        weights = [x,0,0]
        print(weights,full_dev_pipeline("a.csv","train.txt","dev_test.txt","dev_answers.csv",3,10,weights))
    for y in np.arange (0,1.05-x,0.1):
        z = 1-y-x
        weights = [x,y,z]
        print(weights,full_dev_pipeline("a.csv","train.txt","dev_test.txt","dev_answers.csv",3,10,weights))

[0.0, 0.0, 1.0] 0.37767889720689996
[0.0, 0.1, 0.9] 0.3377886110497794
[0.0, 0.2, 0.8] 0.313699857891837
[0.0, 0.30000000000000004, 0.7] 0.29397317650376176
[0.0, 0.4, 0.6] 0.2744637773889263
[0.0, 0.5, 0.5] 0.2597998781991521
[0.0, 0.6000000000000001, 0.3999999999999999] 0.2471187742141428
[0.0, 0.7000000000000001, 0.29999999999999993] 0.23843004811530533
[0.0, 0.8, 0.19999999999999996] 0.22937582651983793
[0.0, 0.9, 0.09999999999999998] 0.22059384338933827
[0.0, 1.0, 0.0] 0.22163925290300135
[0.1, 0.0, 0.9] 0.3461718933359337
[0.1, 0.1, 0.8] 0.3219281964716464
[0.1, 0.2, 0.7000000000000001] 0.297256270254709
[0.1, 0.30000000000000004, 0.6] 0.2755222487922954
[0.1, 0.4, 0.5] 0.26069576110614917
[0.1, 0.5, 0.4] 0.24583992664291304
[0.1, 0.6000000000000001, 0.29999999999999993] 0.2334108156847355
[0.1, 0.7000000000000001, 0.19999999999999993] 0.22618134328552997
[0.1, 0.8, 0.09999999999999995] 0.2174597713244505
[0.1, 0.9, -2.7755575615628914e-17] 0.21202272670899094
[0.2, 0.0, 0.8] 0.3