# Kaggle Competition Development Notebook

### Import necessary libraries

In [2]:
import math
import pandas as pd
import numpy as np
import re
import time
import unicodedata as ud
import csv
import random
from sklearn.metrics import log_loss

# Import the vocab file

In [3]:
def create_vocab():
    with open('vocab.csv') as file:
        reader = csv.reader(file)
        vocab = list(reader)
    return vocab

### Prep training data for N Gram Model

In [4]:
def prepare_data(filename, N):
    pre_string = "<s>"
    post_string = "</s>"
    train_dat = []
        
    with open(filename, encoding="utf8") as file:
        for line in file:
            line = ud.normalize("NFC",line)
            line = re.sub('[–*:&()?@#!/,."“”]','',line)
            line = re.sub('\s+',' ',line)
            line = line.lower()
            split_line = line.strip().split()
            for gram in range(1,N):
                split_line.insert(0,pre_string)
                split_line.append(post_string)
            train_dat.append(split_line)
    
    return train_dat

## Create a test example from a training example

In [5]:
def create_test_example(training_line,word):
    if " " + word[0] + " " in training_line:
        new_str = ' {'+word[0]+"|"+word[1] + '} '
        test_line = re.sub(" " + word[0] + " ",new_str, training_line,1)
        test_line = re.sub("<s>","",test_line)
        test_line = re.sub("</s>","",test_line)
        prob = 1
        return test_line.strip(),prob
    elif " " + word[1] + " " in training_line:
        new_str = ' {'+word[0]+"|"+word[1] + '} '
        test_line = re.sub(" " + word[1] + " ",new_str, training_line,1)
        test_line = re.sub("<s>","",test_line)
        test_line = re.sub("</s>","",test_line)
        prob = 0
        return test_line.strip(),prob
    else:
        print("Error, word not found")

## Determine which vocab word corresponds to location in the training data

In [6]:
def which_vocab(index):
    if index < 24200:
        return 0, 24200
    elif index < 48400:
        return 1, 24200
    elif index < 51290:
        return 2, 2890
    elif index < 75490:
        return 3, 24200
    elif index < 99690:
        return 4, 24200
    elif index < 123890:
        return 5, 24200
    elif index < 131159:
        return 6, 7269
    elif index < 155359:
        return 7, 24200
    elif index < 179559:
        return 8, 24200
    elif index < 203759:
        return 9, 24200
    elif index < 227959:
        return 10, 24200
    elif index < 252159:
        return 11, 24200
    elif index < 258227:
        return 12, 6068
    elif index < 282427:
        return 13, 24200
    elif index < 306627:
        return 14, 24200
    elif index < 310023:
        return 15, 3396
    elif index < 334223:
        return 16, 24200
    elif index < 358423:
        return 17, 24200
    elif index < 382623:
        return 18, 24200
    elif index < 406823:
        return 19, 24200
    elif index < 418928:
        return 20, 12105
    elif index < 430425:
        return 21, 11497
    elif index < 446988:
        return 22, 16563
    elif index < 452037:
        return 23, 5049
    elif index < 456571:
        return 24, 4534

## Split the data into training and development
#### Dev is saved to file, along with dev answers, new training array is returned

In [7]:
def train_dev_split(train_array, percent_split, vocab_df, dev_filename, answers_filename):
    dev_file = open(dev_filename,'w',encoding='utf8')
    dev_answers = open(answers_filename,'w',encoding='utf8')
    dev_answers.write("Id,Expected\n")
    new_train = []

    count = 1
    vocab_word = 0
    for index,training_line in enumerate(train_array):
        vocab_word, vocab_count = which_vocab(index)  
        if random.random() < percent_split/100:
            test_line, prob = create_test_example(" ".join(training_line),vocab_df[vocab_word])
            dev_answers.write(str(count) + "," + str(prob) + "\n")
            dev_file.write(test_line + "\n")
            count += 1
        else:
            new_train.append(training_line)
        
    dev_file.close()
    dev_answers.close()
    
    return new_train

### Extract N-gram choices from the test data

In [8]:
def prepare_test_data(filename, N):
    choices=[]
    reg_exp_str = '\{(.*)\|(.*)\}'
    pre_string = ''
    post_string = ''
    for gram in range(1,N):
        pre_string += '<s> '
        post_string += ' </s>'
        if gram % 2 == 0:
            reg_exp_str = reg_exp_str + ' ([^ ]+)'
        else:
            reg_exp_str = '([^ ]+) ' + reg_exp_str
    
    reg_exp = re.compile(reg_exp_str)
    with open(filename, encoding="utf8") as file:
        for row in file:
            row = ud.normalize("NFC",row)
            row = re.sub('[–*:&()?@#!/,."“”]','',row)
            row = re.sub('\s+',' ',row)
            row = row.lower()
            row = pre_string + row.strip() + post_string
            #extract options
            match = reg_exp.search(row)
            if match:
                if N%2 ==0:
                    midpoint = math.ceil((N+2)/2)
                else:
                    midpoint = math.ceil(N/2)
                choice_1 = match.group(midpoint)
                #print(choice_1)
                choice_2 = match.group(midpoint+1)
                #print(choice_2)
                for match_group in range(midpoint+2,N+2):
                    choice_1 += ' ' + match.group(match_group)
                    choice_2 += ' ' + match.group(match_group)
                for match_group in range(midpoint-1,0,-1):
                    choice_1 = match.group(match_group) + ' ' + choice_1
                    choice_2 = match.group(match_group) + ' ' + choice_2
                choice = (choice_1, choice_2)
            else:
                print('error, no value')
                print(reg_exp," ",row)
                choice = 'error'
            choices.append(choice)
    return choices

### Actually create the dictionary with all n-grams

In [9]:
def train_model(train_data, N):
    model = {}
    for line in train_data:
        for each_N in range(1,N+1):
            #for each line, generate all ngrams
            for index in range(0,len(line)-each_N):
                ngram = line[index]
                for n_forward in range(1,each_N):
                    ngram += ' ' + line[index+n_forward]
                if ngram in model:
                    model[ngram] += 1
                else:
                    model[ngram] = 1
    return model            

In [27]:
train = prepare_data('train.txt',3)
model =train_model(train,3)

## Probability function to determine likelihood of option_1 and option_2

In [10]:
def basic_probability(option_1,option_2,model):
    if option_1 in model:
        count_1 = model[option_1]
    else:
        count_1 = 0
        
    if option_2 in model:
        count_2 = model[option_2]
    else:
        count_2 = 0
        
    if count_1 == 0 and count_2 == 0:
        count_1 = 1
        count_2 = 1    
    elif count_1 == 0:
        count_1 = .1
    elif count_2 == 0:
        count_2 = .1
        
    prob_1 = count_1/(count_1+count_2)
    prob_2 = count_2/(count_1+count_2)
    
    return prob_1, prob_2

In [38]:
def backoff_probability(option_1,option_2,model):
    if option_1 in model:
        count_1 = model[option_1]
    else:
        count_1 = 0
        
    if option_2 in model:
        count_2 = model[option_2]
    else:
        count_2 = 0
    
    if count_1 == 0 and count_2 == 0:
        split_1 = option_1.split()
        split_2 = option_2.split()
        if len(split_1) > 1 and len(split_2) > 1:
            if len(split_1) % 2 ==0:
                split_1.pop(0)
                split_2.pop(0)
            else:
                split_1.pop()
                split_2.pop()
            new_option_1 = " ".join(split_1)
            new_option_2 = " ".join(split_2)
            return backoff_probability(new_option_1,new_option_2,model)
            
    elif count_1 == 0:
        count_1 = .1
    elif count_2 == 0:
        count_2 = .1
        
    prob_1 = count_1/(count_1+count_2)
    prob_2 = count_2/(count_1+count_2)
    
    return prob_1, prob_2

In [12]:
def weighted_probability(option_1,option_2,model,weight):
    if option_1 in model:
        count_1 = model[option_1]
    else:
        count_1 = 0
        
    if option_2 in model:
        count_2 = model[option_2]
    else:
        count_2 = 0
        
    if count_1 == 0 and count_2 == 0:
        count_1 = 1
        count_2 = 1    
    elif count_1 == 0:
        count_1 = .1
    elif count_2 == 0:
        count_2 = .1
        
    prob_1 = count_1/(count_1+count_2) *weight
    prob_2 = count_2/(count_1+count_2) *weight
    
    return prob_1, prob_2

In [52]:
def interpolate_probability(option_1,option_2,model,weights):
    prob_1_arr=[]
    prob_2_arr=[]
    n_option_1 = option_1
    n_option_2 = option_2
    
    while True:
        # get basic probabilty and add to overall
        c_prob_1, c_prob_2 = backoff_probability(n_option_1,n_option_2,model)
        #c_prob_1, c_prob_2 = basic_probability(n_option_1,n_option_2,model)
        prob_1_arr.append(c_prob_1)
        prob_2_arr.append(c_prob_2)
        
        # split the options
        split_1 = n_option_1.split()
        split_2 = n_option_2.split()
        
        #get option lengths
        len_1 = len(split_1)
        len_2 = len(split_2)
        
        if len_1 <= 1 or len_2 <= 1:
            break
        
        if len(split_1) % 2 ==0:
            split_1.pop(0)
            split_2.pop(0)
        else:
            split_1.pop()
            split_2.pop()
        n_option_1 = " ".join(split_1)
        n_option_2 = " ".join(split_2)   
    
    #multiply probabilities by weights
    prob_1 = np.dot(weights,prob_1_arr)
    prob_2 = np.dot(weights,prob_2_arr)
    
    return prob_1, prob_2

In [53]:
# start = time.time()
# prob = interpolate_probability(choices[4937][0],choices[4937][1],model,[.1,.9,0])
# end = time.time()
# print(end-start, prob)

In [54]:
#expected_runtime = (end-start)*20000
#expected_runtime/60

# Try AdaBoost

In [18]:
# def ada_boost(train):
#     train_data = prepare_data(train,N)
#     vocab = create_vocab()
#     new_train = train_dev_split(train_data, percent_split, vocab, dev_test, dev_answers)
#     choices = prepare_test_data(dev_test,N)
    
#     results_array = []
#     #initialize weights
#     weights = []
#     for x in range(0,len(train)):
#         weights.append(1/len(train))
        
#     for trial in range(0,K):
#         #train unigram with weights
#         model = train_model(new_train,N)
#         results = evaluate_weighted_model(model,choices,weights)
#         results_array.append(results)
#         success = write_output(output_name,results)
#         score = evaluate_results(output_name,dev_answers)
#         adaptive_param = 0.5*log((1-score)/score)
#         weights = 1/Z * weights * exp(adaptive_param)
        
    
#     train_data = prepare_data(train,N)
#     vocab = create_vocab()
#     new_train = train_dev_split(train_data, percent_split, vocab, dev_test, dev_answers)
#     model = train_model(new_train,N)
#     choices = prepare_test_data(dev_test,N)
#     results = evaluate_interp_model(model,choices,weights)
#     success = write_output(output_name,results)
#     score = evaluate_results(output_name,dev_answers)
        

### Model Evaluation Options

In [15]:
def evaluate_model(model, choices):
    results = [['Id','Expected']]
    for index, choice in enumerate(choices):
        c1,c2 = basic_probability(choice[0],choice[1],model)
        results.append([index+1,c1])
    return results

In [43]:
def evaluate_backoff_model(model,choices):
    results = [['Id','Expected']]
    for index, choice in enumerate(choices):
        c1,c2 = backoff_probability(choice[0],choice[1],model)
        results.append([index+1,c1])
    return results

In [17]:
def evaluate_interp_model(model, choices, weights):
    results = [['Id','Expected']]
    for index, choice in enumerate(choices):
        c1,c2 = interpolate_probability(choice[0],choice[1],model, weights)
        results.append([index+1,c1])
    return results

In [18]:
def evaulate_weighted_model(model, choices, weights):
    results = [['Id','Expected']]
    for index, choice in enumerate(choices):
        c1,c2 = weighted_probability(choice[0],choice[1],model, weights[index])
        results.append([index+1,c1])
    return results

### Write output to a file

In [19]:
def write_output(filename,results):
    out_file = open(filename,'w')
    count =0
    for line in results:
        output = str(line[0]) + "," + str(line[1]) + "\n"
        out_file.write(output)
    out_file.close()
    return 1 

### Compare predictions to test file

In [20]:
def evaluate_results(prediction_file,actual_file):
    with open(prediction_file) as file:
        reader = csv.reader(file)
        predictions = list(reader)
    
    with open(actual_file) as file:
        reader = csv.reader(file)
        actual = list(reader)
    
    if len(actual) != len(predictions):
        print("Error: Files not the same length")
        return
    
    actual = np.array(actual)
    actual = actual[1:,1].astype(np.float64)
    predictions = np.array(predictions)
    predictions = predictions[1:,1].astype(np.float64)
    
    log_loss_score = log_loss(actual, predictions, eps=1e-15)
        
    return log_loss_score

In [27]:
#evaluate_results("First_Dev_Test.csv","dev_answers.csv")

In [21]:
def full_pipeline(output_name,train,test,N):
    train_data = prepare_data(train,N)
    model = train_model(train_data,N)
    choices = prepare_test_data(test,N)
    results = evaluate_model(model,choices)
    write_output(output_name,results)

In [40]:
def full_backoff_pipeline(output_name,train,test,N):
    train_data = prepare_data(train,N)
    model = train_model(train_data,N)
    choices = prepare_test_data(test,N)
    results = evaluate_backoff_model(model,choices)
    write_output(output_name,results)

In [23]:
def full_interp_pipeline(output_name,train,test,N,weights):
    train_data = prepare_data(train,N)
    model = train_model(train_data,N)
    choices = prepare_test_data(test,N)
    results = evaluate_interp_model(model,choices,weights)
    write_output(output_name,results)

In [198]:
#full_pipeline("trigram_with_bold_reassignment.csv","train.txt","test.txt",3)

In [269]:
full_interp_pipeline("optimized_weight_hexagram.csv","train.txt","test.txt",6,[0, 0.1, 0.1, .7, .1, 0])

### Full Development Pipeline to Train model and get scores

With basic probability

In [24]:
def full_dev_basic_pipeline(output_name,train,dev_test,dev_answers,N,percent_split):
    train_data = prepare_data(train,N)
    vocab = create_vocab()
    new_train = train_dev_split(train_data, percent_split, vocab, dev_test, dev_answers)
    model = train_model(new_train,N)
    choices = prepare_test_data(dev_test,N)
    results = evaluate_model(model,choices)
    success = write_output(output_name,results)
    score = evaluate_results(output_name,dev_answers)
    return score

In [35]:
basic_prob_score = full_dev_basic_pipeline("basic_dev.csv","train.txt","dev_test.txt","dev_answers.csv",2,10)

In [36]:
print(basic_prob_score)

0.21733825302325682


In [25]:
def full_dev_pipeline(output_name,train,dev_test,dev_answers,N,percent_split,weights):
    train_data = prepare_data(train,N)
    vocab = create_vocab()
    new_train = train_dev_split(train_data, percent_split, vocab, dev_test, dev_answers)
    model = train_model(new_train,N)
    choices = prepare_test_data(dev_test,N)
    results = evaluate_interp_model(model,choices,weights)
    success = write_output(output_name,results)
    score = evaluate_results(output_name,dev_answers)
    return score

In [44]:
def full_dev_backoff_pipeline(output_name,train,dev_test,dev_answers,N,percent_split):
    train_data = prepare_data(train,N)
    vocab = create_vocab()
    new_train = train_dev_split(train_data, percent_split, vocab, dev_test, dev_answers)
    model = train_model(new_train,N)
    choices = prepare_test_data(dev_test,N)
    results = evaluate_backoff_model(model,choices)
    success = write_output(output_name,results)
    score = evaluate_results(output_name,dev_answers)
    return score

In [45]:
backoff_prob_score = full_dev_backoff_pipeline("backoff_dev.csv","train.txt","dev_test.txt","dev_answers.csv",2,10)

In [46]:
print(backoff_prob_score)

0.19763851595243404


In [47]:
backoff_prob_score = full_dev_backoff_pipeline("backoff_dev.csv","train.txt","dev_test.txt","dev_answers.csv",3,10)

In [48]:
print(backoff_prob_score)

0.1634193353534091


In [49]:
backoff_prob_score = full_dev_backoff_pipeline("backoff_dev.csv","train.txt","dev_test.txt","dev_answers.csv",4,10)

In [50]:
print(backoff_prob_score)

0.16997126864954332


In [55]:
interp_score = full_dev_pipeline("interp_dev.csv","train.txt","dev_test.txt","dev_answers.csv",4,10,[0.2, 0.7, 0.1, 0])

In [56]:
print(interp_score)

0.1663743322351299


In [33]:
# for x in np.arange(0,1.1,0.1):
#     if x == 1:
#         weights = [x,0,0]
#         print(weights,full_dev_pipeline("a.csv","train.txt","dev_test.txt","dev_answers.csv",3,10,weights))
#     for y in np.arange (0,1.05-x,0.1):
#         z = 1-y-x
#         weights = [x,y,z]
#         print(weights,full_dev_pipeline("a.csv","train.txt","dev_test.txt","dev_answers.csv",3,10,weights))

In [34]:
# model,choices,output_name,dev_answers = half_dev_pipeline("a.csv","train.txt","dev_test.txt","dev_answers.csv",5,10)
# for x in np.arange(0.1,1.1,0.1):
#     if x == 1:
#         weights = [x,0,0,0,0]
#         print(weights, rest_of_dev(model,choices,output_name,dev_answers,weights))
#     for y in np.arange (0,1.05-x,0.1):
#         if y ==1:
#             weights = [0,y,0,0,0]
#             print(weights,rest_of_dev(model,choices,output_name,dev_answers,weights))
#         for z in np.arange(0,1.05-x-y,0.1):
#             if z ==1:
#                 weights = [0,0,z,0,0]
#                 print(weights,rest_of_dev(model,choices,output_name,dev_answers,weights))
#             for a in np.arange(0,1.05-x-y-z,0.1):
#                 b = 1-x-y-z-a
#                 weights = [x,y,z,a,b]
#                 print(weights,rest_of_dev(model,choices,output_name,dev_answers,weights))

In [35]:
#full_dev_pipeline("a.csv","train.txt","dev_test.txt","dev_answers.csv",4,10,[0.2,0.7,0.1,0])

In [27]:
def half_dev_pipeline(output_name,train,dev_test,dev_answers,N,percent_split):
    train_data = prepare_data(train,N)
    vocab = create_vocab()
    new_train = train_dev_split(train_data, percent_split, vocab, dev_test, dev_answers)
    model = train_model(new_train,N)
    choices = prepare_test_data(dev_test,N)
    return model,choices,output_name,dev_answers

In [28]:
def rest_of_dev_interp(model,choices,output_name,dev_answers,weights):
    results = evaluate_interp_model(model,choices,weights)
    success = write_output(output_name,results)
    score = evaluate_results(output_name,dev_answers)
    return score

In [29]:
def rest_of_dev_backoff(model,choices,output_name,dev_answers,delta):
    results = evaluate_backoff_model(model,choices,delta)
    success = write_output(output_name,results)
    score = evaluate_results(output_name,dev_answers)
    return score

In [30]:
def rest_of_dev_basic(model,choices,output_name,dev_answers):
    results = evaluate_backoff_model(model,choices,delta)
    success = write_output(output_name,results)
    score = evaluate_results(output_name,dev_answers)
    return score

In [263]:
model,choices,output_name,dev_answers = half_dev_pipeline("a.csv","train.txt","dev_test.txt","dev_answers.csv",4,10)

In [267]:
weights = [0.2, .7, 0.1, 0]
#weights = [.4, .6, 0]

In [268]:
#print("backoff: ",rest_of_dev_backoff(model,choices,output_name,dev_answers,1))
print("interpo: ",rest_of_dev_interp(model,choices,output_name,dev_answers,weights))
#print("basic: ",rest_of_dev_basic(model,choices,output_name,dev_answers))

interpo:  0.16426568463277524


### Try Bagging

In [31]:
def bootstrap_resample(train_data):
    train_size = len(train_data)
    new_train = []
    for x in range(0,len(train_data)):
        new_train.append(train_data[math.floor(random.random()*train_size)])
    return new_train

In [32]:
def bagging_test(output_name,train,dev_test,dev_answers,N,percent_split,weights,bags):
    train_data = prepare_data(train,N)
    vocab = create_vocab()
    new_train = train_dev_split(train_data, percent_split, vocab, dev_test, dev_answers)
    overall_results_array = []
    mean_array = [['ID','Expected']]
    choices = prepare_test_data(dev_test,N)
    for bag in range(0,bags):
        bootstrapped_train = bootstrap_resample(new_train)
        model = train_model(bootstrapped_train,N)
        results = evaluate_interp_model(model,choices,weights)
        overall_results_array.append(results)
    for x in range(1,len(results)):
        mean = 0
        for result_array in overall_results_array:
            mean += result_array[x][1]
        mean = mean/len(overall_results_array)
        mean_array.append([x,mean])
    success = write_output(output_name,mean_array)
    score = evaluate_results(output_name,dev_answers)
    return score

In [85]:
score = bagging_test("a.csv","train.txt","dev_test.txt","dev_answers.csv",4,10,[.2, .7, .1, 0],5)

In [86]:
score

0.16990262608403892

In [33]:
def bagging_real(output_name,train,test,N,weights,bags):
    train_data = prepare_data(train,N)
    vocab = create_vocab()
    overall_results_array = []
    mean_array = [['ID','Expected']]
    choices = prepare_test_data(test,N)
    for bag in range(0,bags):
        print(bag)
        bootstrapped_train = bootstrap_resample(train_data)
        model = train_model(bootstrapped_train,N)
        results = evaluate_interp_model(model,choices,weights)
        overall_results_array.append(results)
    for x in range(1,len(results)):
        mean = 0
        for result_array in overall_results_array:
            mean += result_array[x][1]
        mean = mean/len(overall_results_array)
        mean_array.append([x,mean])
    success = write_output(output_name,mean_array)
    return output_name

In [87]:
bagging_real("bagging_quad.csv","train.txt","test.txt",4,[0.2,0.7,0.1,0],20)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


'bagging_quad.csv'

In [34]:
def compile_best_solutions(names):
    all_predictions = []
    mean_array = [['ID','Expected']]
    pred_len = 0
    for name in names:
        with open(name) as file:
            reader = csv.reader(file)
            predictions = list(reader)
            all_predictions.append(predictions)
            pred_len = len(predictions)
    for x in range(1,pred_len):
        mean = 0
        for ind,pred_list in enumerate(all_predictions):
            mean += float(pred_list[x][1])
        mean = mean/len(all_predictions)
        mean_array.append([x,mean])
    
    return mean_array

In [12]:
names = ['optimized_weight_hexagram.csv','it_was_the_wrong_index.csv','a_very_bold_trigram.csv']

In [13]:
results = compile_best_solutions(names)

In [14]:
results

[['ID', 'Expected'],
 [1, 0.9902526772431731],
 [2, 0.9495254390449303],
 [3, 0.9411510586012839],
 [4, 0.9559690271916738],
 [5, 0.9658400650826944],
 [6, 0.9950842074229737],
 [7, 0.9389658603849961],
 [8, 0.9877266084817031],
 [9, 0.9870176006521195],
 [10, 0.9364079600423568],
 [11, 0.9766737176103147],
 [12, 0.9723840742544253],
 [13, 0.9506495490478244],
 [14, 0.9736946227633769],
 [15, 0.8839469367993612],
 [16, 0.9940421472112305],
 [17, 0.9799335029948293],
 [18, 0.2839716664019319],
 [19, 0.9560887692842396],
 [20, 0.9878330523390538],
 [21, 0.9799378921526468],
 [22, 0.9835057394802732],
 [23, 0.9740191527770645],
 [24, 0.9941199842399434],
 [25, 0.9946203553703828],
 [26, 0.5956591924601861],
 [27, 0.9821497610433632],
 [28, 0.9750474267279904],
 [29, 0.9355650514103656],
 [30, 0.9901492959492525],
 [31, 0.995169496393053],
 [32, 0.9331295227847091],
 [33, 0.9619179416051186],
 [34, 0.9923145361365213],
 [35, 0.982560911074112],
 [36, 0.9814080967484001],
 [37, 0.9904359379

In [15]:
write_output('combined_prob_and_log_reg_no_quadgram.csv',results)

1