In [9]:
import math
import pandas as pd
import numpy as np
import re
import time
import unicodedata as ud
import csv
import random
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from gensim.models import Word2Vec

# Import training data for vectorizing

We start by converting the given sentences to an array of word arrays, with an appropriate number of pre and post-string characters

In [10]:
def file_to_split_sentences(filename, N):
    pre_string = "<s>"
    post_string = "</s>"
    train_dat = []
        
    with open(filename, encoding="utf8") as file:
        for line in file:
            line = ud.normalize("NFC",line)
            line = re.sub('[,.?"“”]','',line)
            line = re.sub('\s+',' ',line)
            line = line.lower()
            split_line = line.strip().split()
            for gram in range(1,N):
                split_line.insert(0,pre_string)
                split_line.append(post_string)
            train_dat.append(split_line)
    return train_dat

In [11]:
#split_train= file_to_split_sentences('train.txt',5)

Next, we use the word2Vec program to find the 10,000 most common words and store this as 'set_vocab' 
The length of the vocab is checked to ensure it is close to 10000

In [12]:
model=Word2Vec(split_train, size = 200, max_final_vocab=10000)
set_vocab = model.wv.vocab
print(len(set_vocab))

NameError: name 'split_train' is not defined

We then want to modify the training array to replace unkown words with the "UNK" character and then retrain the word2vec model

In [13]:
def vocab_preprocess(split_file,set_vocab,vocab_pairs=[]):
    new_train_vec=[]
    for line in split_file:
        new_line=[]
        for word in line:
            if word in set_vocab or word in vocab_pairs:
                new_line.append(word)
            else:
                new_line.append('<UNK>')
        new_train_vec.append(new_line)
    return new_train_vec

In [14]:
new_train_vec = vocab_preprocess(split_train,set_vocab)
#Uncomment line below to check for proper tokenization
new_train_vec[0]
new_model=Word2Vec(new_train_vec, size = 200, min_count=1)
#verify properties of the new model
print(new_model)

NameError: name 'split_train' is not defined

### Split the data by vocab

In [15]:
# function to determine which vocab word an index is in the training data
def which_vocab(index):
    if index < 24200:
        return 0, 24200
    elif index < 48400:
        return 1, 24200
    elif index < 51290:
        return 2, 2890
    elif index < 75490:
        return 3, 24200
    elif index < 99690:
        return 4, 24200
    elif index < 123890:
        return 5, 24200
    elif index < 131159:
        return 6, 7269
    elif index < 155359:
        return 7, 24200
    elif index < 179559:
        return 8, 24200
    elif index < 203759:
        return 9, 24200
    elif index < 227959:
        return 10, 24200
    elif index < 252159:
        return 11, 24200
    elif index < 258227:
        return 12, 6068
    elif index < 282427:
        return 13, 24200
    elif index < 306627:
        return 14, 24200
    elif index < 310023:
        return 15, 3396
    elif index < 334223:
        return 16, 24200
    elif index < 358423:
        return 17, 24200
    elif index < 382623:
        return 18, 24200
    elif index < 406823:
        return 19, 24200
    elif index < 418928:
        return 20, 12105
    elif index < 430425:
        return 21, 11497
    elif index < 446988:
        return 22, 16563
    elif index < 452037:
        return 23, 5049
    elif index < 456571:
        return 24, 4534

#### Import Vocab

In [16]:
def create_vocab():
    with open('vocab.csv') as file:
        reader = csv.reader(file)
        vocab = list(reader)
    return vocab

In [17]:
vocab = create_vocab()

Then split the data by the possible vocab words

In [18]:
def split_by_vocab(train_array):
    vocab_train_array = []
    count = 1
    vocab_word = 0
    temp_train=[]
    for index,training_line in enumerate(train_array):
        new_vocab_word, vocab_count = which_vocab(index)
        if new_vocab_word != vocab_word:
            vocab_train_array.append(temp_train)
            temp_train = []
            vocab_word=new_vocab_word
        temp_train.append(training_line)
    vocab_train_array.append(temp_train)
    return vocab_train_array

In [19]:
split_vocab_data = split_by_vocab(split_train)
#uncomment below to check for correct split
for x,array in enumerate(split_vocab_data):
    print(vocab[x], len(array))

NameError: name 'split_train' is not defined

First we define a function to check if a word has lenition or eclipsis

In [20]:
def check_mutations(word):
    lenition =0
    eclipsis = 0
    eclipsis_list=('mb','gc','nd','ng','bhf','bp','dt','n-a','n-á','n-e','n-é','n-i','n-í','n-o','n-ó','n-u','n-ú','n-y')
            
    if word.startswith(eclipsis_list):
        eclipsis=1
    elif len(word)>2 and word[1]=='h':
        lenition=1
    
    return (lenition,eclipsis)

We then want to create labelled examples from the training data

In [21]:
def train_to_labeled(single_vocab_array, vocab, N, mutations=True):
    labeled_train = []
    for index,training_line in enumerate(single_vocab_array):
        label = -1
        target_word = -1
        if vocab[0] in training_line:
            label = 1
            target_word = training_line.index(vocab[0])
        elif vocab[1] in training_line:
            label = 0
            target_word = training_line.index(vocab[1])
            
        if label==-1 or target_word ==-1:
            print('error, desired vocab not found')
            return 0
        
        # set number of previous and following words to capture
        max_previous = target_word - math.ceil(N/2)
        max_forward = target_word + math.floor(N/2)
        ngram = [training_line[x] for x in range(max_previous,max_forward) if x != target_word]
        if mutations:
            lenition,eclipsis=check_mutations(ngram[math.ceil(len(ngram)/2)])
            ngram.append(lenition)
            ngram.append(eclipsis)
        ngram.append(label)
        labeled_train.append(ngram)
    
    return labeled_train

In [22]:
labeled_data = train_to_labeled(split_vocab_data[0],vocab[0],5)
#uncomment to check for proper labeled data
labeled_data[1]

NameError: name 'split_vocab_data' is not defined

Then the labelled data needs to be vectorized. This returns a vector of vector_size x (N_gram-1) dimensions

In [23]:
def vectorize_ngram(labeled_data, word2vec_model, set_vocab,vocab_pairs=[]):
    vectorized_list=[]
    labels=[]
    for line in labeled_data:
        label = line[-1]
        eclipsis=line[-2]
        lenition=line[-3]
        vectorized_data = []
        for x in line[0:-3]:
            if x not in word2vec_model.wv:
                x = '<UNK>'
            if x in word2vec_model.wv or x in vocab_pairs:
                vectorized_data.extend(word2vec_model.wv[x])
            else:
                vectorized_data.extend(np.zeros(model.trainables.layer1_size))
        vectorized_data.append(lenition)
        vectorized_data.append(eclipsis)
            
        vectorized_list.append(vectorized_data)
        labels.append(label)
    return (vectorized_list,labels)

In [24]:
X,y = vectorize_ngram(labeled_data,new_model,set_vocab)

NameError: name 'labeled_data' is not defined

In [25]:
len(X[1])

NameError: name 'X' is not defined

In [26]:
def vectorize_rnn_ngram(labeled_data, word2vec_model, set_vocab,vocab_pairs=[]):
    vectorized_list=[]
    labels=[]
    for line in labeled_data:
        label = line[-1]
        vectorized_data = []
        for x in line[0:-1]:
            if x not in word2vec_model.wv:
                x = '<UNK>'
            if x in word2vec_model.wv or x in vocab_pairs:
                vectorized_data.append(word2vec_model.wv[x])
            else:
                vectorized_data.append(np.zeros(model.trainables.layer1_size))      
        vectorized_list.append(vectorized_data)
        labels.append(label)
    return (vectorized_list,labels)

Now that the data is properly vectorized, we want to split into a training and development set

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
#uncomment to check for proper lengths of test and training set
print(len(X),len(X_train),len(X_test))

NameError: name 'X' is not defined

## Combine the above preprocessing steps into a single function

In [28]:
def train_data_prep(filename,N,vec_size,max_vocab):
    split_train=file_to_split_sentences(filename,N)
    model=Word2Vec(split_train,size=vec_size,max_final_vocab=max_vocab)
    set_vocab=model.wv.vocab
    new_train_vec=vocab_preprocess(split_train,set_vocab)
    new_model=Word2Vec(new_train_vec,size=vec_size,max_final_vocab=max_vocab)
    vocab = create_vocab()
    split_vocab_data = split_by_vocab(split_train)
    X_train_list=[]
    X_dev_list=[]
    y_train_list=[]
    y_dev_list=[]
    for index,vocab_data in enumerate(split_vocab_data):
        labeled_data=train_to_labeled(vocab_data,vocab[index],N)
        X,y = vectorize_ngram(labeled_data,new_model,set_vocab)
        X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.20, random_state=42)
        X_train_list.append(X_train)
        X_dev_list.append(X_dev)
        y_train_list.append(y_train)
        y_dev_list.append(y_dev)
    return (X_train_list,X_dev_list,y_train_list,y_dev_list)

In [33]:
X_train_list,X_dev_list,y_train_list,y_dev_list = train_data_prep('train.txt',5,200,10000)

This is a new pipeline for training data to be fed to an RNN

In [29]:
def train_rnn_data_prep(filename,N,vec_size,max_vocab):
    split_train=file_to_split_sentences(filename,N)
    model=Word2Vec(split_train,size=vec_size,max_final_vocab=max_vocab)
    set_vocab=model.wv.vocab
    new_train_vec=vocab_preprocess(split_train,set_vocab)
    new_model=Word2Vec(new_train_vec,size=vec_size,max_final_vocab=max_vocab)
    vocab = create_vocab()
    split_vocab_data = split_by_vocab(split_train)
    X_train_list=[]
    X_dev_list=[]
    y_train_list=[]
    y_dev_list=[]
    for index,vocab_data in enumerate(split_vocab_data):
        labeled_data=train_to_labeled(vocab_data,vocab[index],N,mutations=False)
        X,y = vectorize_rnn_ngram(labeled_data,new_model,set_vocab)
        X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.20, random_state=42)
        X_train_list.append(X_train)
        X_dev_list.append(X_dev)
        y_train_list.append(y_train)
        y_dev_list.append(y_dev)
    return (X_train_list,X_dev_list,y_train_list,y_dev_list)

In [30]:
X_train_rnn,X_dev_rnn,y_train_rnn,y_dev_rnn = train_rnn_data_prep('train.txt',5,200,10000)

## Starting to apply ML algorithms

Now that we have vectorized data, we can start applying ML algorithms using sklearn. Note, we need to train 25 individual classifiers- one for each of the vocab words

In [34]:
#Define function to set X,y train and dev
def get_single_vocab(X_train_list,X_dev_list,y_train_list,y_dev_list,index=0):
    return (X_train_list[index],X_dev_list[index],y_train_list[index],y_dev_list[index])

In [37]:
X_train,X_dev,y_train,y_dev=get_single_vocab(X_train_list,X_dev_list,y_train_list,y_dev_list,2)

First, lets start with a logistic regressor

In [38]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=0.001,random_state=0, solver='lbfgs')
clf.fit(X_train, y_train)
y_pred=clf.predict_proba(X_dev)
#evaluate performance using log_loss
print(log_loss(y_dev,y_pred))

0.2948328172350819


If evaluation on one vocab word looks good, lets try it on all vocab

In [44]:
# define a function to test all of the vocab given a model
def test_all_vocab(classifier):
    import statistics
    y_dev_losses=[]
    y_train_losses=[]
    for x in range(0,25):
        X_train,X_dev,y_train,y_dev=get_single_vocab(X_train_list,X_dev_list,y_train_list,y_dev_list,x)
        classifier.fit(X_train,y_train)
        y_pred = classifier.predict_proba(X_dev)
        y_train_pred = classifier.predict_proba(X_train)
        #evaluate performance using log_loss
        train_loss=log_loss(y_train,y_train_pred)
        dev_loss=log_loss(y_dev,y_pred)
        y_dev_losses.append(dev_loss)
        y_train_losses.append(train_loss)
        
        print('Train loss: ',train_loss,' Val loss: ',dev_loss)
    print('Train loss: ',statistics.mean(y_train_losses),' Val loss: ',statistics.mean(y_dev_losses))

In [49]:
clf = LogisticRegression(C=0.001,random_state=0, solver='lbfgs')
test_all_vocab(clf)

Train loss:  0.05926530865696404  Val loss:  0.06897391022344879
Train loss:  0.0511431767767427  Val loss:  0.047403833913009764
Train loss:  0.25843892366018606  Val loss:  0.2948328172350819
Train loss:  0.0902894573080811  Val loss:  0.08471233320408933
Train loss:  0.034367420244572434  Val loss:  0.03928544736386444
Train loss:  0.0668311699897007  Val loss:  0.06250380816839228
Train loss:  0.2847768720033905  Val loss:  0.32178525529667645
Train loss:  0.13820219544895246  Val loss:  0.1413046710539599
Train loss:  0.08477643321962271  Val loss:  0.09306980339354773
Train loss:  0.030256108037642077  Val loss:  0.036460564256765145
Train loss:  0.2657636969747682  Val loss:  0.2866241989376655
Train loss:  0.08108906425617594  Val loss:  0.09222721743653677
Train loss:  0.08740586806439218  Val loss:  0.10920184497923477
Train loss:  0.05081754612564946  Val loss:  0.04973762828894898
Train loss:  0.20932555006469156  Val loss:  0.21056522584316295
Train loss:  0.40607723866542

Let's also take a peek at the predictions the model is making.
If these values are too close to 1 or 0, it may indicate overfitting

In [None]:
print(y_pred[0:10])

Logistic regression looks pretty good, so lets see how logistic regression with CV performs

In [None]:
X_train,X_dev,y_train,y_dev=get_single_vocab(X_train_list,X_dev_list,y_train_list,y_dev_list,2)
from sklearn.linear_model import LogisticRegressionCV
clf = LogisticRegressionCV(Cs=np.arange(0.001,0.01,0.001),cv=5,random_state=0, solver='lbfgs',max_iter=200, scoring='neg_log_loss')
clf.fit(X_train, y_train)
y_pred=clf.predict_proba(X_dev)
#evaluate performance using log_loss
print(log_loss(y_dev,y_pred))
print(clf.C_)

In [None]:
clf.scores_

Again, this looks reasonable, so let's try it with all the vocab

In [None]:
clf = LogisticRegressionCV(Cs=[0.00001,0.0001,0.001],cv=5,random_state=0, solver='lbfgs',max_iter=200)
for x in range(0,25):
    X_train,X_dev,y_train,y_dev=get_single_vocab(X_train_list,X_dev_list,y_train_list,y_dev_list,x)
    clf.fit(X_train, y_train)
    y_pred=clf.predict_proba(X_dev)
    #evaluate performance using log_loss
    print(log_loss(y_dev,y_pred),clf.C_)

* 0.0642826738215069 | 0.00599484
* 0.04229903577299008 | 0.00599484
* 0.249076453782974 | 0.00599484
* 0.08190084583878704 | 0.00599484
* 0.03096742615321697 | 0.04641589
* 0.044940145400178114 | 0.04641589
* 0.2887409855573007 | 0.04641589
* 0.12843203781524062 | 0.00599484
* 0.08706723982279557 | 0.00599484
* 0.02103945686975264 | 0.04641589
* 0.28598412543703405 | 0.00599484
* 0.07827273100206689 | 0.00599484
* 0.10526719899321839 | 0.00599484
* 0.03778233770808103 | 0.04641589
* 0.20574933511333274 | 0.00599484
* 0.4577377917069274 | 0.0464158
* 0.1494325625192918 | 0.04641589
* 0.06065848401236185 | 0.00599484
* 0.04519150781077979 | 0.04641589
* 0.28603926814517255 | 0.00599484
* 0.14627178136852173 | 0.00599484
* 0.19339296575800236 | 0.00599484
* 0.08455027218569924 | 0.00599484
* 0.11692524801390788 | 0.00599484
* 0.09163245328102149 | 0.00077426


Looks like 0.006 is a reasonable choice for C, and we can stick with just plain linear regression

Next lets try a naive_bayes model

In [59]:
X_train,X_dev,y_train,y_dev=get_single_vocab(X_train_list,X_dev_list,y_train_list,y_dev_list,2)
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict_proba(X_dev)
Y_train_pred=gnb.predict_proba(X_train)

In [60]:
print(log_loss(y_dev,y_pred))

2.5545733030767206


In [61]:
print(log_loss(y_train,Y_train_pred))

2.4572312020490226


This model is just a poor fit for this data

Lets give a random forest a try

In [65]:
X_train,X_dev,y_train,y_dev=get_single_vocab(X_train_list,X_dev_list,y_train_list,y_dev_list,2)
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=1000, max_depth=4,random_state=0)
rfc.fit(X_train,y_train)
y_train_pred=rfc.predict_proba(X_train)
y_pred = rfc.predict_proba(X_dev)

In [66]:
print(log_loss(y_train,y_train_pred))
print(log_loss(y_dev,y_pred))

0.3395842600818164
0.36933733956990483


Once again, this seems like a promising option, so lets try on all vocab

In [67]:
rfc = RandomForestClassifier(n_estimators=100, max_depth=4,random_state=0)
test_all_vocab(rfc)

Train loss:  0.07729325174759674  Val loss:  0.0889130517898909
Train loss:  0.057677241105136993  Val loss:  0.054410865651567754
Train loss:  0.34123919444475176  Val loss:  0.3682932570420379
Train loss:  0.10067228716088573  Val loss:  0.09744645573457894
Train loss:  0.04133242650791395  Val loss:  0.045515030557748984
Train loss:  0.09788706554453475  Val loss:  0.08896222534068549
Train loss:  0.3682677275683832  Val loss:  0.3852213233709328
Train loss:  0.26738266335208793  Val loss:  0.26324465945315884
Train loss:  0.10842978114980512  Val loss:  0.11633480548777697
Train loss:  0.052866802168460035  Val loss:  0.06298578553478376
Train loss:  0.2991935974151097  Val loss:  0.3104345083179092
Train loss:  0.12170242873848708  Val loss:  0.12709269918970287
Train loss:  0.08417111710599486  Val loss:  0.11114224108008605
Train loss:  0.07257755760037701  Val loss:  0.07281442802186473
Train loss:  0.29224022410996536  Val loss:  0.2880289634083919
Train loss:  0.4902269943335

Again, this looks pretty good with minimal effort, so it may be good to explore further. Logistic regression seems to still have it beat though

Lets try out an extra trees classifier

In [76]:
X_train,X_dev,y_train,y_dev=get_single_vocab(X_train_list,X_dev_list,y_train_list,y_dev_list,2)
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier(n_estimators=1000, max_depth=2,random_state=0)
etc.fit(X_train,y_train)
y_train_pred=etc.predict_proba(X_train)
y_pred = etc.predict_proba(X_dev)

In [77]:
print(log_loss(y_train,y_train_pred))
print(log_loss(y_dev,y_pred))

0.573918133441894
0.5807373699481648


Not bad, let's give this a go also

In [None]:
etc = RandomForestClassifier(n_estimators=100, max_depth=4,random_state=0)
test_all_vocab(etc)

This also has relatively good performance!

Finally, lets give an SVM a try

In [79]:
X_train,X_dev,y_train,y_dev=get_single_vocab(X_train_list,X_dev_list,y_train_list,y_dev_list,2)
from sklearn.svm import SVC
#from sklearn.preprocessing import StandardScaler
#scaler=StandardScaler()
svm = SVC(C=0.001,gamma='auto',probability=True,random_state=0)
#svm.fit(scaler.fit_transform(X_train),y_train)
svm.fit(X_train,y_train)
y_train_pred=svm.predict_proba(X_train)
y_pred = svm.predict_proba(X_dev)

In [80]:
print(log_loss(y_train,y_train_pred))
print(log_loss(y_dev,y_pred))

0.3754262517132494
0.4201871857408247


Not bad, worth a shot

In [82]:
# svm = SVC(C=0.0001,gamma='auto',probability=True,random_state=0)
# test_all_vocab(svm)

### Neural Networks

For fun, lets try a very basic neural network

In [None]:
np.reshape(X_train[0],(1,-1))

In [173]:
X_train,X_dev,y_train,y_dev=get_single_vocab(X_train_list,X_dev_list,y_train_list,y_dev_list,2)
from keras.models import Sequential
from keras.layers import Dense, Dropout
keras_model = Sequential()
keras_model.add(Dense(300, input_shape = (len(X_train[0]),),activation='relu'))
keras_model.add(Dropout(0.9)) 
keras_model.add(Dense(300, activation='relu'))
keras_model.add(Dense(1, activation='sigmoid'))
keras_model.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['accuracy'])

In [174]:
keras_model.fit(np.array(X_train), np.array(y_train), epochs=10,validation_data=(np.array(X_dev),np.array(y_dev)))

Train on 2312 samples, validate on 578 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f59431b1b00>

In [168]:
train_scores = scores = keras_model.evaluate(np.array(X_train),np.array(y_train))
dev_scores = keras_model.evaluate(np.array(X_dev),np.array(y_dev))



In [155]:
print(train_scores,dev_scores)

[0.18341195347451422, 0.9662629757785467] [0.32745241108237666, 0.9100346020761245]


Get all of the rnn data

In [161]:
X_train_rnn,X_dev_rnn,y_train_rnn,y_dev_rnn = train_rnn_data_prep('train.txt',5,200,10000)

In [162]:
X_train,X_dev,y_train,y_dev=get_single_vocab(X_train_rnn,X_dev_rnn,y_train_rnn,y_dev_rnn,2)
print(np.array(X_train).shape)
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Embedding, Flatten
rnn_model = Sequential()
#rnn_model.add(Dense(128, input_shape = (len(X_train[0]),),activation='relu'))
rnn_model.add(LSTM(100,input_shape=(len(X_train[1]),len(X_train[0][0])),return_sequences=True,dropout=0.7,recurrent_dropout=0.7))
rnn_model.add(LSTM(50,return_sequences=True,dropout=0.7,recurrent_dropout=0.7))
rnn_model.add(LSTM(10,dropout=0.7,recurrent_dropout=0.7))
#rnn_model.add(Flatten())
rnn_model.add(Dense(1, activation='sigmoid'))
rnn_model.compile(loss='binary_crossentropy',optimizer='adam')

(2312, 4, 200)


In [163]:
rnn_model.fit(np.array(X_train), np.array(y_train), epochs=10, batch_size=5, validation_data=((np.array(X_dev),np.array(y_dev))))

Train on 2312 samples, validate on 578 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f595c48eeb8>

In [176]:
X_train,X_dev,y_train,y_dev=get_single_vocab(X_train_rnn,X_dev_rnn,y_train_rnn,y_dev_rnn,2)
print(np.array(X_train).shape)
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Embedding, Flatten
rnn_model = Sequential()
#rnn_model.add(Dense(128, input_shape = (len(X_train[0]),),activation='relu'))
rnn_model.add(LSTM(30,input_shape=(len(X_train[1]),len(X_train[0][0])),dropout=0.7,recurrent_dropout=0.7))
rnn_model.add(Dense(40, activation='tanh'))
#rnn_model.add(Flatten())
rnn_model.add(Dense(1, activation='sigmoid'))
rnn_model.compile(loss='binary_crossentropy',optimizer='adam')

(2312, 4, 200)


In [177]:
rnn_model.fit(np.array(X_train), np.array(y_train), epochs=10, batch_size=5, validation_data=((np.array(X_dev),np.array(y_dev))))

Train on 2312 samples, validate on 578 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f59428c60b8>

In [178]:
train_scores = scores = rnn_model.evaluate(np.array(X_train),np.array(y_train))
dev_scores = rnn_model.evaluate(np.array(X_dev),np.array(y_dev))



In [179]:
print(train_scores,dev_scores)

0.16922360038623266 0.28020085830718383


In [144]:
X_train,X_dev,y_train,y_dev=get_single_vocab(X_train_rnn,X_dev_rnn,y_train_rnn,y_dev_rnn,2)
print(np.array(X_train).shape)
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Embedding, Flatten
rnn_model = Sequential()
rnn_model.add(LSTM(8,input_shape=(len(X_train[1]),len(X_train[0][0])),dropout=0.7,recurrent_dropout=0.7,return_sequences=True))
rnn_model.add(LSTM(8,dropout=0.7,recurrent_dropout=0.7))
rnn_model.add(Dense(1, activation='sigmoid'))
rnn_model.compile(loss='binary_crossentropy',optimizer='adam')

(2312, 6, 300)


In [145]:
rnn_model.fit(np.array(X_train), np.array(y_train), epochs=50, batch_size=5, validation_data=((np.array(X_dev),np.array(y_dev))))

Train on 2312 samples, validate on 578 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f9cd84a9550>

Let's run this through all the training

In [1]:
def train_rnn_all(X_train_list,X_dev_list,y_train_list,y_dev_list,vocab):
    y_predictions = []
    y_tests = []
    models=[]
    for x in range(0,len(vocab)):
        X_train,X_dev,y_train,y_dev=get_single_vocab(X_train_list,X_dev_list,y_train_list,y_dev_list,x)
        rnn_model = Sequential()
        rnn_model.add(LSTM(8,input_shape=(len(X_train[1]),len(X_train[0][0])),dropout=0.7,recurrent_dropout=0.7,return_sequences=True))
        rnn_model.add(LSTM(8,dropout=0.7,recurrent_dropout=0.7))
        rnn_model.add(Dense(1, activation='sigmoid'))
        rnn_model.compile(loss='binary_crossentropy',optimizer='adam')
        rnn.fit(np.array(X_train),np.array(y_train))
        models.append(rnn)
        y_pred = rnn.predict(np.array(X_dev))
        y_predictions.extend(y_pred)
        y_tests.extend(y_dev)
    return (y_tests,y_predictions,models)

In [2]:
y_rnn_test,y_rnn_pred,rnn_models=train_rnn_all(X_train_rnn,X_dev_rnn,y_train_rnn,y_dev_rnn)

NameError: name 'X_train_rnn' is not defined

In [152]:
X_train,X_dev,y_train,y_dev=get_single_vocab(X_train_rnn,X_dev_rnn,y_train_rnn,y_dev_rnn,15)
print(np.array(X_train).shape)
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Embedding, Flatten
rnn_model = Sequential()
rnn_model.add(LSTM(8,input_shape=(len(X_train[1]),len(X_train[0][0])),dropout=0.7,recurrent_dropout=0.7,return_sequences=True))
rnn_model.add(LSTM(8,dropout=0.7,recurrent_dropout=0.7))
#rnn_model.add(Flatten())
rnn_model.add(Dense(1, activation='sigmoid'))
rnn_model.compile(loss='binary_crossentropy',optimizer='adam')

(2716, 6, 300)


In [153]:
rnn_model.fit(np.array(X_train), np.array(y_train), epochs=100, batch_size=100, validation_data=((np.array(X_dev),np.array(y_dev))))

Train on 2716 samples, validate on 680 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100


Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7f9cd2d88240>

## Training on all the data

Let's write a quick function to train and store the models from all the data

In [None]:
def train_all(X_train_list,X_dev_list,y_train_list,y_dev_list,classifier,vocab):
    import pickle
    y_predictions = []
    y_tests = []
    models=[]
    for x in range(0,len(vocab)):
        X_train,X_dev,y_train,y_dev=get_single_vocab(X_train_list,X_dev_list,y_train_list,y_dev_list,x)
        classifier.fit(np.array(X_train),np.array(y_train))
        s = pickle.dumps(classifier)
        models.append(s)
        y_pred = classifier.predict_proba(np.array(X_dev))
        y_predictions.extend(y_pred)
        y_tests.extend(y_dev)
    return (y_tests,y_predictions,models)

In [None]:
clf = LogisticRegression(C=0.001,random_state=0, solver='lbfgs')
#rfc = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)
#clf = LogisticRegressionCV(Cs=[0.00001,0.0001,0.001],cv=5,random_state=0, solver='lbfgs',max_iter=200)
y_tests,y_pred,models=train_all(X_train_list,X_dev_list,y_train_list,y_dev_list,clf,vocab)
print(log_loss(y_tests,y_pred))

With models trained and stored in a list, we can then move on to processing the test data and applying these models

In [None]:
#get the pairs of vocab that appear in the test file
vocab_pairs = ['{'+ x[0]+'|'+x[1]+'}' for x in vocab]
#split the file to word arrays
test_data=file_to_split_sentences('test.txt',5)

In [None]:
modified_test_data[0]

In [None]:
# def split_test_by_vocab(test_data, vocab_pairs):
#     vocab_test_array = []
#     for x in range(0,len(vocab_pairs)):
#         vocab_test_array.append([])
#     for test_example in test_data:
#         for ind,word in enumerate(vocab_pairs):
#             if word in test_example:
#                 vocab_test_array[ind].append(test_example)
#     return vocab_test_array

In [None]:
# split_test_vocab = split_test_by_vocab(modified_test_data,vocab_pairs)

We need a function to convert the test data to ngrams, as it is in a different form than the training data

In [None]:
def test_to_ngram(test_array,vocab_pairs,N):
    ngram_test = []
    target_word = -1
    vocab_index = -1
    for test_example in test_array:
        for x in vocab_pairs:
            if x in test_example:
                target_word=test_example.index(x)
                vocab_index = vocab_pairs.index(x)
                break

        if target_word ==-1:
            print('error, desired vocab not found')
            return 0

        # set number of previous and following words to capture
        max_previous = target_word - math.ceil(N/2)
        max_forward = target_word + math.floor(N/2)
        ngram = [test_example[x] for x in range(max_previous,max_forward) if x != target_word]
        lenition,eclipsis=check_mutations(ngram[math.ceil(len(ngram)/2)])
        ngram.append(lenition)
        ngram.append(eclipsis)
        ngram.append(vocab_index)
        ngram_test.append(ngram)
    
    return ngram_test

In [None]:
ngram_test = test_to_ngram(modified_test_data,vocab_pairs,5)
ngram_test[1]

In [None]:
# def test_ngram_to_vec(ngram_test,word2vec_model):
#     vectorized_list=[]
#     for ngram in ngram_test:
#         vectorized_data = []
#         for word in ngram:
#             vectorized_data.extend(word2vec_model.wv[word]) 
#         vectorized_list.append(vectorized_data)
#     return vectorized_list

In [None]:
X_train_full = []
X_dev_full =[]
train_vocab_indices = []
dev_vocab_indices = []
y_train_full=[]
y_dev_full=[]
for x in range(0,len(vocab)):
    X_train,X_dev,y_train,y_dev=get_single_vocab(X_train_list,X_dev_list,y_train_list,y_dev_list,x)
    X_train_full.extend(X_train)
    X_dev_full.extend(X_dev)
    y_train_full.extend(y_train)
    y_dev_full.extend(y_dev)
    train_vocab_indices.extend([x]*len(X_train))
    dev_vocab_indices.extend([x]*len(X_dev))
print(len(X_train_full),len(train_vocab_indices))


In [None]:
def predict_full_train(X_test_full,vocab_indices,models):
    import pickle
    results = []
    for index,line in enumerate(X_test_full):
        clf = pickle.loads(models[vocab_indices[index]])
        results.append((clf.predict_proba(np.reshape(line,(1,-1)))[0][1]))
    return results    

In [None]:
# X_train,X_dev,y_train,y_dev=get_single_vocab(X_train_list,X_dev_list,y_train_list,y_dev_list,1)
# y_trial_pred = models[0].predict_proba(X_dev)
# log_loss(y_dev,y_trial_pred)
#X_train,X_dev,y_train,y_dev=get_single_vocab(X_train_list,X_dev_list,y_train_list,y_dev_list,0)

In [None]:
y_train_results = predict_full_train(X_train_full,train_vocab_indices,models)

In [None]:
len(y_train_results)

In [None]:
log_loss(y_train_full,y_train_results)

In [None]:
y_dev_results = predict_full_train(X_dev_full,dev_vocab_indices,models)

In [None]:
len(y_dev_results)

In [None]:
log_loss(y_dev_full,y_dev_results)

In [None]:
X_test_full,vocab_indices = vectorize_ngram(ngram_test,new_model)

In [None]:
# def predict_full(filename,vocab,set_vocab,new_model,classifier):
#     #get the pairs of vocab that appear in the test file
#     vocab_pairs = ['{'+ x[0]+'|'+x[1]+'}' for x in vocab]
#     #split the file to word arrays
#     test_data=file_to_split_sentences(filename,5)
#     #preprocess to add UNK token 
#     modified_test_data= vocab_preprocess(test_data,set_vocab,vocab_pairs)
#     split_test_vocab = split_test_by_vocab(modified_test_data,vocab_pairs)
#     for x in range(0,len(vocab_pairs)):
#         ngram_test= test_to_ngram(split_test_vocab[x],vocab_pairs[x],5)
#         X_test_single = test_ngram_to_vec(ngram_test,new_model)
#         y_pred = classifier.predict_proba(X_test_single)
#         #evaluate performance using log_loss
#         print('Test loss: ',log_loss(y_train,y_train_pred),' Val loss: ',log_loss(y_dev,y_pred))

In [None]:
def predict_full(X_test_full,vocab_indices,models):
    results = ['Id,Expected']
    for index,line in enumerate(X_test_full):
        clf = pickle.loads(models[vocab_indices[index]])
        results.append(str(index+1) + "," + str(clf.predict_proba(np.reshape(line, (1,-1)))[0][1]))
    return results    

In [None]:
y_results = predict_full(X_test_full,vocab_indices,models)

In [None]:
y_results[0:10]

In [None]:
def write_to_file(filename, results):
    out_file = open(filename,'w')
    for line in results:
        out_file.write(line+'\n')
    out_file.close()
    return 1

In [None]:
write_to_file('linearregCV.csv',y_results)