In [None]:
import math
import pandas as pd
import numpy as np
import re
import time
import unicodedata as ud
import csv
import random
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from gensim.models import Word2Vec

# Import training data for vectorizing

In [None]:
def prepare_vec_data(filename, N):
    """Read a text file and return one normalised, padded token list per line.

    Every line is NFC-normalised, stripped of the characters , . ? " “ ”,
    whitespace-collapsed, lower-cased and split on whitespace.  The resulting
    token list is padded with ``N - 1`` copies of ``<s>`` at the front and
    ``N - 1`` copies of ``</s>`` at the back.

    Parameters
    ----------
    filename : str
        Path of a UTF-8 encoded text file with one example per line.
    N : int
        Padding order; ``N - 1`` boundary markers are added on each side
        (no padding when ``N <= 1``).

    Returns
    -------
    list[list[str]]
        One token list per input line; a blank line yields only padding.
    """
    pre_string = "<s>"
    post_string = "</s>"
    # Raw strings: '\s' inside a plain literal is an invalid escape sequence.
    punctuation = re.compile(r'[,.?"“”]')
    whitespace = re.compile(r'\s+')
    pad = max(N - 1, 0)

    train_dat = []
    with open(filename, encoding="utf8") as file:
        for line in file:
            line = ud.normalize("NFC", line)
            line = punctuation.sub('', line)
            line = whitespace.sub(' ', line)
            tokens = line.lower().strip().split()
            train_dat.append([pre_string] * pad + tokens + [post_string] * pad)
    return train_dat

In [None]:
# def prepare_vec_data(filename):
#     train_dat = []
#     with open(filename, encoding="utf8") as file:
#         for line in file:
#             line = ud.normalize("NFC",line)
#             line = re.sub('[,.?"“”]','',line)
#             line = re.sub('\s+',' ',line)
#             split_line = line.strip().split()
#             train_dat.append(split_line)
#     return train_dat

In [None]:
vec_train= prepare_vec_data('train.txt',5)

#### Train Word2Vec model

In [None]:
model=Word2Vec(vec_train, size = 200, max_final_vocab=10000)

In [None]:
print(model)

In [None]:
set_vocab = model.wv.vocab

In [None]:
def vocab_preprocess(vec_train,set_vocab):
    """Return a copy of ``vec_train`` with out-of-vocabulary tokens masked.

    Every token of every line is kept when it is a member of ``set_vocab``
    and replaced by the literal ``'<UNK>'`` otherwise.  The input is not
    modified; line order and line lengths are preserved.
    """
    return [
        [token if token in set_vocab else '<UNK>' for token in sentence]
        for sentence in vec_train
    ]

In [None]:
new_train_vec = vocab_preprocess(vec_train,set_vocab)

In [None]:
new_train_vec[0]

In [None]:
new_model=Word2Vec(new_train_vec, size = 200, min_count=1)

In [None]:
print(new_model)

In [None]:
# NOTE(review): this persists `model` (the first Word2Vec, trained on vec_train),
# while the later cells vectorise with `new_model` — confirm which one should be saved.
model.save('model.bin')

### Split the data by vocab

In [None]:
# For each vocab pair, in training-data order: (exclusive upper bound of its
# index range, width of that range).  Ranges are contiguous, starting at 0.
_VOCAB_RANGES = (
    (24200, 24200),
    (48400, 24200),
    (51290, 2890),
    (75490, 24200),
    (99690, 24200),
    (123890, 24200),
    (131159, 7269),
    (155359, 24200),
    (179559, 24200),
    (203759, 24200),
    (227959, 24200),
    (252159, 24200),
    (258227, 6068),
    (282427, 24200),
    (306627, 24200),
    (310023, 3396),
    (334223, 24200),
    (358423, 24200),
    (382623, 24200),
    (406823, 24200),
    (418928, 12105),
    (430425, 11497),
    (446988, 16563),
    (452037, 5049),
    (456571, 4534),
)


def which_vocab(index):
    """Map a training-line index to the vocab pair whose range contains it.

    Parameters
    ----------
    index : int
        Position of a line in the training data.

    Returns
    -------
    tuple[int, int]
        ``(vocab_id, range_size)`` where ``vocab_id`` is the position of the
        range in ``_VOCAB_RANGES`` and ``range_size`` is its width.  Any index
        below the first bound (including negative values) maps to vocab 0.

    Raises
    ------
    ValueError
        If ``index`` is at or beyond the last upper bound.  Previously the
        function fell through and returned ``None``, which only surfaced
        later as an unpacking error in the caller.
    """
    for vocab_id, (upper_bound, range_size) in enumerate(_VOCAB_RANGES):
        if index < upper_bound:
            return vocab_id, range_size
    raise ValueError(
        'index %r is outside the known training ranges (must be < %d)'
        % (index, _VOCAB_RANGES[-1][0])
    )

#### Import Vocab

In [None]:
def create_vocab(filename='vocab.csv'):
    """Load the vocab CSV and return its rows.

    Parameters
    ----------
    filename : str, optional
        Path of the CSV file; defaults to ``'vocab.csv'`` so existing
        zero-argument calls keep working.

    Returns
    -------
    list[list[str]]
        One list of cell strings per CSV row, in file order.
    """
    # newline='' lets the csv module do its own line-ending handling, and the
    # explicit encoding matches how the training text is read (UTF-8) instead
    # of depending on the platform default.
    with open(filename, newline='', encoding="utf8") as file:
        return list(csv.reader(file))

In [None]:
vocab = create_vocab()

In [None]:
def split_by_vocab(train_array):
    """Partition ``train_array`` into consecutive groups, one per vocab id.

    The vocab id of each line is the first element returned by
    ``which_vocab`` for the line's position.  A new group is started every
    time that id differs from the id of the group being filled (the first
    group is assumed to have id 0).  The group being filled is always
    appended at the end, so an empty input yields ``[[]]``.
    """
    groups = []
    bucket = []
    current_id = 0
    for position, line in enumerate(train_array):
        vocab_id, _ = which_vocab(position)
        if vocab_id != current_id:
            # Close the finished group before starting the next one
            groups.append(bucket)
            bucket = []
            current_id = vocab_id
        bucket.append(line)
    groups.append(bucket)
    return groups

In [None]:
#check for correct split
split_vocab_data = split_by_vocab(new_train_vec)
for x,array in enumerate(split_vocab_data):
    print(vocab[x], len(array))

In [None]:
def train_to_labeled(single_vocab_array, vocab, N):
    """Turn training lines into context windows followed by a 0/1 label.

    For each line the first occurrence of ``vocab[0]`` (label 1) or, failing
    that, of ``vocab[1]`` (label 0) is taken as the target position.  The
    window holds the tokens at positions
    ``target - ceil(N/2)`` up to but excluding ``target + floor(N/2)``,
    without the target itself, i.e. ``N - 1`` tokens.  The label is appended
    as the last element.

    Returns the list of windows, or ``0`` (after printing a message) as soon
    as a line contains neither word of the pair.
    """
    labeled_train = []
    for training_line in single_vocab_array:
        if vocab[0] in training_line:
            label, target_word = 1, training_line.index(vocab[0])
        elif vocab[1] in training_line:
            label, target_word = 0, training_line.index(vocab[1])
        else:
            print('error, desired vocab not found')
            return 0

        # Positions are plain list indices: the caller's <s>/</s> padding is
        # what keeps them inside the line.
        start = target_word - math.ceil(N/2)
        stop = target_word + math.floor(N/2)
        context = [training_line[pos] for pos in range(start, stop) if pos != target_word]
        labeled_train.append(context + [label])

    return labeled_train

In [None]:
labeled_data = train_to_labeled(split_vocab_data[0],vocab[0],7)

In [None]:
labeled_data[1]

In [None]:
def vectorize_ngram(labeled_data, word2vec_model):
    """Convert token windows into flat embedding vectors plus their labels.

    Parameters
    ----------
    labeled_data : iterable of list
        Each item is a list of tokens whose last element is the label (or,
        for test data, the vocab index).
    word2vec_model
        Model exposing ``.wv`` with membership test, item lookup and
        ``vector_size``.

    Returns
    -------
    tuple[list[list[float]], list]
        ``(vectors, labels)``: for every input line the concatenation of the
        embeddings of its tokens, and the line's last element.  Tokens that
        are missing from ``word2vec_model.wv`` contribute a zero vector.
    """
    # Size the zero padding from the model that was passed in.  The previous
    # code read it from the global `model`, which is not necessarily the model
    # being used here.
    zero_vector = np.zeros(word2vec_model.wv.vector_size)

    vectorized_list = []
    labels = []
    for line in labeled_data:
        vectorized_data = []
        for token in line[0:-1]:
            if token in word2vec_model.wv:
                vectorized_data.extend(word2vec_model.wv[token])
            else:
                vectorized_data.extend(zero_vector)
        vectorized_list.append(vectorized_data)
        labels.append(line[-1])
    return (vectorized_list, labels)

In [None]:
X,y = vectorize_ngram(labeled_data,new_model)

In [None]:
print(X[1],y[1])

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# Apply the statistics learned on the training split; refitting on the test
# split would scale the two sets differently and leak test information.
X_test_scaled = scaler.transform(X_test)

In [None]:
print(len(X),len(X_train),len(X_test))


In [None]:
from sklearn.svm import LinearSVC
clf = LinearSVC(C=1.0,random_state=0, tol=1e-5)
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)
log_loss(y_test,y_pred)

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict_proba(X_test)

In [None]:
print(log_loss(y_test,y_pred))
print (y_test[0],y_pred[0])

In [None]:
from sklearn.svm import LinearSVR
svm_reg = LinearSVR(random_state=0, max_iter=50)
svm_reg.fit(X_train_scaled, y_train)

In [None]:
# LinearSVR is a regressor: it provides predict() but no predict_proba()
y_pred = svm_reg.predict(X_test_scaled)
log_loss(y_test,y_pred)

In [None]:
y_pred[0:100]

In [None]:
from sklearn.svm import SVR
svm_poly_reg = SVR(kernel="poly", degree=2, C=100, epsilon=0.1, gamma="auto")
svm_poly_reg.fit(X_train, y_train)

In [None]:
y_pred = svm_poly_reg.predict(X_test)
log_loss(y_test,y_pred)

In [None]:
def train_all(split_vocab_data,model):
    """Fit one logistic-regression classifier per vocab group and score them.

    For every group in ``split_vocab_data`` the lines are labelled with
    ``train_to_labeled`` (window parameter 5, using the module-level
    ``vocab`` entry at the same position), vectorised with ``model`` via
    ``vectorize_ngram``, split 80/20 with a fixed random state, and used to
    fit a ``LogisticRegression``.  The log loss of each group's held-out
    predictions is printed, followed by the log loss over all groups.

    Parameters
    ----------
    split_vocab_data : list
        One list of training lines per vocab pair, in ``vocab`` order.
    model
        Word2Vec model passed through to ``vectorize_ngram``.

    Returns
    -------
    tuple
        ``(y_tests, y_predictions, models)``: the concatenated held-out
        labels, the concatenated ``predict_proba`` rows, and the fitted
        classifiers in group order.
    """
    # Imported once here instead of on every loop iteration
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression

    y_predictions = []
    y_tests = []
    models=[]
    for index,vocab_example in enumerate(split_vocab_data):
        labeled_data = train_to_labeled(vocab_example,vocab[index],5)
        X,y = vectorize_ngram(labeled_data,model)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
        clf = LogisticRegression(C=0.0005,random_state=0, solver='lbfgs',max_iter=2000).fit(X_train, y_train)
        y_pred=clf.predict_proba(X_test)
        models.append(clf)

        # Per-group held-out loss
        print(log_loss(y_test,y_pred))
        y_predictions.extend(y_pred)
        y_tests.extend(y_test)
    # Loss over the held-out examples of all groups together
    print(log_loss(y_tests,y_predictions))
    return (y_tests,y_predictions,models)

In [None]:
y_tests,y_pred,models=train_all(split_vocab_data,new_model)

In [None]:
print(len(y_tests))
print(y_pred[91000:91300])
log_loss(y_tests,y_pred)

In [None]:
from sklearn import linear_model
clf = linear_model.SGDRegressor(epsilon=5,max_iter=1000, tol=1e-3, random_state=0)
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)
log_loss(y_test,y_pred)

In [None]:
y_pred[0:1000]

In [None]:
y_train[0:100]

In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train, y_train)


In [None]:
y_pred=clf.predict_proba(X_test)

In [None]:
y_pred[0:10]

In [None]:
# The predictions of the preceding cell are stored in y_pred (predict_y is never defined)
log_loss(y_test,y_pred)

In [None]:
from sklearn.linear_model import LogisticRegressionCV
clf = LogisticRegressionCV(cv=5, random_state=0, multi_class='multinomial').fit(X_train, y_train)

In [None]:
y_pred=clf.predict_proba(X_test)
y_pred[0:10]

In [None]:
# The predictions of the preceding cell are stored in y_pred (predict_y is never defined)
log_loss(y_test,y_pred)

In [None]:
test_data=prepare_vec_data('test.txt',5)

In [None]:
def test_vocab_preprocess(vec_train,set_vocab,vocab_pairs):
    new_train_vec=[]
    for line in vec_train:
        new_line=[]
        for word in line:
            if word in set_vocab or word in vocab_pairs:
                new_line.append(word)
            else:
                new_line.append('<UNK>')
        new_train_vec.append(new_line)
    return new_train_vec

In [None]:
# vocab_pairs has to exist before this call; a later cell builds the same list,
# which is too late when the notebook is run top to bottom.
vocab_pairs = ['{'+ x[0]+'|'+x[1]+'}' for x in vocab]
modified_test_data= test_vocab_preprocess(test_data,set_vocab,vocab_pairs)

In [None]:
#model.train(test_data,total_examples=model.corpus_count, epochs=100)

In [None]:
modified_test_data[1]

In [None]:
vocab_pairs = ['{'+ x[0]+'|'+x[1]+'}' for x in vocab]

In [None]:
def test_to_ngram(test_array,vocab_pairs,N):
    ngram_test = []
    target_word = -1
    vocab_index = -1
    for test_example in test_array:
        for x in vocab_pairs:
            if x in test_example:
                target_word=test_example.index(x)
                vocab_index = vocab_pairs.index(x)
                break

        if target_word ==-1:
                print('error, desired vocab not found')
                return 0

        # set number of previous and following words to capture
        max_previous = target_word - math.ceil(N/2)
        max_forward = target_word + math.floor(N/2)
        ngram = [test_example[x] for x in range(max_previous,max_forward) if x != target_word]
        ngram.append(vocab_index)
        ngram_test.append(ngram)
    
    return ngram_test

In [None]:
# Use the <UNK>-masked test data so unseen words are embedded the same way as
# in training (where out-of-vocabulary words were replaced by '<UNK>').
ngram_test = test_to_ngram(modified_test_data,vocab_pairs,5)

In [None]:
ngram_test[0]

In [None]:
X_test_full,vocab_indices = vectorize_ngram(ngram_test,new_model)

In [None]:
len(X_test_full[1])

In [None]:
vocab_indices

In [None]:
def predict_full(X_test_full,vocab_indices,models):
    """Build the submission rows for every test vector.

    Row ``i`` (1-based) is scored by ``models[vocab_indices[i - 1]]``: the
    feature vector is reshaped to a single-row 2-D array and the first column
    of the first ``predict_proba`` row is written out.

    Returns a list of strings starting with the header ``'Id,Expected'``
    followed by one ``'<id>,<value>'`` entry per test vector.
    """
    results = ['Id,Expected']
    for position, features in enumerate(X_test_full):
        classifier = models[vocab_indices[position]]
        single_row = np.reshape(features, (1,-1))
        first_column = classifier.predict_proba(single_row)[0][0]
        results.append(",".join((str(position+1), str(first_column))))
    return results

In [None]:
y_results = predict_full(X_test_full,vocab_indices,models)

In [None]:
y_results[0:100]

In [None]:
def write_to_file(filename, results):
    """Write each string of ``results`` to ``filename``, one per line.

    Parameters
    ----------
    filename : str
        Output path; an existing file is overwritten.
    results : iterable of str
        Lines to write; a newline is appended to each.

    Returns
    -------
    int
        Always ``1``.
    """
    # The context manager closes the file even if a write raises
    with open(filename,'w') as out_file:
        for line in results:
            out_file.write(line+'\n')
    return 1

In [None]:
write_to_file('linreg_with_word2vec_unk_c0001.csv',y_results)