# Word2Vec in Caffe - Continuous Bag of Words 

## Pre-train a word embedding layer for use in an AES

In [3]:
import os
from nltk.tokenize import word_tokenize     # tokenise words
from nltk.corpus import stopwords, wordnet  # obtain English stopwords and small dictionary
from nltk.probability import FreqDist       # get frequency distribution of tokens
# from nltk.stem import PorterStemmer         # get stems of words
import string
import numpy as np
import h5py
from keras.preprocessing.sequence import skipgrams

### Open Hewlitt Packard Kaggle AES Competition Data - Essay Set 6

In [4]:
wordlist = {w for w in wordnet.words() if w.lower()} # small dictionary of words
stop_words = stopwords.words("english")  # load stopwords

f = open('train_data.txt','r')
trn_data = f.readlines()
f.close()

f = open('train_labs.txt','r')
trn_labels = f.readlines()
f.close()

f = open('test_data.txt','r')
tst_data = f.readlines()
f.close()

f = open('test_labs.txt','r')
tst_labels = f.readlines()
f.close()

trn_labs = []
tst_labs = []

for lab in trn_labels:
    trn_labs.append(int(lab.split('\n')[0]))
    
for labs in tst_labels:
    tst_labs.append(int(labs.split('\n')[0]))

del lab, labs, trn_labels, tst_labels

all_data = trn_data + tst_data

FileNotFoundError: [Errno 2] No such file or directory: 'train_data.txt'

### Conduct text preprocessing using the "nltk" package of Python

In [14]:
corpus = []

for file in all_data:
    text = file.lower()
    tokens = word_tokenize(text)
    cln_txt = filter(lambda x: x not in stop_words, tokens)          # remove stop words
    cln_txt = filter(lambda x: x not in string.punctuation, tokens) # remove punctuation
    cln_txt = list(filter(lambda x: x in wordlist, cln_txt))         # remove mispelt words
    cln_txt = [item for item in cln_txt if item.isalpha()]           # remove numerical characters
    cln_txt = [word for word in cln_txt if len(word) > 2]            # remove words less than 2 letters long
    corpus.extend(cln_txt)

freq_dist = FreqDist(corpus)

### Create vocabulary

In [15]:
commonwords = freq_dist.most_common(3000)       # Identify N most common words

common = commonwords[0:700]                     # Select M words that occur less than a certain frequency

common = [i[0] for i in common]                 # Convert list of tuples to list of words

vocab = sorted(list(set(common)))

vocab.extend(['AAPAD']) # document padding tag

#vocab.extend(['UNK']) # unknown word tag

vocab = sorted(vocab)

word_to_idx = { word: idx for idx, word in enumerate(vocab) } # vocab dict for corpus

### Create training dataset for AES

In [16]:
data_trn = []

for file in trn_data:
    text = file.lower()
    tokens = word_tokenize(text)
    cln_txt = filter(lambda x: x not in string.punctuation, tokens)
    cln_txt = filter(lambda x: x not in stop_words, cln_txt)
    cln_txt = list(filter(lambda x: x in wordlist, cln_txt))
    cln_txt = [item for item in cln_txt if item.isalpha()]
    cln_txt = list(filter(lambda x: x in common, cln_txt))
    #cln_txt = [word if word in common else 'UNK' for word in cln_txt]
    cln_txt = [word for word in cln_txt if len(word) > 2]
    w = []
    for word in cln_txt:
        val = word_to_idx[word]
        w.append(val)
    data_trn.append(w)

lendoc_trn = []

for doc in data_trn:
    ldoc = len(doc)
    lendoc_trn.append(ldoc)

max_doc_len_trn = np.max(lendoc_trn)

store_mat_trn = np.zeros((len(data_trn),max_doc_len_trn), dtype=np.float32)

for i in range(len(data_trn)):
    padarray = np.asarray(data_trn[i])
    padarray = np.pad(padarray,(0,max_doc_len_trn - len(data_trn[i])), mode='constant')
    store_mat_trn[i,:] = padarray

scores_trn = np.asarray(trn_labs, dtype=np.float32)

hdf_trn_file = "trn_aes.hdf5"
hdf_list_trn_file = "trn_aes_hdf5_list.txt"

with h5py.File(hdf_trn_file, "w") as f:
    #Create dataset
    f.create_dataset("data", data=store_mat_trn)
    f.create_dataset("label", data=scores_trn)
    f.close()
with open(hdf_list_trn_file, "w") as f:
    f.write(hdf_trn_file)
    f.close()

In [17]:
store_mat_trn.shape

(1500, 161)

### Create test set for AES

In [7]:
data_tst = []

for file in tst_data:
    text = file.lower()
    tokens = word_tokenize(text)
    cln_txt = filter(lambda x: x not in string.punctuation, tokens)
    cln_txt = filter(lambda x: x not in stop_words, cln_txt)
    cln_txt = list(filter(lambda x: x in wordlist, cln_txt))
    cln_txt = [item for item in cln_txt if item.isalpha()]
    cln_txt = list(filter(lambda x: x in common, cln_txt))
    #cln_txt = [word if word in common else 'UNK' for word in cln_txt]
    cln_txt = [word for word in cln_txt if len(word) > 2]
    w = []
    for word in cln_txt:
        val = word_to_idx[word]
        w.append(val)
    data_tst.append(w)

lendoc_tst = []

for doc in data_tst:
    ldoc = len(doc)
    lendoc_tst.append(ldoc)

max_doc_len_tst = np.max(lendoc_tst)

store_mat_tst = np.zeros((len(data_tst),max_doc_len_trn), dtype=np.float32)

for i in range(len(data_tst)):
    padarray = np.asarray(data_tst[i])
    padarray = np.pad(padarray,(0,max_doc_len_trn - len(data_tst[i])), mode='constant')
    store_mat_tst[i,:] = padarray
    
scores_tst = np.asarray(tst_labs, dtype=np.float32)

hdf_tst_file = "tst_aes.hdf5"
hdf_list_tst_file = "tst_aes_hdf5_list.txt"

with h5py.File(hdf_tst_file, "w") as f:
    #Create dataset
    f.create_dataset("data", data=store_mat_tst)
    f.create_dataset("label", data=scores_tst)
    f.close()

with open(hdf_list_tst_file, "w") as f:
    f.write(hdf_tst_file)
    f.close()

In [8]:
store_mat_tst.shape

(300, 161)

### Create CBoW Training Set and hdf5 databases

In [None]:
Caffe_trn = list(store_mat_trn)

CaffeCor2Vec = [item for sublist in Caffe_trn for item in sublist]

dat = []
lab = []
for i in range(2, len(CaffeCor2Vec) - 2):
    context = [CaffeCor2Vec[i - 2], CaffeCor2Vec[i - 1], CaffeCor2Vec[i + 1], CaffeCor2Vec[i + 2]]
    target = CaffeCor2Vec[i]
    dat.append(context)
    lab.append(target)

dat_np = np.asarray(dat)

lab_np = np.asarray(lab)

hdf_trn_file = "trn_caffewrd2vec.hdf5"
hdf_list_trn_file = "trn_caffewrd2vec_hdf5_list.txt"

with h5py.File(hdf_trn_file, "w") as f:
    #Create dataset
    f.create_dataset("data", data=dat_np)
    f.create_dataset("label", data=lab_np)
    f.close()

with open(hdf_list_trn_file, "w") as f:
    f.write(hdf_trn_file)
    f.close()

### Create skip-gram training set and hdf5 databases

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
