# Train embeddings on single documents 

In [1]:
import pandas as pd
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU, SimpleRNN
from functools import reduce
import os
from os.path import basename
import csv
# Load every document, index-encode it, and build one-hot labels.
#
# Produces (module-level names used by later cells):
#   sentences  - list of token lists, one per document
#   vocab      - sorted list of distinct tokens
#   word_idx   - token -> integer id, starting at 1 (0 is the padding id)
#   X_train    - 2-D integer matrix from pad_sequences, 1000 columns
#   Y          - one-hot label matrix with NUM_CLASSES columns
TEXT_DATA_DIR='SingleDocSignals'
sentences = []  # raw text of each document, in sorted-filename order
labels_index = {}  # maps document file name -> its position in `sentences` / `labels`
labels = []  # numeric label per document; stays -1 unless the CSV below provides one
for fname in sorted(os.listdir(TEXT_DATA_DIR)):
    fpath = os.path.join(TEXT_DATA_DIR, fname)
    # `with` closes the handle even when read() raises.
    with open(fpath) as f:
        sentences.append(f.read())
    labels_index[basename(fname)] = len(labels_index)
    labels.append(-1)

# Tokenise each document into lower-cased words.
d = [text_to_word_sequence(text, lower=True, split=" ") for text in sentences]

sentences = d
# set().union(*d) also works for an empty corpus, where reduce() without an
# initial value would raise TypeError.
vocab = sorted(set().union(*d))

# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
story_maxlen = 16
X = [[word_idx[token] for token in doc] for doc in sentences]

X_train = pad_sequences(X,1000)

#load labels
# Each CSV row is (document name without ".txt", integer label).
filePath='SingleDocSignals.csv'
with open(filePath,'r') as intputFile:
    reader=csv.reader(intputFile,delimiter=',')
    for doc_name, label_text in reader:
        labels[labels_index[doc_name+".txt"]]=int(label_text)

Categories = labels
outputs = list(set(Categories))
# Width of each one-hot row. NOTE(review): the classifier cell ends in
# Dense(9); keep the two values in sync.
NUM_CLASSES = 9
# Column position of every distinct label, looked up in O(1) per document.
class_position = dict((category, pos) for pos, category in enumerate(outputs))
Y = np.zeros((len(Categories), NUM_CLASSES))
for row, category in enumerate(Categories):
    # Raises IndexError when there are more than NUM_CLASSES distinct labels.
    Y[row, class_position[category]] = 1

Using TensorFlow backend.


In [2]:
from gensim.models import word2vec
from os.path import join, exists, split
import os
import numpy as np

def train_word2vec(sentence_matrix, vocabulary_inv,
                   num_features, min_word_count=1, context=10):
    """
    Train a Word2Vec model (or load a previously saved one) and return
    initial weights for an embedding layer.

    The model is cached on disk under 'word2vec_models/' with a file name
    built only from num_features, min_word_count and context. If that file
    already exists it is loaded instead of training, whatever
    sentence_matrix contains.

    inputs:
    sentence_matrix # iterable of sequences of integer word ids
    vocabulary_inv  # sequence of words; position i holds the word for id i
    num_features    # Word vector dimensionality
    min_word_count  # Minimum word count
    context         # Context window size

    returns:
    A one-element list holding an array with one row per entry of
    vocabulary_inv, in the same order. Words the model does not contain
    get a random vector drawn uniformly from [-0.25, 0.25).
    """
    model_dir = 'word2vec_models'
    file_stem = "{:d}features_{:d}minwords_{:d}context".format(num_features, min_word_count, context)
    model_name = join(model_dir, file_stem)

    if not exists(model_name):
        # Fixed training settings
        num_workers = 2       # Number of threads to run in parallel
        downsampling = 1e-3   # Downsample setting for frequent words

        print("Training Word2Vec model...")
        # Turn each row of word ids back into a list of word strings.
        sentences = [[vocabulary_inv[idx] for idx in row] for row in sentence_matrix]
        embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                            size=num_features, min_count=min_word_count,
                                            window=context, sample=downsampling)

        # No further training is planned, so let init_sims replace the
        # vectors in place to save memory.
        embedding_model.init_sims(replace=True)

        # Persist the model; it can be reloaded with Word2Vec.load()
        if not exists(model_dir):
            os.mkdir(model_dir)
        print("Saving Word2Vec model '%s'" % split(model_name)[-1])
        embedding_model.save(model_name)
    else:
        embedding_model = word2vec.Word2Vec.load(model_name)
        print("Loading existing Word2Vec model '%s'" % split(model_name)[-1])

    # One row per vocabulary entry; words missing from the model get a
    # random vector.
    rows = []
    for word in vocabulary_inv:
        if word in embedding_model:
            rows.append(embedding_model[word])
        else:
            rows.append(np.random.uniform(-0.25, 0.25, embedding_model.vector_size))
    embedding_weights = [np.array(rows)]
    return embedding_weights

# if __name__=='__main__':
#     import data_helpers
#     print("Loading data...")
#     x, _, _, vocabulary_inv = data_helpers.load_data()
#     w = train_word2vec(x, vocabulary_inv)



In [5]:
import numpy as np
# import data_helpers
# from w2v import train_word2vec 
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D

np.random.seed(2)

# Parameters
# ==================================================
#
# Model Variations. See Yoon Kim's "Convolutional Neural Networks for
# Sentence Classification", Section 3 for detail.

model_variation = 'CNN-rand'  #  CNN-rand | CNN-non-static | CNN-static
print('Model variation is %s' % model_variation)

# Model Hyperparameters
sequence_length = 1000       # must equal the pad_sequences length used to build X_train
embedding_dim = 100
filter_sizes = (3, 4)
num_filters = 100
dropout_prob = (0.25, 0.5)
hidden_dims = 100

# Training parameters
batch_size = 5
num_epochs = 5
val_split = 0.33

# Word2Vec parameters, see train_word2vec
min_word_count = 1  # Minimum word count
context = 10        # Context window size
ACTION = "train"
weights_file = "weights_file"

# Data Preparation
# ==================================================
#
print("Loading data...")

vocabulary = word_idx
# word_idx numbers real words from 1 and id 0 is the padding value written by
# pad_sequences, so the padding token must sit at position 0 for
# vocabulary_inv[id] to give the right word. A new list is built so that
# `vocab` is not mutated and re-running this cell gives the same result.
# The filter drops any "</PAD>" entry already sitting in `vocab`.
vocabulary_inv = ["</PAD>"] + [w for w in vocab if w != "</PAD>"]

# Shuffle data
shuffle_indices = np.random.permutation(np.arange(len(Y)))
x_shuffled = X_train[shuffle_indices]
y_onehot_shuffled = Y[shuffle_indices]            # one-hot rows aligned with x_shuffled
y_shuffled = y_onehot_shuffled.argmax(axis=1)     # same labels as class indices
nb_validation_samples = int(val_split * len(x_shuffled))
# An explicit end index keeps all rows when nb_validation_samples is 0
# (x[:-0] would be empty).
x_train_only = x_shuffled[:len(x_shuffled) - nb_validation_samples]

# Array fed to the model: word ids, except for CNN-static where the
# embedding lookup is done up front.
x_model = x_shuffled
if model_variation=='CNN-non-static' or model_variation=='CNN-static':
    embedding_weights = train_word2vec(x_train_only, vocabulary_inv, embedding_dim, min_word_count, context)
    if model_variation=='CNN-static':
        # Embed every shuffled row (not just the training part) so the input
        # has the same number of samples as the label matrix.
        x_model = embedding_weights[0][x_shuffled]
elif model_variation=='CNN-rand':
    embedding_weights = None
else:
    raise ValueError('Unknown model variation')

print("Vocabulary Size: {:d}".format(len(vocabulary)))

# Building model
# ==================================================
#
# graph subnet with one input and one output,
# convolutional layers concatenated in parallel
graph_in = Input(shape=(sequence_length, embedding_dim))
convs = []
for fsz in filter_sizes:
    conv = Convolution1D(nb_filter=num_filters,
                         filter_length=fsz,
                         border_mode='valid',
                         activation='relu',
                         subsample_length=1)(graph_in)
    pool = MaxPooling1D(pool_length=2)(conv)
    flatten = Flatten()(pool)
    convs.append(flatten)

if len(filter_sizes) > 1:
    out = Merge(mode='concat')(convs)
else:
    out = convs[0]

graph = Model(input=graph_in, output=out)
# main sequential model
model = Sequential()
if not model_variation=='CNN-static':
    # CNN-static feeds pre-embedded vectors, so it has no Embedding layer.
    model.add(Embedding(len(vocabulary_inv),embedding_dim, input_length=sequence_length,
                        weights=embedding_weights))

model.add(Dropout(dropout_prob[0], input_shape=(sequence_length, embedding_dim)))
model.add(graph)
model.add(Dense(hidden_dims))
model.add(Dropout(dropout_prob[1]))
model.add(Activation('relu'))
model.add(Dense(9))
model.add(Activation('sigmoid'))
# NOTE(review): the targets are one-hot (exactly one class per document).
# sigmoid + binary_crossentropy scores each of the 9 outputs independently,
# which tends to inflate the reported accuracy; softmax +
# categorical_crossentropy is the usual choice here. Left unchanged - confirm.
model.compile(loss='binary_crossentropy', optimizer='rmsprop',metrics=['accuracy'])
#opt = SGD(lr=0.01, momentum=0.80, decay=1e-6, nesterov=True)
#model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

# Training the model
# ==================================================
if ACTION == "predict" and os.path.exists(weights_file):
    model.load_weights(weights_file)
else:
    # validation_split holds out the LAST rows of the arrays it is given.
    # Documents were loaded in sorted-filename order, so fit on the shuffled
    # arrays to get a random validation set.
    model.fit(x_model, y_onehot_shuffled, batch_size=batch_size,nb_epoch=num_epochs, validation_split=val_split)
    #print("dumping weights to file...")
    #model.save_weights(weights_file, overwrite=True)


Model variation is CNN-rand
Loading data...
Vocabulary Size: 67890
Train on 229 samples, validate on 113 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Analyze Word-Embeddings

1- All Documents

In [9]:
import os
from gensim.models import word2vec
import data_helpers
import numpy as np 
# Train Word2Vec on every document in SingleDocSignals and inspect
# nearest neighbours and pairwise distances for a few probe words.

# Set values for various parameters
num_workers = 8       # Number of threads to run in parallel
downsampling = 1e-3   # Downsample setting for frequent words
num_features=300
min_word_count=5
context=10

TEXT_DATA_DIR='SingleDocSignals'
sentences = []  # list of text articles
for fname in sorted(os.listdir(TEXT_DATA_DIR)):
    fpath = os.path.join(TEXT_DATA_DIR, fname)
    # `with` closes the handle even when read() raises.
    with open(fpath) as f:
        sentences.append(f.read())

sentences = [s.strip() for s in sentences]
# data_helpers is a project module not shown here; presumably clean_str
# normalises the text and load_data returns id-encoded documents plus the
# two vocabulary mappings - confirm against data_helpers.
x_text = [data_helpers.clean_str(sent) for sent in sentences]
x_text = [s.split(" ") for s in x_text]

print('Found %s texts.' % len(x_text))
# Initialize and train the model
print("Training Word2Vec model...")
x,  vocabulary, vocabulary_inv =data_helpers.load_data(x_text)
# Map ids back to words; Word2Vec is trained on token strings.
sentences = [[vocabulary_inv[w] for w in s] for s in x]
embedding_model = word2vec.Word2Vec(sentences, workers=num_workers, \
                    size=num_features, min_count = min_word_count, \
                    window = context, sample = downsampling)

# If we don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
embedding_model.init_sims(replace=True)

words=['book','man','woman','we','they']
for word in words:
    print ('Most similar to %s:' % word)
    # Words seen fewer than min_word_count times are not in the model and
    # would make most_similar raise KeyError.
    if word in embedding_model:
        sim=embedding_model.most_similar(word,topn=10)
        for w_sim in sim:
            print (w_sim)
    else:
        print ("Not found.")

# Distance is 1 minus the similarity reported by the model.
word_pairs = [('us', 'them'), ('our', 'their'), ('we', 'they'), ('i', 'they')]
for first, second in word_pairs:
    if first in embedding_model and second in embedding_model:
        print ("Distance between '%s' and '%s':" % (first, second), (1-embedding_model.wv.similarity(first, second)))
    else:
        print ("Distance between '%s' and '%s':" % (first, second), "Not found.")

Found 342 texts.
Training Word2Vec model...
Most similar to book:
('beginning', 0.8126802444458008)
('word', 0.7602472305297852)
('view', 0.757227897644043)
('kingdom', 0.7565963268280029)
('passage', 0.7553075551986694)
('heat', 0.7506297826766968)
('story', 0.7482398748397827)
('torah', 0.7370935678482056)
("ta'an", 0.7333469986915588)
('revelation', 0.7302749156951904)
Most similar to man:
('woman', 0.7968037128448486)
('brother', 0.685215950012207)
('wife', 0.683623194694519)
('blessing', 0.6799474954605103)
('person', 0.6795306205749512)
('he', 0.6542274951934814)
('child', 0.6527204513549805)
('poor', 0.6390407085418701)
('whosoever', 0.6388231515884399)
('abraham', 0.6291402578353882)
Most similar to woman:
('child', 0.8534461259841919)
('man', 0.7968037128448486)
('daughter', 0.7621140480041504)
('baby', 0.7436813116073608)
('wife', 0.7347227334976196)
('legion', 0.7328832149505615)
('palace', 0.7258248329162598)
('stranger', 0.7244707942008972)
('husband', 0.722589910030365)
(

In [12]:
from gensim.models import Word2Vec
# Same pronoun-pair distances as above, measured on vectors loaded from the
# GoogleNews binary file, for comparison.
# NOTE(review): later gensim releases move load_word2vec_format to
# KeyedVectors - confirm against the installed version before upgrading.
model_name='GoogleNews-vectors-negative300.bin'
embedding_model = Word2Vec.load_word2vec_format(model_name, binary=True)
for first, second in (('us', 'them'), ('our', 'their'), ('we', 'they'), ('i', 'they')):
    # Distance is 1 minus the similarity reported by the model.
    print ("Distance between '%s' and '%s':" % (first, second), (1-embedding_model.wv.similarity(first, second)))

Distance between 'us' and 'them': 0.29255531341
Distance between 'our' and 'their': 0.482712232815
Distance between 'we' and 'they': 0.351189502355
Distance between 'i' and 'they': 0.786774077005


1- Liberal Judaism

In [10]:
import os
from gensim.models import word2vec
import data_helpers
import numpy as np 
# Train Word2Vec on the Liberal Judaism documents only and list the nearest
# neighbours of a few probe words.

# Set values for various parameters
num_workers = 8       # Number of threads to run in parallel
downsampling = 1e-3   # Downsample setting for frequent words
num_features=50
min_word_count=5
context=4

TEXT_DATA_DIR='SingleDocSignals.LiberalJudaism'
sentences = []  # list of text articles
for fname in sorted(os.listdir(TEXT_DATA_DIR)):
    fpath = os.path.join(TEXT_DATA_DIR, fname)
    # `with` closes the handle even when read() raises.
    with open(fpath) as f:
        sentences.append(f.read())

sentences = [s.strip() for s in sentences]
# data_helpers is a project module not shown here; presumably clean_str
# normalises the text and load_data returns id-encoded documents plus the
# two vocabulary mappings - confirm against data_helpers.
x_text = [data_helpers.clean_str(sent) for sent in sentences]
x_text = [s.split(" ") for s in x_text]

print('Found %s texts.' % len(x_text))
# Initialize and train the model
print("Training Word2Vec model...")
x,  vocabulary, vocabulary_inv =data_helpers.load_data(x_text)
# Map ids back to words; Word2Vec is trained on token strings.
sentences = [[vocabulary_inv[w] for w in s] for s in x]
embedding_model = word2vec.Word2Vec(sentences, workers=num_workers, \
                    size=num_features, min_count = min_word_count, \
                    window = context, sample = downsampling)

# If we don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
embedding_model.init_sims(replace=True)
words=['book','man','woman' ,'we','they']
for word in words:
    print ('Most similar to %s:' % word)
    # Test membership in the trained model, not in `vocabulary`: a word that
    # occurs fewer than min_word_count times is in `vocabulary` but was
    # dropped by Word2Vec, and most_similar would raise KeyError for it.
    if word in embedding_model:
        sim=embedding_model.most_similar(word,topn=10)
        for w_sim in sim:
            print (w_sim)
    else:
        print ("Not found.")

Found 34 texts.
Training Word2Vec model...
Most similar to book:
('being', 0.999659538269043)
('towards', 0.9996157884597778)
('take', 0.9996099472045898)
('also', 0.9996005296707153)
('human', 0.9995778799057007)
('makes', 0.9995735883712769)
('torah', 0.9995177388191223)
('clear', 0.9995163679122925)
('within', 0.9994983673095703)
('more', 0.9994863867759705)
Most similar to man:
('an', 0.9995979070663452)
('when', 0.9994972944259644)
('about', 0.9994536638259888)
('or', 0.9994368553161621)
('life', 0.9993894696235657)
('even', 0.9993335604667664)
('future', 0.99932461977005)
('which', 0.9993187189102173)
('humanity', 0.9993013739585876)
('person', 0.9992953538894653)
Most similar to woman:
('behalf', 0.9969239234924316)
('person', 0.9967086315155029)
('just', 0.9966903328895569)
('would', 0.9964863061904907)
('there', 0.9964463710784912)
('or', 0.996443510055542)
('team', 0.9964284896850586)
('why', 0.9964030981063843)
('very', 0.99639892578125)
('they', 0.9963756203651428)
Most sim

2- ISIS

In [6]:
import os
from gensim.models import word2vec
import data_helpers
import numpy as np 
# Train Word2Vec on the ISIS documents only and list the nearest neighbours
# of a few probe words.

# Set values for various parameters
num_workers = 8       # Number of threads to run in parallel
downsampling = 1e-3   # Downsample setting for frequent words
num_features=50
min_word_count=5
context=4

TEXT_DATA_DIR='SingleDocSignals.ISIS'
sentences = []  # list of text articles
for fname in sorted(os.listdir(TEXT_DATA_DIR)):
    fpath = os.path.join(TEXT_DATA_DIR, fname)
    # `with` closes the handle even when read() raises.
    with open(fpath) as f:
        sentences.append(f.read())

sentences = [s.strip() for s in sentences]
# data_helpers is a project module not shown here; presumably clean_str
# normalises the text and load_data returns id-encoded documents plus the
# two vocabulary mappings - confirm against data_helpers.
x_text = [data_helpers.clean_str(sent) for sent in sentences]
x_text = [s.split(" ") for s in x_text]

print('Found %s texts.' % len(x_text))
# Initialize and train the model
print("Training Word2Vec model...")
x,  vocabulary, vocabulary_inv =data_helpers.load_data(x_text)
# Map ids back to words; Word2Vec is trained on token strings.
sentences = [[vocabulary_inv[w] for w in s] for s in x]
embedding_model = word2vec.Word2Vec(sentences, workers=num_workers, \
                    size=num_features, min_count = min_word_count, \
                    window = context, sample = downsampling)

# If we don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
embedding_model.init_sims(replace=True)
words=['book','man','we','they']
for word in words:
    print ('Most similar to %s:' % word)
    # Words seen fewer than min_word_count times are not in the model and
    # would make most_similar raise KeyError.
    if word in embedding_model:
        sim=embedding_model.most_similar(word,topn=10)
        for w_sim in sim:
            print (w_sim)
    else:
        print ("Not found.")

Found 11 texts.
Training Word2Vec model...
Most similar to book:
('ta', 0.9889437556266785)
('follow', 0.9885311126708984)
('make', 0.9885040521621704)
('one', 0.988458514213562)
('has', 0.9884455800056458)
('no', 0.9884402751922607)
('whose', 0.9883753061294556)
('us', 0.9883221983909607)
('if', 0.9882023930549622)
('them', 0.9881460666656494)
Most similar to man:
('ummah', 0.9987025260925293)
('allah', 0.998604953289032)
('she', 0.9985135197639465)
('within', 0.9984798431396484)
('jihad', 0.9983980655670166)
('by', 0.9983532428741455)
('towards', 0.9983308911323547)
('islam', 0.9983193278312683)
('may', 0.998298168182373)
('position', 0.9982853531837463)
Most similar to we:
('page', 0.9995814561843872)
('at', 0.999549388885498)
('you', 0.999499499797821)
('l', 0.999442994594574)
('have', 0.9993689060211182)
('he', 0.9993428587913513)
('my', 0.9993402361869812)
('other', 0.9993234872817993)
('were', 0.9991870522499084)
('hid', 0.9989795684814453)
Most similar to they:
('one', 0.999672

3- Dorothy Day

In [7]:
import os
from gensim.models import word2vec
import data_helpers
import numpy as np 
# Train Word2Vec on the Dorothy Day documents only and list the nearest
# neighbours of a few probe words.

# Set values for various parameters
num_workers = 8       # Number of threads to run in parallel
downsampling = 1e-3   # Downsample setting for frequent words
num_features=50
min_word_count=5
context=4

TEXT_DATA_DIR='SingleDocSignals.DorothyDay'
sentences = []  # list of text articles
for fname in sorted(os.listdir(TEXT_DATA_DIR)):
    fpath = os.path.join(TEXT_DATA_DIR, fname)
    # `with` closes the handle even when read() raises.
    with open(fpath) as f:
        sentences.append(f.read())

sentences = [s.strip() for s in sentences]
# data_helpers is a project module not shown here; presumably clean_str
# normalises the text and load_data returns id-encoded documents plus the
# two vocabulary mappings - confirm against data_helpers.
x_text = [data_helpers.clean_str(sent) for sent in sentences]
x_text = [s.split(" ") for s in x_text]

print('Found %s texts.' % len(x_text))
# Initialize and train the model
print("Training Word2Vec model...")
x,  vocabulary, vocabulary_inv =data_helpers.load_data(x_text)
# Map ids back to words; Word2Vec is trained on token strings.
sentences = [[vocabulary_inv[w] for w in s] for s in x]
embedding_model = word2vec.Word2Vec(sentences, workers=num_workers, \
                    size=num_features, min_count = min_word_count, \
                    window = context, sample = downsampling)

# If we don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
embedding_model.init_sims(replace=True)
words=['book','man','woman','we','they']
for word in words:
    print ('Most similar to %s:' % word)
    # Words seen fewer than min_word_count times are not in the model and
    # would make most_similar raise KeyError.
    if word in embedding_model:
        sim=embedding_model.most_similar(word,topn=10)
        for w_sim in sim:
            print (w_sim)
    else:
        print ("Not found.")

Found 18 texts.
Training Word2Vec model...
Most similar to book:
('when', 0.9988095164299011)
('on', 0.9987618923187256)
('had', 0.9987297654151917)
('our', 0.9986756443977356)
('baby', 0.9986304044723511)
('was', 0.9986129403114319)
('place', 0.9985895752906799)
('many', 0.9985389709472656)
('a', 0.998504638671875)
('three', 0.9984842538833618)
Most similar to man:
('are', 0.9988906383514404)
('is', 0.9986475706100464)
('remember', 0.998629629611969)
('through', 0.9983686208724976)
('to', 0.9980978965759277)
('without', 0.9980942010879517)
('worker', 0.9978844523429871)
('let', 0.9976675510406494)
('own', 0.9975814819335938)
('any', 0.9974715709686279)
Most similar to woman:
('asked', 0.9927756786346436)
('battle', 0.9917311072349548)
('faith', 0.991574764251709)
('best', 0.9906504154205322)
('without', 0.9905869364738464)
('dead', 0.9903661012649536)
('religion', 0.9902663230895996)
('each', 0.9902605414390564)
('kitchen', 0.9901933073997498)
('apostolate', 0.9900656938552856)
Most s