In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# define data locations

data_dir = 'Gutenberg_English_Fiction_1k/'
target_file = 'master996.csv'
corpus_dir = 'Gutenberg_19th_century_English_Fiction/'

In [4]:
# import target

data = pd.read_csv(data_dir + target_file, sep=';', engine='python')
data.loc[:]['book_id'] = data['book_id'].apply(lambda book_id: book_id[:-5]) # remove '.epub' ending
data.set_index('book_id', inplace=True)
data

Unnamed: 0_level_0,Book_Name,guten_genre,Author_Name
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
pg10067,The Mystery of the Boule Cabinet: A Detective ...,Detective and Mystery,Stevenson| Burton Egbert
pg1032,The Pupil,Literary,James| Henry
pg10379,At Love's Cost,Literary,Garvice| Charles
pg10473,The Heart of the Range,Western Stories,White| William Patterson
pg10812,The Worshipper of the Image,Literary,Gallienne| Richard Le
pg10826,The Book-Bills of Narcissus - An Account Rende...,Literary,Gallienne| Richard Le
pg10943,Elusive Isabel,Detective and Mystery,Futrelle| Jacques
pg11004,Joe Burke's Last Stand,Literary,Wetterau| John Moncure
pg11005,O+F,Literary,Wetterau| John Moncure
pg11259,Polly and the Princess,Literary,Dowd| Emma C.


In [5]:
# import corpus

def get_book_content(book_id):      
    filename = data_dir + corpus_dir + book_id + '-content.html'
    
    with open(filename, encoding='utf-8') as file:        
        try:
            content = file.read()
            
        except UnicodeDecodeError:
            print('UnicodeDecodeError trying to read {}. Returning None.'.format(book_id))
            return None
        
        content = content.replace('<p>','')   
        
    return content

data['content'] = [get_book_content(book_id) for book_id in data.index]
data

Unnamed: 0_level_0,Book_Name,guten_genre,Author_Name,content
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pg10067,The Mystery of the Boule Cabinet: A Detective ...,Detective and Mystery,Stevenson| Burton Egbert,A Detective Story\nA.B.M. Fellow-Sherlockian\n...
pg1032,The Pupil,Literary,James| Henry,This edition first published 1916\nThe text fo...
pg10379,At Love's Cost,Literary,Garvice| Charles,"""Until this moment I have never fully realised..."
pg10473,The Heart of the Range,Western Stories,White| William Patterson,"""The Rider of Golden Bar,"" ""Hidden Trails,"" ""L..."
pg10812,The Worshipper of the Image,Literary,Gallienne| Richard Le,The Worshipper of the Image\nEvening was in th...
pg10826,The Book-Bills of Narcissus - An Account Rende...,Literary,Gallienne| Richard Le,"December 6th, 1894.\nNOTE. - This third editio..."
pg10943,Elusive Isabel,Detective and Mystery,Futrelle| Jacques,All the world rubs elbows in Washington. Outwa...
pg11004,Joe Burke's Last Stand,Literary,Wetterau| John Moncure,Joe Burke's Last Stand\nEvery Story Is A Love ...
pg11005,O+F,Literary,Wetterau| John Moncure,John Moncure Wetterau\nLibrary of Congress Num...
pg11259,Polly and the Princess,Literary,Dowd| Emma C.,The June Holiday Home was one of those sumptuo...


In [6]:
#to faster debug, i use a smaller subset of the texts right now:
data_content = data['content'][:1]

In [7]:
import maPrepro
data_content_filtered = maPrepro.prepare_texts(data_content, use_stemming=True)
data_content_filtered

Text Count:  1  Progress: 
0

[nltk_data] Downloading package stopwords to /Users/nani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/nani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/nani/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


["A Detective Story A.B.M Fellow-Sherlockian '' Hello '' I said I took receiver desk 'phone answer call `` Mr. Vantine wish speak sir '' said office-boy `` All right '' I heard snap connection `` Is Lester '' asked Philip Vantine 's voice `` Yes So 're back '' `` Got yesterday Can come house lunch to-day '' `` I 'll glad '' I said meant I liked Philip Vantine `` I 'll look one-thirty '' And happened hour later I walking toward Washington Square Avenue old Vantine mansion stood It almost last survival old régime tide business long since overflowed neighbouring street Avenue swept fashionable folk far uptown Tall office loft building replaced brownstone house old family hold like sullen desperate rear-guard defying advancing enemy Philip Vantine one He born house still lived declared would die He one please matter since unmarried lived alone mitigated increasing roar dust neighbourhood long absence abroad It one returned I may well complete pencil-sketch Vantine fifty year age possessor 

In [7]:
#--------------------------
#Tokenization to integers:

In [8]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
#from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=None, #max number of words, only the X most relevant words will be taken
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_``´´{|}~\t\n', #signs that are filtered out
    lower=True, #when true, everything will be converted tp lower case
    split=" ", #word seperator
    char_level=False, #if true, every char will be treated as token
    oov_token=None, #out-of-vocabulary replacement
)

In [9]:
tokenizer.fit_on_texts(data_content_filtered) #tokenizer gets fit to our texts

In [10]:
sequences = tokenizer.texts_to_sequences(data_content[0]) 
sequencesFiltered = tokenizer.texts_to_sequences(data_content_filtered[0]) 
#return a list of tokens for the X (see num_words) most common words the tokenizer knows, for every input text
print("unfiltered length: ", len(sequences), " filtered length: ", len(sequencesFiltered))

unfiltered length:  371350  filtered length:  245848


In [12]:
#from imblearn.under_sampling import RandomUnderSampler

In [12]:
#Feature Extraction with word2vec using CBOW
#Build vocabulary
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence

word2id = tokenizer.word_index

word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in data_content_filtered]

vocab_size = len(word2id)
embed_size = 100
window_size = 2

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:100])

Vocabulary Size: 4958
Vocabulary Sample: [('i', 1), ("''", 2), ('said', 3), ("'s", 4), ("n't", 5), ('godfrey', 6), ('one', 7), ('he', 8), ('it', 9), ('vantine', 10), ('would', 11), ('cabinet', 12), ('but', 13), ('the', 14), ('man', 15), ('and', 16), ('mr', 17), ('see', 18), ('know', 19), ('grady', 20), ('could', 21), ('hand', 22), ('sir', 23), ('lester', 24), ('back', 25), ('little', 26), ('you', 27), ('room', 28), ('drawer', 29), ("'ll", 30), ('parks', 31), ('simmonds', 32), ('think', 33), ('door', 34), ('yes', 35), ('eye', 36), ('right', 37), ('get', 38), ('m', 39), ('well', 40), ('face', 41), ('way', 42), ('that', 43), ('u', 44), ('there', 45), ('two', 46), ('then', 47), ('time', 48), ('came', 49), ('saw', 50), ('thought', 51), ('got', 52), ('perhaps', 53), ('away', 54), ('armand', 55), ('come', 56), ('thing', 57), ('tell', 58), ('rogers', 59), ('what', 60), ('asked', 61), ('added', 62), ('looked', 63), ('no', 64), ('pigot', 65), ('we', 66), ('like', 67), ('look', 68), ('knew', 69),

In [13]:
#Build (context_words, target_word) pair generator
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([words[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)

            x = sequence.pad_sequences(context_words, maxlen=context_length)
            y = np_utils.to_categorical(label_word, vocab_size)
            yield (x, y)

In [14]:
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    if 0 not in x[0]:
        print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
    
        if i == 10:
            break
        i += 1

Context (X): ['a', 'detective', 'a', 'b'] -> Target (Y): story
Context (X): ['detective', 'story', 'b', 'm'] -> Target (Y): a
Context (X): ['story', 'a', 'm', 'fellow'] -> Target (Y): b
Context (X): ['a', 'b', 'fellow', 'sherlockian'] -> Target (Y): m
Context (X): ['b', 'm', 'sherlockian', "''"] -> Target (Y): fellow
Context (X): ['m', 'fellow', "''", 'hello'] -> Target (Y): sherlockian
Context (X): ['fellow', 'sherlockian', 'hello', "''"] -> Target (Y): ''
Context (X): ['sherlockian', "''", "''", 'i'] -> Target (Y): hello
Context (X): ["''", 'hello', 'i', 'said'] -> Target (Y): ''
Context (X): ['hello', "''", 'said', 'i'] -> Target (Y): i
Context (X): ["''", 'i', 'i', 'took'] -> Target (Y): said


In [15]:
#Build CBOW Deep Network Model
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))

cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')
print(cbow.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 100)            495800    
_________________________________________________________________
lambda_1 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 4958)              500758    
Total params: 996,558
Trainable params: 996,558
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
#for simplicity train model for 1 epoch
for epoch in range(1, 2):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()

Epoch: 1 	Loss: 374939.23200214654



In [19]:
#Get word embeddings
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(4957, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
'',0.045854,-0.165436,-0.207914,0.32677,0.001337,0.147286,0.500185,-0.093851,-0.337858,-0.346954,...,-0.042295,0.5345,-0.045093,0.057744,-0.032164,-0.512889,-0.273319,-0.406446,-0.466524,-0.027504
said,0.266274,0.060354,-0.050025,0.173057,0.024486,0.088786,-0.126871,-0.169511,0.05548,0.218774,...,-0.115254,0.358291,0.208316,-0.460192,0.598387,-0.325481,0.031821,-0.09958,0.179423,-0.122406
's,0.039222,-0.081422,-0.083696,-0.389515,0.083708,-0.398028,-0.004899,0.001924,0.110763,0.080004,...,-0.058775,0.06312,0.254512,-0.150308,-0.208595,0.037217,0.186677,0.433093,-0.146582,-0.043241
n't,-0.135193,0.182729,-0.26923,0.071273,-0.175044,0.200636,-0.056523,-0.184438,0.245725,-0.054974,...,-0.114687,0.009465,0.251469,-0.259385,0.002071,-0.146957,0.209228,0.25068,0.050126,0.060977
godfrey,0.245025,-0.136464,-0.159645,0.041218,-0.114887,-0.06764,-0.081453,-0.177018,0.20375,-0.065737,...,0.047499,0.114484,0.316027,-0.030068,-0.440598,-0.080517,0.023155,0.084812,0.18988,0.146938


In [25]:
#Build a distance matrix to view the most similar words (contextually)
from sklearn.metrics.pairwise import euclidean_distances

# compute pairwise distance matrix
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

# view contextually similar words
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['house', 'letter', 'eye']}

similar_words

(4957, 4957)


{'house': ['part', 'nature', 'minute', 'kind', 'why'],
 'letter': ['packet', 'present', 'montespan', 'mechanism', 'yet'],
 'eye': ['head', 'staring', 'face', 'forward', 'moment']}