# Applications of Word Embeddings, e.g. for Text Classification

In [1]:
# import some general modules
import logging
from pathlib import Path
import numpy as np
import tensorflow as tf
from tensorflow import keras

# set logging level (suggested: logging.INFO; for bug fixing: logging.DEBUG)
# logging_level = logging.INFO
logging_level = logging.DEBUG

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging_level)

RANDOM_SEED = 3756

# set size of Word Embeddings
EMBEDDING_DIM = 300

## 1. Loading the data 

We will use the course corpus (see Learnweb) for these experiments.

In [2]:
import gzip
import json


def load_data(filename):
    return json.loads(gzip.GzipFile(filename).read().decode('utf-8'))


### Corpus format:

	{
	'corpus creator 1' (string) : texts (list of strings),
	'corpus creator 2' (string) : texts (list of strings),
	...
	}


In [3]:
# !!! you might have to change the path here !!!

# corpus_data_file_path = Path('.') / 'nlpcm_corpus_1.json.gz'
corpus_data_file_path = 'nlpcm_corpus_1.json.gz'

course_corpus = load_data(corpus_data_file_path)

In [4]:

logging.info(' The course corpus consists of %d subcorpora:' % len(course_corpus))
index = 0
for designer, texts in course_corpus.items():
    logging.info(' %d: %6d texts gathered by %s with %d characters in total.' % (index, len(texts), designer, sum([len(text) for text in texts])))
    index += 1


INFO: The course corpus consists of 18 subcorpora:
INFO: 0:   7517 texts gathered by Alexandre Carey with 4802229 characters in total.
INFO: 1:   6585 texts gathered by Eunju Park with 4999802 characters in total.
INFO: 2:    561 texts gathered by Anthony Angrimson with 1500862 characters in total.
INFO: 3:   8241 texts gathered by Paweena Tarepakdee with 4999840 characters in total.
INFO: 4:  11909 texts gathered by Olalekan Olayemi with 4999945 characters in total.
INFO: 5:   5736 texts gathered by Godwin Ezeani with 4266978 characters in total.
INFO: 6:  15368 texts gathered by Atulya Praphul with 4999981 characters in total.
INFO: 7:     27 texts gathered by Muhammad Mehmood Ali with 1276567 characters in total.
INFO: 8:   3921 texts gathered by Haolong Yan with 3942391 characters in total.
INFO: 9:   1041 texts gathered by Patrick Schedlbauer with 4999721 characters in total.
INFO: 10:   9305 texts gathered by Luis Diego Rosello Cordero with 4999833 characters in total.
INFO: 11: 

In [5]:
# tokenize the entire corpus (this may take a few minutes!)
import nltk

sentences = []
index = 0
for designer, texts in course_corpus.items():
    print('Tokenizing text of subcorpus %d of %d' % (index, len(course_corpus)))
    index += 1
    for text in texts:
        for sentence in nltk.sent_tokenize(text, language="english"):
            tokenized_sentence = nltk.word_tokenize(sentence, language="english")
            sentences.append(tokenized_sentence)


Tokenizing text of subcorpus 0 of 18
Tokenizing text of subcorpus 1 of 18
Tokenizing text of subcorpus 2 of 18
Tokenizing text of subcorpus 3 of 18
Tokenizing text of subcorpus 4 of 18
Tokenizing text of subcorpus 5 of 18
Tokenizing text of subcorpus 6 of 18
Tokenizing text of subcorpus 7 of 18
Tokenizing text of subcorpus 8 of 18
Tokenizing text of subcorpus 9 of 18
Tokenizing text of subcorpus 10 of 18
Tokenizing text of subcorpus 11 of 18
Tokenizing text of subcorpus 12 of 18
Tokenizing text of subcorpus 13 of 18
Tokenizing text of subcorpus 14 of 18
Tokenizing text of subcorpus 15 of 18
Tokenizing text of subcorpus 16 of 18
Tokenizing text of subcorpus 17 of 18


In [6]:
print('Corpus number of sentences: %d' % len(sentences))
print('Corpus number of tokens: %d' % sum([len(sentence) for sentence in sentences]))

num_sent = 5
print('The first %d sentences' % num_sent)
for sentence in sentences[:num_sent]:
    print(sentence)


Corpus number of sentences: 658902
Corpus number of tokens: 14548903
The first 5 sentences
['I', 'love', 'all', 'of', 'the', 'new', 'editions', 'to', 'the', 'game', '.']
['I', 'love', 'that', 'it', 'finally', 'FEELS', 'like', 'pokemon', '.']
['The', 'problem', 'is', 'that', 'it', "'s", 'far', 'too', 'late', '.']
['The', 'fact', 'that', 'you', 'guys', 'were', 'so', 'eager', 'to', 'release', 'this', 'game', 'before', 'it', 'was', 'finished', ',', 'before', 'you', 'had', 'the', 'server', 'capacity', '(', 'that', "'s", 'not', 'totally', 'your', 'fault', 'though', 'this', 'game', 'initially', 'blew', 'up', 'more', 'than', 'anyone', 'could', 'have', 'thought', ')', ',', 'there', 'was', 'no', 'features', 'to', 'battle', ',', 'interact', 'with', 'friends', ',', 'no', 'raids', 'or', 'gym', 'battles', ',', 'the', 'radar', 'had', 'constant', 'problems', ',', 'you', 'guys', 'would', "n't", 'allow', 'anyone', 'to', 'use', 'radars', 'because', 'your', 'guys', "'", 'app', 'kind', 'of', 'sucked', 'at'

## 2. Load Word Embeddings

We need a word index, a dictionary which maps every word to a unique integer value (also with two special values for padding and unknown words: <PAD\> and <UNK\>).
Also we need an embedding_matrix, a numpy matrix which stores for each word from the word index a word embedding.

### Suggestion: Use pre-trained word embeddings trained on a large corpus

Some pre-trained word embeddings can be found here:<br>
 https://radimrehurek.com/gensim/models/keyedvectors.html<br>
 https://fasttext.cc/docs/en/english-vectors.html<br>
 https://nlp.stanford.edu/projects/glove/<br>
 https://www.spinningbytes.com/resources/wordembeddings/<br>
 https://code.google.com/archive/p/word2vec/
 

In [7]:
# for example, if you downloaded https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
from gensim.models import KeyedVectors

word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

nb_words = len(word2ind)+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word2ind.items():
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)

INFO:loading projection weights from GoogleNews-vectors-negative300.bin.gz
DEBUG:{'transport_params': None, 'ignore_ext': False, 'opener': None, 'closefd': True, 'newline': None, 'errors': None, 'encoding': None, 'buffering': -1, 'mode': 'rb', 'uri': 'GoogleNews-vectors-negative300.bin.gz'}


FileNotFoundError: [Errno 2] No such file or directory: 'GoogleNews-vectors-negative300.bin.gz'


### Alternatively: Compute Word2Vec embeddings yourself using corpus data
 Here I will use the Brown corpus. Please note that these embeddings probably might not work well for our data as the Brown corpus is relatively old and small.

In [8]:
# load library gensim (contains word2vec implementation)
import gensim

# ignore some warnings (probably caused by gensim version)
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
import multiprocessing
cores = multiprocessing.cpu_count() # Count the number of cores

# load corpus data
import nltk

nltk.download('brown')
from nltk.corpus import brown

w2v_model = gensim.models.Word2Vec(min_count=20,
                                   window=2,
                                   size=300,
                                   sample=6e-5, 
                                   alpha=0.03, 
                                   min_alpha=0.0007, 
                                   negative=20,
                                   workers=cores-1)

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [9]:
w2v_model.build_vocab(brown.sents(), progress_per=10000)
w2v_model.train(brown.sents(), total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
w2v_model.init_sims(replace=True)

INFO:collecting all words and their counts
INFO:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:PROGRESS: at sentence #10000, processed 219770 words, keeping 23488 word types
INFO:PROGRESS: at sentence #20000, processed 430477 words, keeping 34367 word types
INFO:PROGRESS: at sentence #30000, processed 669056 words, keeping 42365 word types
INFO:PROGRESS: at sentence #40000, processed 888291 words, keeping 49136 word types
INFO:PROGRESS: at sentence #50000, processed 1039920 words, keeping 53024 word types
INFO:collected 56057 word types from a corpus of 1161192 raw words and 57340 sentences
INFO:Loading a fresh vocabulary
INFO:effective_min_count=20 retains 5164 unique words (9% of original 56057, drops 50893)
INFO:effective_min_count=20 leaves 1003451 word corpus (86% of original 1161192, drops 157741)
INFO:deleting the raw counts dictionary of 56057 items
INFO:sample=6e-05 downsamples 646 most-common words
INFO:downsampling leaves estimated 389561 word corpus 

DEBUG:job loop exiting, total 117 jobs
DEBUG:worker exiting, processed 39 jobs
INFO:worker thread finished; awaiting finish of 2 more threads
INFO:worker thread finished; awaiting finish of 1 more threads
DEBUG:worker exiting, processed 38 jobs
DEBUG:worker exiting, processed 40 jobs
INFO:worker thread finished; awaiting finish of 0 more threads
INFO:EPOCH - 8 : training on 1161192 raw words (389642 effective words) took 7.1s, 54943 effective words/s
INFO:EPOCH 9 - PROGRESS: at 10.57% examples, 44585 words/s, in_qsize 0, out_qsize 0
INFO:EPOCH 9 - PROGRESS: at 21.75% examples, 44935 words/s, in_qsize 0, out_qsize 0
INFO:EPOCH 9 - PROGRESS: at 35.48% examples, 48749 words/s, in_qsize 0, out_qsize 0
INFO:EPOCH 9 - PROGRESS: at 49.43% examples, 51930 words/s, in_qsize 0, out_qsize 0
INFO:EPOCH 9 - PROGRESS: at 62.56% examples, 54238 words/s, in_qsize 0, out_qsize 0
INFO:EPOCH 9 - PROGRESS: at 81.06% examples, 54339 words/s, in_qsize 0, out_qsize 0
DEBUG:job loop exiting, total 117 jobs
DE

INFO:EPOCH 16 - PROGRESS: at 93.29% examples, 51077 words/s, in_qsize 0, out_qsize 0
DEBUG:job loop exiting, total 117 jobs
DEBUG:worker exiting, processed 39 jobs
DEBUG:worker exiting, processed 39 jobs
INFO:worker thread finished; awaiting finish of 2 more threads
DEBUG:worker exiting, processed 39 jobs
INFO:worker thread finished; awaiting finish of 1 more threads
INFO:worker thread finished; awaiting finish of 0 more threads
INFO:EPOCH - 16 : training on 1161192 raw words (389554 effective words) took 7.7s, 50556 effective words/s
INFO:EPOCH 17 - PROGRESS: at 11.48% examples, 47507 words/s, in_qsize 1, out_qsize 0
INFO:EPOCH 17 - PROGRESS: at 24.43% examples, 49346 words/s, in_qsize 0, out_qsize 0
INFO:EPOCH 17 - PROGRESS: at 37.81% examples, 51109 words/s, in_qsize 0, out_qsize 0
INFO:EPOCH 17 - PROGRESS: at 48.67% examples, 50114 words/s, in_qsize 0, out_qsize 0
INFO:EPOCH 17 - PROGRESS: at 58.31% examples, 48906 words/s, in_qsize 0, out_qsize 0
INFO:EPOCH 17 - PROGRESS: at 69.81

INFO:EPOCH - 23 : training on 1161192 raw words (389357 effective words) took 9.1s, 42894 effective words/s
INFO:EPOCH 24 - PROGRESS: at 9.64% examples, 41103 words/s, in_qsize 0, out_qsize 0
INFO:EPOCH 24 - PROGRESS: at 20.81% examples, 43418 words/s, in_qsize 0, out_qsize 0
INFO:EPOCH 24 - PROGRESS: at 32.60% examples, 43933 words/s, in_qsize 0, out_qsize 0
INFO:EPOCH 24 - PROGRESS: at 43.73% examples, 44918 words/s, in_qsize 0, out_qsize 0
INFO:EPOCH 24 - PROGRESS: at 54.57% examples, 46450 words/s, in_qsize 0, out_qsize 0
INFO:EPOCH 24 - PROGRESS: at 66.39% examples, 47519 words/s, in_qsize 0, out_qsize 0
INFO:EPOCH 24 - PROGRESS: at 82.18% examples, 47048 words/s, in_qsize 0, out_qsize 0
INFO:EPOCH 24 - PROGRESS: at 98.03% examples, 46502 words/s, in_qsize 0, out_qsize 0
DEBUG:job loop exiting, total 117 jobs
DEBUG:worker exiting, processed 39 jobs
INFO:worker thread finished; awaiting finish of 2 more threads
DEBUG:worker exiting, processed 38 jobs
DEBUG:worker exiting, processed

In [10]:
word_index = {"<PAD>": 0, "<UNK>": 1}
embedding_matrix = np.random.uniform(-1, 1, (len(word_index) + len(w2v_model.wv.vocab), EMBEDDING_DIM))
for word in w2v_model.wv.vocab:
    index = len(word_index)
    word_index[word] = index
    embedding_matrix[index] = w2v_model.wv[word]

## 3. Text classification

In [88]:
# define the classification task

sentence_min_length = 20

In [89]:
# prepare the data: we first need to create integer vectors using our word index to represent words as numerical values for the input layer of our network.

# We also have to tokenize the data:
# To be precise, we might want to use the exact same tokenizer which was used for preparing the data for training the word embeddings!
# Here, the NLTK-tokenizer is used. NOTE: this might take a few minutes!
import nltk
nltk.download('punkt')

instances = {designer: [] for designer in course_corpus}
for designer, texts in course_corpus.items():
    logging.info('Processing texts of designer %s; %d instances so far.' % (designer, len(instances)))
    for text in texts:
        sentences = nltk.sent_tokenize(text)
        for sentence in sentences:
            tokens = nltk.word_tokenize(sentence)
            if len(tokens) >= sentence_min_length:
                instances[designer].append([word_index.get(word, word_index["<UNK>"]) for word in tokens])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:Processing texts of designer Alexandre Carey; 18 instances so far.
INFO:Processing texts of designer Eunju Park; 18 instances so far.
INFO:Processing texts of designer Anthony Angrimson; 18 instances so far.
INFO:Processing texts of designer Paweena Tarepakdee; 18 instances so far.
INFO:Processing texts of designer Olalekan Olayemi; 18 instances so far.
INFO:Processing texts of designer Godwin Ezeani; 18 instances so far.
INFO:Processing texts of designer Atulya Praphul; 18 instances so far.
INFO:Processing texts of designer Muhammad Mehmood Ali; 18 instances so far.
INFO:Processing texts of designer Haolong Yan; 18 instances so far.
INFO:Processing texts of designer Patrick Schedlbauer; 18 instances so far.
INFO:Processing texts of designer Luis Diego Rosello Cordero; 18 instances so far.
INFO:Processing texts of designer Tanvi Vishw

In [104]:
# split into training and test data instances
import random
random.seed(RANDOM_SEED)

TEST_TRAIN_RATIO = 0.1

MAX_SEQUENCE_LENGTH = 40

def pad_input(sentences):
    return keras.preprocessing.sequence.pad_sequences(
        sentences, maxlen=MAX_SEQUENCE_LENGTH, dtype='int32', padding='pre', truncating='pre', value=word_index["<PAD>"])

train_labeled_data = []
test_labeled_data = []
designer_index = {}
for designer, designer_instances in instances.items():
    designer_index[designer] = len(designer_index)
    random.shuffle(designer_instances)
    test_labeled_data += [(inst, designer_index[designer]) for inst in designer_instances[:round(len(designer_instances)*TEST_TRAIN_RATIO)]]
    train_labeled_data += [(inst, designer_index[designer]) for inst in designer_instances[round(len(designer_instances)*TEST_TRAIN_RATIO):]]

random.shuffle(train_labeled_data)
train_data = pad_input([inst[0] for inst in train_labeled_data])
train_labels = [inst[1] for inst in train_labeled_data]

random.shuffle(test_labeled_data)
test_data = pad_input([inst[0] for inst in test_labeled_data])
test_labels = [inst[1] for inst in test_labeled_data]
#test_data = np.array(test_data)
#test_labels = np.array(test_labels)
logging.info('Train data instances: %d.' % len(train_data))
logging.info('Test data instances: %d.' % len(test_data))

INFO:Train data instances: 289913.
INFO:Test data instances: 32214.


In [100]:
print(max(train_labels))
print(min(train_labels))

17
0


In [101]:
#with tf.compat.v1.Session() as sesh:
#    train_labels = sesh.run(tf.one_hot(train_labels, 18))
    
    #y_val = sesh.run(tf.one_hot(y_val, 28))
 #   test_labels = sesh.run(tf.one_hot(test_labels,18))
#test_labels[:10]

In [126]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import InputLayer
from keras.layers.convolutional import Conv1D
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Dropout
model = Sequential()
model.add(InputLayer(input_shape=(40,)))
model.add(Embedding(len(word_index),
                                         EMBEDDING_DIM,
                                         weights=[embedding_matrix],
                                         input_length=40,
                                         trainable=False))
model.add(Conv1D(filters=64, kernel_size=2, activation='relu'))
model.add(Dropout(0.5))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=34, kernel_size=2, activation='relu'))
model.add(Dropout(0.5))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(100, activation='sigmoid'))
model.add(Dense(len(course_corpus),
                                  activation='softmax'))
model.compile(loss='SparseCategoricalCrossentropy', optimizer='adam', metrics=['accuracy'])

In [141]:
train_limit = 30000
model.fit(np.array(train_data[:train_limit]),
          np.array(train_labels[:train_limit]),
          epochs=10,
          batch_size = 1012,
          verbose=1,
          validation_data=(np.array(test_data[:test_limit]), np.array(test_labels[:test_limit])))
#_, accuracy = model.evaluate(testX, testy, batch_size=batch_size, verbose=0)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x256bc559d68>

In [144]:
# test
test_limit = 10000
model.evaluate(np.array(test_data[:test_limit]), np.array(test_labels[:test_limit]))



[1.22384512424469, 0.6139000058174133]

In [145]:
print(test_labels[:10])
print([max([(ind, v) for ind, v in enumerate(pred)], key=lambda p: p[1])[0] for pred in model.predict(test_data[:10])])

[5, 1, 12, 8, 4, 3, 10, 16, 12, 17]
[5, 1, 12, 8, 8, 3, 10, 13, 12, 17]
