[View in Colaboratory](https://colab.research.google.com/github/Masum06/gender_newspaper/blob/master/Create_Embeddings.ipynb)

In [0]:
import gensim, logging, os, re, string, tensorflow
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
from gensim.utils import simple_preprocess

# Record the library versions this notebook was executed with
gensim_version = gensim.__version__
tf_version = tf.__version__
print('gensim version: \t%s' % gensim_version)
print('TensorFlow version: \t%s' % tf_version)

gensim version: 	3.4.0
TensorFlow version: 	1.8.0


## Config

In [0]:
# Surface gensim's INFO-level log messages in the notebook output
logging.basicConfig(level=logging.INFO, format='%(levelname)s : %(message)s')

# Input: folder holding the raw txt-files
TEXT_DIR = 'data/yelp/train'

# Output: folder that receives the checkpoint and metadata
MODEL_DIR = 'emb_yelp/'

# Word2vec vector size
EMBEDDING_SIZE = 300

## Preprocessing

In [0]:
def clean_doc(doc):
    """
    Normalise a raw document into a single string of space-separated tokens.

    Steps, in order: lowercase, delete digit runs, split on whitespace,
    delete ASCII punctuation (``string.punctuation``) from each token and
    drop tokens shorter than two characters.

    :param doc: raw document text (str)
    :return: the surviving tokens joined by single spaces (str)
    """
    # Build the punctuation-deleting table once per call, not once per token
    strip_punct = str.maketrans('', '', string.punctuation)

    # Lowercase
    doc = doc.lower()
    # Remove numbers
    doc = re.sub(r"[0-9]+", "", doc)
    # Split into tokens and remove punctuation from each one
    tokens = [w.translate(strip_punct) for w in doc.split()]
    # Tokens with less than two characters are ignored
    tokens = [word for word in tokens if len(word) > 1]
    return ' '.join(tokens)


def read_files(path):
    """
    Read, clean and tokenise every regular file directly inside ``path``.

    Entries are visited in sorted name order so the corpus order is the same
    on every run (``os.listdir`` gives no ordering guarantee). Anything that
    is not a regular file, e.g. Jupyter's ``.ipynb_checkpoints`` directory,
    is skipped instead of making ``open`` fail.

    :param path: directory containing UTF-8 encoded text files
    :return: list with one entry per file, each entry being the output of
             ``simple_preprocess`` on the cleaned text; an empty list if
             ``path`` is not a directory
    """
    documents = list()

    if not os.path.isdir(path):
        # Keep the existing contract (empty corpus) but make the cause visible
        logging.warning('%s is not a directory, no documents were read', path)
        return documents

    # Read in all files in directory
    for filename in sorted(os.listdir(path)):
        file_path = os.path.join(path, filename)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, encoding='utf-8') as f:
            doc = f.read()
        documents.append(simple_preprocess(clean_doc(doc)))
    return documents

# Load the whole corpus from TEXT_DIR and report how many documents were read
docs = read_files(TEXT_DIR)
doc_count = len(docs)
print('Number of documents: %i' % doc_count)

Number of documents: 200000


## Training model

In [0]:
# Train word2vec on the tokenised documents; only the vector size is set
# explicitly, every other hyper-parameter keeps the library default
model = gensim.models.Word2Vec(sentences=docs, size=EMBEDDING_SIZE)

INFO : collecting all words and their counts
INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO : PROGRESS: at sentence #10000, processed 1332433 words, keeping 34615 word types
INFO : PROGRESS: at sentence #20000, processed 2682863 words, keeping 51033 word types
INFO : PROGRESS: at sentence #30000, processed 4013459 words, keeping 63788 word types
INFO : PROGRESS: at sentence #40000, processed 5386954 words, keeping 75163 word types
INFO : PROGRESS: at sentence #50000, processed 6725433 words, keeping 85157 word types
INFO : PROGRESS: at sentence #60000, processed 8069244 words, keeping 94511 word types
INFO : PROGRESS: at sentence #70000, processed 9429856 words, keeping 103236 word types
INFO : PROGRESS: at sentence #80000, processed 10766708 words, keeping 111540 word types
INFO : PROGRESS: at sentence #90000, processed 12117230 words, keeping 119401 word types
INFO : PROGRESS: at sentence #100000, processed 13487814 words, keeping 127178 word types
INFO

INFO : PROGRESS: at 45.92% examples, 549012 words/s, in_qsize 5, out_qsize 0
INFO : PROGRESS: at 46.49% examples, 549227 words/s, in_qsize 5, out_qsize 0
INFO : PROGRESS: at 47.02% examples, 549361 words/s, in_qsize 5, out_qsize 0
INFO : PROGRESS: at 47.57% examples, 549318 words/s, in_qsize 5, out_qsize 0
INFO : PROGRESS: at 48.12% examples, 549365 words/s, in_qsize 5, out_qsize 0
INFO : PROGRESS: at 48.66% examples, 549447 words/s, in_qsize 5, out_qsize 0
INFO : PROGRESS: at 49.21% examples, 549622 words/s, in_qsize 5, out_qsize 0
INFO : PROGRESS: at 49.75% examples, 549740 words/s, in_qsize 5, out_qsize 0
INFO : PROGRESS: at 50.39% examples, 549725 words/s, in_qsize 5, out_qsize 0
INFO : PROGRESS: at 51.15% examples, 549719 words/s, in_qsize 5, out_qsize 0
INFO : PROGRESS: at 51.92% examples, 549813 words/s, in_qsize 5, out_qsize 0
INFO : PROGRESS: at 52.65% examples, 549782 words/s, in_qsize 5, out_qsize 0
INFO : PROGRESS: at 53.41% examples, 549901 words/s, in_qsize 5, out_qsize 0

## Saving model

In [0]:
# Create the output directory if it is missing; exist_ok replaces the
# separate exists() check and its check-then-create race
os.makedirs(MODEL_DIR, exist_ok=True)
# Persist the gensim model as MODEL_DIR/word2vec
model.save(os.path.join(MODEL_DIR,'word2vec'))

INFO : saving Word2Vec object under emb_yelp/word2vec, separately None
INFO : not storing attribute vectors_norm
INFO : not storing attribute cum_table
INFO : saved emb_yelp/word2vec


## Creating checkpoint and metadata

In [0]:
# Embedding matrix and vocabulary list taken from the trained model
weights     = model.wv.vectors
index_words = model.wv.index2word

vocab_size    = weights.shape[0]
embedding_dim = weights.shape[1]

print('Shape of weights:', weights.shape)
print('Vocabulary size: %i' % vocab_size)
print('Embedding size: %i'  % embedding_dim)

# Metadata file: one vocabulary word per line, in index order.
# Written explicitly as UTF-8 because the corpus is read as UTF-8 and may
# contain non-ASCII words that the platform default encoding cannot represent.
with open(os.path.join(MODEL_DIR,'metadata.tsv'), 'w', encoding='utf-8') as f:
    # write(), not writelines(): the joined text is a single string
    f.write("\n".join(index_words))

# Required if you re-run without restarting the kernel
tf.reset_default_graph()

# W is created as zeros and filled through a placeholder, so the weight
# matrix is fed at run time instead of being baked into the graph as a constant
W = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_dim]), trainable=False, name="W")
embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_dim])

embedding_init = W.assign(embedding_placeholder)
writer = tf.summary.FileWriter(MODEL_DIR, graph=tf.get_default_graph())
saver = tf.train.Saver()

try:
    # Projector config: link the variable W to the metadata file in MODEL_DIR
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = W.name
    embedding.metadata_path = './metadata.tsv'
    projector.visualize_embeddings(writer, config)

    with tf.Session() as sess:
        # Copy the trained vectors into W, then checkpoint it
        sess.run(embedding_init, feed_dict={embedding_placeholder: weights})
        # NOTE: the checkpoint name "model.cpkt" is kept unchanged on purpose
        save_path = saver.save(sess, os.path.join(MODEL_DIR, "model.cpkt"))
finally:
    # Flush the event file and release the handle, even if saving failed
    writer.close()

Shape of weights: (42113, 300)
Vocabulary size: 42113
Embedding size: 300


## Example

In [0]:
# Sanity check: the ten most similar vocabulary entries to a probe word
probe_words = ['coffee']
model.wv.most_similar(positive=probe_words, topn=10)

INFO : precomputing L2-norms of word weight vectors


[('espresso', 0.6709840893745422),
 ('latte', 0.6611574292182922),
 ('cappuccino', 0.6460868716239929),
 ('tea', 0.643097996711731),
 ('lattes', 0.613446056842804),
 ('coffees', 0.612466037273407),
 ('teas', 0.5807890295982361),
 ('chai', 0.567467451095581),
 ('mocha', 0.565311074256897),
 ('gelato', 0.5606527328491211)]