In [None]:
import gensim
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import os
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

Using Theano backend.


In [2]:
# Load the saved word2vec model from disk (nothing is trained in this cell)
w2v_model_path = '/Users/AUM/Documents/PROJECT/mockup_senior_project/thai_text_mock_up/word2vec/w2v.bin'
model = gensim.models.Word2Vec.load(w2v_model_path)

In [3]:
# Collect every token of the model's vocabulary into a plain list
words = list(model.wv.vocab)

In [4]:
# Report how many tokens were collected from the vocabulary
token_count = len(words)
print("Number of Tokens: {}".format(token_count))

Number of Tokens: 74664


In [30]:
# Token whose nearest neighbours we want to inspect
find_similar_to = '00-0000-0000'

# Finding out similar words [default= top 10].
# The query goes through model.wv (the keyed-vectors object this notebook
# already uses for the vocabulary) rather than the deprecated model-level
# shortcut. Each result is a (token, similarity) pair.
for similar_word, similarity in model.wv.similar_by_word(find_similar_to):
    print("Word: {0}, Similarity: {1:.2f}".format(similar_word, similarity))

Word: 0-0000-0000, Similarity: 0.81
Word: 00-000-0000, Similarity: 0.75
Word: 00000, Similarity: 0.67
Word: 0 0000 0000, Similarity: 0.67
Word: โทร., Similarity: 0.66
Word: N00, Similarity: 0.63
Word: อีเมล์, Similarity: 0.62
Word: 0-0000-0000-00, Similarity: 0.62
Word: 000-0000000, Similarity: 0.60
Word: เวสเทิร์นยูเนี่ยน, Similarity: 0.60


In [47]:
# Test words: vectors in word_add are added, vectors in word_sub are subtracted
word_add = ['นายกรัฐมนตรี', 'นาง']
word_sub = ['นาย']

# Word vector addition and subtraction.
# The query goes through model.wv rather than the deprecated model-level
# shortcut. Each result is a (token, similarity) pair.
for resultant_word, similarity in model.wv.most_similar(
    positive=word_add, negative=word_sub
):
    print("Word : {0} , Similarity: {1:.2f}".format(resultant_word, similarity))

Word : จุฬาราชมนตรี , Similarity: 0.67
Word : ปิยะสกล , Similarity: 0.65
Word : เมฆสวรรค์ , Similarity: 0.64
Word : มิลล์วีนา , Similarity: 0.64
Word : เลขานุการ , Similarity: 0.64
Word : พระครู , Similarity: 0.63
Word : หงษ์เหิน , Similarity: 0.63
Word : ม.ร.ว. , Similarity: 0.63
Word : ปวีณา , Similarity: 0.63
Word : ติณสูลานนท์ , Similarity: 0.63


In [59]:
# Limit number of tokens to be visualized
limit = 500
# Read the vector width from the model instead of hard-coding it, so the cell
# also works for a model trained with a different dimensionality.
vector_dim = model.vector_size

# Getting tokens: the first `limit` vocabulary entries (fewer if the
# vocabulary is smaller than `limit`)
words = list(model.wv.vocab)[:limit]

# Getting vectors: build the matrix in a single allocation. Growing it with
# np.append once per token would copy the whole array on every step.
# float64 is the dtype that appending onto an empty np.array([]) yields.
embedding = np.array([model.wv[word] for word in words], dtype=np.float64)

# Reshape by the number of tokens actually collected, so a vocabulary shorter
# than `limit` (or an empty one) still gives a (n_tokens, vector_dim) matrix.
embedding = embedding.reshape(len(words), vector_dim)

In [67]:
# Font properties handed to the plot annotations, loaded from a local font file
thai_font_path = '/Users/AUM/Library/Fonts/THSarabunChula-Regular.ttf'
ThaiFont = FontProperties(fname=thai_font_path)

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Scatter-plot 2-D points, annotate each one with its label, save the figure.

    Args:
        low_dim_embs: array indexed as ``low_dim_embs[k, :]`` to obtain the
            (x, y) pair of point k; needs at least as many rows as labels.
        labels: sequence of label texts; label k annotates row k.
        filename: path the figure is saved to (at 300 dpi).

    Raises:
        AssertionError: if there are more labels than rows.
    """
    assert len(labels) <= low_dim_embs.shape[0], "More labels than embeddings"
    fig, ax = plt.subplots(figsize=(18, 18))  # in inches
    annotation_style = dict(
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
        fontproperties=ThaiFont,
    )
    for row, text in enumerate(labels):
        x, y = low_dim_embs[row, :]
        # One scatter call per point, so each point is drawn separately
        ax.scatter(x, y)
        ax.annotate(text, xy=(x, y), **annotation_style)
    fig.savefig(filename, dpi=300)
    
# Creating the tsne plot [Warning: will take time]
# random_state is pinned so that re-running the cell on the same embedding
# gives the same layout instead of a different one on every run.
tsne = TSNE(perplexity=30.0, n_components=2, init='pca', n_iter=5000,
            random_state=0)

# Project the word vectors down to 2-D points
low_dim_embedding = tsne.fit_transform(embedding)

# Finally plotting and saving the fig 
plot_with_labels(low_dim_embedding, words)