In [1]:
import gensim, os, re
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

Using Theano backend.


In [44]:
# Load a previously saved word2vec model from disk (nothing is trained here).
# The location can be overridden with the W2V_MODEL_PATH environment variable;
# the default is the original author's local path.
w2v_model_path = os.environ.get(
    'W2V_MODEL_PATH',
    '/Users/AUM/Documents/PROJECT/mockup_senior_project/thai_text_mock_up/word2vec/w2v.bin'
)
model = gensim.models.Word2Vec.load(w2v_model_path)

In [12]:
# Collect every token of the model's vocabulary, in iteration order
words = list(model.wv.vocab)

In [4]:
# Report how many tokens were collected from the vocabulary
token_total = len(words)
print("Number of Tokens: {}".format(token_total))

Number of Tokens: 30225


In [6]:
# Word whose nearest neighbours we want to inspect
find_similar_to = 'ลาดพร้าว'

# Query through `model.wv` (the keyed vectors), consistent with the
# `model.wv.vocab` access used elsewhere in this notebook; calling
# `similar_by_word` directly on the Word2Vec model is deprecated in gensim.
# No `topn` is passed, so the library default (top 10) applies.
for neighbour, score in model.wv.similar_by_word(find_similar_to):
    print("Word: {0}, Similarity: {1:.2f}".format(neighbour, score))

Word: สุขุมวิท, Similarity: 0.71
Word: เอกมัย, Similarity: 0.68
Word: สีลม, Similarity: 0.67
Word: รัชดาภิเษก, Similarity: 0.67
Word: พญาไท, Similarity: 0.63
Word: เพชรเกษม, Similarity: 0.61
Word: อังรีดูนังต์, Similarity: 0.59
Word: สุวินทวงศ์, Similarity: 0.59
Word: จตุจักร, Similarity: 0.58
Word: พลาซ่า, Similarity: 0.58


In [14]:
# Test words: vectors of `word_add` are added, vectors of `word_sub` subtracted
word_add = ['นางสาว', 'ค่ะ']
word_sub = ['นาย']

# Word vector addition and subtraction.
# Query through `model.wv` (the keyed vectors), consistent with the
# `model.wv.vocab` access used elsewhere in this notebook; calling
# `most_similar` directly on the Word2Vec model is deprecated in gensim.
for candidate, score in model.wv.most_similar(
    positive=word_add, negative=word_sub
):
    print("Word : {0} , Similarity: {1:.2f}".format(candidate, score))

Word : ครับ , Similarity: 0.59
Word : ครับ? , Similarity: 0.52
Word : คับ , Similarity: 0.49
Word : ครับ... , Similarity: 0.48
Word : คะ? , Similarity: 0.48
Word : เหรอ , Similarity: 0.46
Word : ดิฉัน , Similarity: 0.46
Word : อ่ะ , Similarity: 0.45
Word : ปล. , Similarity: 0.42
Word : อะไร , Similarity: 0.42


In [59]:
# Limit number of tokens to be visualized
limit = 500
# Expected length of each word vector; the reshape below fails if the model's
# vectors do not match it.
vector_dim = 100

# Take at most `limit` tokens in vocabulary iteration order. Slicing instead of
# counting inside a loop also copes with vocabularies smaller than `limit`,
# where the previous fixed-size reshape would raise.
words = list(model.wv.vocab)[:limit]

# Look up every vector first and build the array in one go; growing the array
# with np.append inside a loop copies the whole buffer on each iteration.
# float64 matches the dtype the original np.append-based construction produced.
embedding = np.array(
    [model.wv[word] for word in words], dtype=np.float64
).reshape(len(words), vector_dim)

In [67]:
# Font file used for the labels drawn by the plotting code below. Override the
# location with the THAI_FONT_PATH environment variable; the default is the
# original author's local path.
ThaiFont = FontProperties(
    fname=os.environ.get(
        'THAI_FONT_PATH',
        '/Users/AUM/Library/Fonts/THSarabunChula-Regular.ttf'
    )
)

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Scatter-plot 2-D points, annotate each with its label, and save the figure.

    Args:
        low_dim_embs: array indexed as ``low_dim_embs[i, :]``; each row is
            unpacked into an ``(x, y)`` pair. Must have at least as many rows
            as there are labels.
        labels: sequence of label strings; ``labels[i]`` annotates row ``i``.
        filename: path the figure is written to (saved at 300 dpi).

    Raises:
        AssertionError: if there are more labels than embedding rows.
    """
    assert len(labels) <= low_dim_embs.shape[0], "More labels than embeddings"

    # Annotation settings shared by every label; the text is drawn with the
    # module-level ThaiFont.
    annotation_style = dict(
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
        fontproperties=ThaiFont,
    )

    plt.figure(figsize=(18, 18))  # in inches
    for row, text in enumerate(labels):
        px, py = low_dim_embs[row, :]
        plt.scatter(px, py)
        plt.annotate(text, xy=(px, py), **annotation_style)
    plt.savefig(filename, dpi=300)
    
# Creating the tsne plot [Warning: will take time]
# A fixed random_state makes the projection (and therefore the saved figure)
# repeatable across runs; without it each run can produce a different layout.
tsne = TSNE(perplexity=30.0, n_components=2, init='pca', n_iter=5000,
            random_state=42)

low_dim_embedding = tsne.fit_transform(embedding)

# Finally plotting and saving the fig
plot_with_labels(low_dim_embedding, words)