# Overview of Popular Word Embeddings
## Overall Size

In [9]:
import pandas as pd

# Load the embedding table stored in mini.h5 (presumably the ConceptNet
# Numberbatch "mini" release -- TODO confirm the file's provenance).
conceptnet = pd.read_hdf('mini.h5')
# Print a summary of the frame: index range, column count, dtypes, memory use.
conceptnet.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1635499 entries, /c/de/####er to /c/zh/𫚉鱼
Columns: 300 entries, 0 to 299
dtypes: int8(300)
memory usage: 480.4+ MB


In [1]:
import fasttext.util
# Fetch the Chinese ('zh') fastText model. if_exists='ignore' is expected to
# skip the download when the file is already on disk -- see the fastText docs.
fasttext.util.download_model('zh', if_exists='ignore')
# Load the binary model from the relative path cc.zh.300.bin.
ft = fasttext.load_model('cc.zh.300.bin')

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.bin.gz



In [31]:
# List every file whose name starts with cc.zh.300.bin, with human-readable sizes.
!ls -lh cc.zh.300.bin*

-rw-r--r--  1 Ken  staff   6.7G Jan  1 11:05 cc.zh.300.bin
-rw-r--r--@ 1 Ken  staff   4.2G Jan  1 11:01 cc.zh.300.bin.gz


## Chinese vocabulary

In [10]:
# Keep only the Chinese rows. The trailing slash pins the match to the
# '/c/zh/' language segment (the same key form used for the per-word lookups
# in this notebook), so a language code that merely begins with "zh" cannot
# slip into the subset.
conceptnet_zh = conceptnet[conceptnet.index.str.startswith('/c/zh/')]
# Print a summary of the Chinese-only frame.
conceptnet_zh.info()

<class 'pandas.core.frame.DataFrame'>
Index: 270235 entries, /c/zh/##cm to /c/zh/𫚉鱼
Columns: 300 entries, 0 to 299
dtypes: int8(300)
memory usage: 79.4+ MB


In [11]:
# Number of words the loaded fastText model exposes through ft.words.
len(ft.words)

2000000

## Simple word comparison

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

def word_similarity(get_vector, word1: str, word2: str) -> float:
    """Return the cosine similarity between the embeddings of two words.

    Args:
        get_vector: Callable that maps a word to its embedding; the returned
            object must support ``reshape`` (e.g. a NumPy array).
        word1: First word to look up.
        word2: Second word to look up.

    Returns:
        The single entry of the 1x1 matrix produced by ``cosine_similarity``.
    """
    first, second = get_vector(word1), get_vector(word2)
    # cosine_similarity works on 2-D inputs, so present each vector as one row.
    rows = [first.reshape(1, -1), second.reshape(1, -1)]
    scores = cosine_similarity(*rows)
    return scores[0][0]

def compare_words(get_vector, word_pairs=None):
    """Print the cosine similarity of each word pair under one embedding.

    Args:
        get_vector: Callable mapping a word to its embedding vector; it is
            passed straight through to ``word_similarity``.
        word_pairs: Optional iterable of ``(word1, word2)`` pairs to score.
            When omitted (``None``), the built-in list of Chinese pairs below
            is used, which reproduces the original behaviour.

    Returns:
        None. One ``(word1, word2): score`` line is printed per pair.
    """
    if word_pairs is None:
        # Default benchmark pairs (a mix of Traditional and Simplified forms).
        word_pairs = [
            ('歡欣', '歡樂'),
            ('喜樂', '快樂'),
            ('高興', '喜樂'),
            ('服事', '事奉'),
            ('敬畏', '害怕'),
            ('祈禱', '是'),
            ('祈禱', '禱告'),
            ('要说', '挂虑'),
            ('喜樂', '苦難'),
            ('喜乐', '苦难'),
            ('喜乐', '荣耀'),
            ('喜乐', '财利')
        ]
    for word_pair in word_pairs:
        print(f'{word_pair}: {word_similarity(get_vector, *word_pair)}')

In [25]:
# Score the pairs with the ConceptNet table: each word is looked up under its
# '/c/zh/<word>' index label and the matching row is converted to a NumPy array.
# NOTE(review): .loc raises KeyError for a word that is absent from the index.
compare_words(lambda word: conceptnet_zh.loc[f'/c/zh/{word}'].to_numpy())

('歡欣', '歡樂'): 0.3720811684652536
('喜樂', '快樂'): 0.5790454036495138
('高興', '喜樂'): 0.4737319536936434
('服事', '事奉'): 0.7964751831365556
('敬畏', '害怕'): 0.288557222094775
('祈禱', '是'): 0.08305685928015732
('祈禱', '禱告'): 0.857233449674653
('要说', '挂虑'): 0.3776188264138286
('喜樂', '苦難'): 0.2665470615835611
('喜乐', '苦难'): 0.31398705785159253
('喜乐', '荣耀'): 0.35288771270497
('喜乐', '财利'): 0.28057389650354214


In [26]:
# Score the same pairs with fastText, passing the model's get_word_vector
# method directly as the lookup function.
compare_words(ft.get_word_vector)

('歡欣', '歡樂'): 0.5454975962638855
('喜樂', '快樂'): 0.5811575651168823
('高興', '喜樂'): 0.4076387882232666
('服事', '事奉'): 0.7572233080863953
('敬畏', '害怕'): 0.4587634205818176
('祈禱', '是'): 0.08488503843545914
('祈禱', '禱告'): 0.7417813539505005
('要说', '挂虑'): 0.27075880765914917
('喜樂', '苦難'): 0.5265867710113525
('喜乐', '苦难'): 0.49321168661117554
('喜乐', '荣耀'): 0.43432387709617615
('喜乐', '财利'): 0.3900814652442932
