# Overview of Popular Word Embeddings
## Overall Size

In [23]:
import pandas as pd

# Load the embedding table stored in the local HDF5 file (presumably the
# ConceptNet Numberbatch "mini" release -- confirm provenance) and print a
# summary of its index, columns, dtypes and memory footprint.
conceptnet = pd.read_hdf('mini.h5')
conceptnet.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1635499 entries, /c/de/####er to /c/zh/𫚉鱼
Columns: 300 entries, 0 to 299
dtypes: int8(300)
memory usage: 480.4+ MB


In [1]:
import fasttext.util
# Fetch the Chinese ('zh') fastText model; if_exists='ignore' is intended to
# skip the download when the file is already present. Then load the binary
# from the current working directory.
fasttext.util.download_model('zh', if_exists='ignore')
ft = fasttext.load_model('cc.zh.300.bin')

In [24]:
# Show the on-disk size of every file whose name starts with cc.zh.300.bin.
!ls -lh cc.zh.300.bin*

-rw-r--r--  1 Ken  staff   6.7G Jan  1 11:05 cc.zh.300.bin
-rw-r--r--@ 1 Ken  staff   4.2G Jan  1 11:01 cc.zh.300.bin.gz


## Chinese vocabulary

In [25]:
# Keep only the rows whose index label lives under the Chinese namespace.
# The trailing slash matters: a bare '/c/zh' prefix would also match any other
# language code that merely begins with "zh", while lookups into this frame
# always use the '/c/zh/<word>' form.
conceptnet_zh = conceptnet[conceptnet.index.str.startswith('/c/zh/')]
conceptnet_zh.info()

<class 'pandas.core.frame.DataFrame'>
Index: 270235 entries, /c/zh/##cm to /c/zh/𫚉鱼
Columns: 300 entries, 0 to 299
dtypes: int8(300)
memory usage: 79.4+ MB


In [26]:
# Number of entries in the fastText model's word list.
len(ft.words)

2000000

## Simple word comparison

In [56]:
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity

def word_similarity(get_vector, word1: str, word2: str) -> float:
    """Return the cosine similarity between the embeddings of two words.

    Args:
        get_vector: Callable mapping a word to its embedding. The returned
            object must support ``reshape`` (e.g. a 1-D NumPy array).
        word1: First word to look up.
        word2: Second word to look up.

    Returns:
        The cosine similarity as a built-in ``float``.

    Raises:
        Whatever ``get_vector`` raises for a word it cannot resolve.
    """
    vec1 = get_vector(word1)
    vec2 = get_vector(word2)
    # cosine_similarity operates on 2-D inputs, so each vector is presented as
    # a single-row matrix; the result is then a 1x1 matrix.
    similarity_matrix = cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))
    # Coerce the NumPy scalar to a plain float so the value matches the return
    # annotation and is displayed as a bare number regardless of NumPy version.
    return float(similarity_matrix[0][0])

def compare_words(get_vector, words=None, top_n=10):
    """Print the highest-scoring word pairs under a given embedding lookup.

    Every unordered pair of distinct words is scored with ``word_similarity``
    and the best pairs are printed, one per line, as ``(w1, w2): score`` in
    descending order of score.

    Args:
        get_vector: Callable mapping a word to its embedding vector; passed
            straight through to ``word_similarity``.
        words: Optional iterable of words to compare. Repeated words are
            ignored. When omitted, the notebook's built-in sample list is used.
        top_n: How many of the best-scoring pairs to print. ``None`` prints
            every pair.

    Returns:
        None. Results are written to standard output only.
    """
    if words is None:
        words = [
            '歡欣', '歡樂', '喜樂', '快樂', '高興', '喜樂', '服事', '事奉', '敬畏', '害怕', '祈禱',
            '是', '祈禱', '禱告', '要说', '挂虑', '喜樂', '苦難', '喜乐', '苦难', '喜乐', '荣耀', '喜乐', '财利'
        ]
    # dict.fromkeys drops repeats while keeping first-appearance order, so the
    # pairs are generated in the same order as the input list.
    unique_words = list(dict.fromkeys(words))
    # combinations() over unique items never pairs a word with itself, so no
    # additional self-pair filter is needed.
    word_scores = {
        word_pair: word_similarity(get_vector, *word_pair)
        for word_pair in combinations(unique_words, 2)
    }
    ranked_pairs = sorted(word_scores, key=word_scores.get, reverse=True)
    for pair in ranked_pairs[:top_n]:
        print(f'{pair}: {word_scores[pair]}')

In [60]:
def ccn_get_word_vector(word):
    """Look up a word's row in ``conceptnet_zh`` and return it as a NumPy array.

    Args:
        word: The word to look up; it is appended to the ``/c/zh/`` prefix to
            form the index label.

    Returns:
        The matching row's values as a NumPy array.

    Raises:
        KeyError: If ``conceptnet_zh`` has no row with that label.
    """
    concept_label = f'/c/zh/{word}'
    embedding_row = conceptnet_zh.loc[concept_label]
    return embedding_row.to_numpy()

# Rank the sample word pairs using the vectors looked up from conceptnet_zh.
compare_words(ccn_get_word_vector)

('喜樂', '喜乐'): 0.9092429111196532
('苦難', '苦难'): 0.898737950176371
('祈禱', '禱告'): 0.857233449674653
('快樂', '高興'): 0.8516892402941497
('歡樂', '快樂'): 0.8374214375408773
('服事', '事奉'): 0.7964751831365556
('歡樂', '高興'): 0.7595223770702401
('歡樂', '喜樂'): 0.7027476227631397
('歡樂', '喜乐'): 0.633865849560741
('喜樂', '快樂'): 0.5790454036495138


In [58]:
# Same ranking, this time using the fastText model's vectors.
compare_words(ft.get_word_vector)

('服事', '事奉'): 0.7572233080863953
('祈禱', '禱告'): 0.7417813539505005
('苦難', '苦难'): 0.6590461730957031
('歡樂', '快樂'): 0.6331542134284973
('喜樂', '快樂'): 0.5811575651168823
('喜樂', '喜乐'): 0.5609360337257385
('喜樂', '禱告'): 0.556587815284729
('快樂', '高興'): 0.5492446422576904
('歡欣', '歡樂'): 0.5454975962638855
('喜樂', '苦難'): 0.5265867710113525


In [67]:
# Spot-check a single pair using the vectors looked up from conceptnet_zh.
word_similarity(ccn_get_word_vector, '喜樂', '歡歡喜喜')

0.2727139927512483