In [1]:
from gensim.models import FastText
from gensim.models.word2vec import LineSentence
import multiprocessing

WIKI_SEG_TXT = "wiki_seg.txt"

max_cpu_counts = multiprocessing.cpu_count()
word_dim_size = 300 

sentences = LineSentence(WIKI_SEG_TXT)

model = FastText(sentences, vector_size=word_dim_size, workers=max_cpu_counts)

output_model = f"fasttext.zh.{word_dim_size}.model"
model.save(output_model)

print(model.wv.vectors.shape)
model.wv.vectors

model = FastText.load('fasttext.zh.300.model') 
vocab = model.wv.key_to_index

print(f"總共收錄了 {len(vocab)} 個詞彙")

print("印出 20 個收錄詞彙:")
print(list(vocab.keys())[:20])

(1281108, 300)
總共收錄了 1281108 個詞彙
印出 20 個收錄詞彙:
['年', '月', '日', '中', '10', '12', '11', '小行星', '中國', '時', '–', '日本', '美國', '20', '香港', '臺灣', '15', '位於', '30', '站']


In [2]:
vec = model.wv['數學家']
print(vec.shape)
vec 

(300,)


array([-2.40173921e-01, -1.11621618e-03, -1.26872182e+00,  6.97947383e-01,
       -1.07644582e+00, -6.57272637e-01, -2.25588131e+00, -7.36904502e-01,
        7.41719127e-01, -6.31970823e-01,  1.29935861e+00,  6.28995061e-01,
        1.59047210e+00,  6.71653748e-01, -7.50802875e-01,  1.89570022e+00,
        1.70877051e+00, -1.40614212e+00,  8.60415339e-01, -3.44749284e+00,
       -1.06719100e+00, -6.77665651e-01,  5.80408573e-01, -2.30039167e+00,
        1.19669712e+00,  9.24629271e-01,  4.42754894e-01,  1.17526305e+00,
        6.15279675e-01,  2.01202065e-01, -2.06836200e+00, -2.81031775e+00,
        8.38559926e-01,  6.51966512e-01, -1.56953883e+00, -1.37281775e+00,
       -7.60987341e-01,  6.63723284e-03,  9.61589813e-03,  5.70674241e-01,
        1.66490090e+00, -2.67470741e+00, -1.38995543e-01, -8.97718146e-02,
       -4.06953394e-01, -2.78019118e+00,  5.66998899e-01,  3.60473722e-01,
        4.64375496e-01,  3.86218280e-01, -3.82440257e+00, -5.71132042e-02,
       -4.35999595e-02,  

In [3]:
word = "這肯定沒見過 "

# 若強行取值會報錯
try:
    vec = model.wv[word]
except KeyError as e:
    print(e)

In [4]:
model.wv.most_similar("飲料", topn=10)

[('輝劍', 0.9708890914916992),
 ('名松', 0.9569700360298157),
 ('飲料類', 0.9305379986763),
 ('飲料機', 0.9210789799690247),
 ('飲料罐', 0.9001978635787964),
 ('軟飲料', 0.887269139289856),
 ('經米濱', 0.8780335783958435),
 ('茶飲料', 0.8691356182098389),
 ('飲品', 0.846047580242157),
 ('廣慈宮', 0.797712504863739)]

In [5]:
model.wv.most_similar("car")

[('carcar', 0.8578386902809143),
 ('hcar', 0.8530926704406738),
 ('ccar', 0.8385680317878723),
 ('jetcar', 0.8157699704170227),
 ('zipcar', 0.8092700839042664),
 ('necar', 0.8080305457115173),
 ('tramcar', 0.807368814945221),
 ('cars', 0.8029255270957947),
 ('motorcar', 0.7994137406349182),
 ('indycar', 0.7992186546325684)]

In [6]:
model.wv.most_similar("facebook")

[('youtubefacebook', 0.9263174533843994),
 ('thefacebook', 0.9004828929901123),
 ('facebookpage', 0.8896870613098145),
 ('facebox', 0.8654385805130005),
 ('instagram', 0.8053536415100098),
 ('twitteryoutube', 0.7763373851776123),
 ('twitter', 0.7609679102897644),
 ('googleyoutube', 0.7585869431495667),
 ('youtube', 0.745212197303772),
 ('lnstagram', 0.7385021448135376)]

In [7]:
model.wv.most_similar("詐欺")

[('賈邱', 0.8894781470298767),
 ('赤坑鎮', 0.8356450796127319),
 ('中境', 0.810566246509552),
 ('越中境', 0.7931427955627441),
 ('詐欺罪', 0.784037172794342),
 ('他魚', 0.7666366696357727),
 ('欺詐', 0.7374577522277832),
 ('抱出', 0.7082515954971313),
 ('欺詐案', 0.6659218072891235),
 ('義德堂', 0.6421015858650208)]

In [8]:
model.wv.most_similar("合約")

[('德康', 0.9161204695701599),
 ('合同', 0.8144538998603821),
 ('綠蠅', 0.7605901956558228),
 ('合同額', 0.7385705709457397),
 ('合同期', 0.7291205525398254),
 ('合同商', 0.7239711880683899),
 ('籤合同', 0.716241717338562),
 ('合同制', 0.710429310798645),
 ('簽約', 0.6969977617263794),
 ('合同條款', 0.6920885443687439)]

In [9]:
model.wv.similarity("連結", "鍵接")

0.42695913

In [10]:
model.wv.similarity("連結", "陰天")

-0.03664172

In [11]:
print(f"Loading {output_model}...")
new_model = FastText.load(output_model)

Loading fasttext.zh.300.model...


In [12]:
model.wv.similarity("連結", "陰天") == new_model.wv.similarity("連結", "陰天")

True