# 作業 : 變更不同的 GloVe 模型, 並使用 gensim 套件觀察模型表現

# [作業目標]
- 載入不同版本的 GloVe 模型, 並觀察模型間有何差異

# [作業重點]
- 觀察 GloVe 不同的預訓練詞向量, 效果間的差異

# Step 1
- 到 GloVe 官方網站 (https://github.com/stanfordnlp/GloVe), 由四組預訓練模型選擇一項下載
- 將模型檔解壓縮後, 選擇並複製詞向量檔到本程式同一執行目錄中
- 依照你所選擇的詞向量檔, 修改設定模型中的"input_file"與"output_file", 再執行後續程式

In [1]:
!pip install wget



In [2]:
import wget, requests, sys

def bar_progress(current, total, width=80):
  progress_message = "Downloading: %d%% [%d / %d] MiB" % (current / total * 100, current/1000000, total/1000000)
  # Don't use print() as it will print in new line every time.
  sys.stdout.write("\r" + progress_message)
  sys.stdout.flush()

# Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB)
glove_42b_300d = 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.42B.300d.zip' 

# Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB)
glove_820b_300d = 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.840B.300d.zip'

# Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 300d vectors, 822 MB)
glove_6b = 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.6B.zip'

# Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased, 200d vectors, 1.42 GB)
glove_twitter_27b = 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.twitter.27B.zip'


# download model
wget.download(glove_42b_300d, bar=bar_progress)
wget.download(glove_820b_300d, bar=bar_progress)
wget.download(glove_6b, bar=bar_progress)
wget.download(glove_twitter_27b, bar=bar_progress)

Downloading: 100% [1520 / 1520] MiB

'glove.twitter.27B.zip'

In [4]:
import zipfile
files = ['/content/glove.42B.300d.zip', '/content/glove.6B.zip', '/content/glove.twitter.27B.zip', '/content/glove.840B.300d.zip']

for path in files: 
  with zipfile.ZipFile(path, 'r') as zip_ref:
    zip_ref.extractall('/content/word_vectors')

In [5]:
# 載入 gensim 與 GloVe 模型容器
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# 忽略警告訊息
import warnings
warnings.filterwarnings("ignore")

# Step 2 (Using 6B, 300D model) 
- 觀察變更預訓練詞向量的效果

In [8]:
# 設定模型
input_file_6b300d = '/content/word_vectors/glove.6B.300d.txt'

output_file = 'gensim_glove.6B.300d.txt'
glove2word2vec(input_file_6b300d, output_file)
model = KeyedVectors.load_word2vec_format(output_file, binary=False)

In [9]:
# 顯示最相近的字彙
model.most_similar(['woman'])

[('girl', 0.7296419143676758),
 ('man', 0.6998662948608398),
 ('mother', 0.689943790435791),
 ('she', 0.6433226466178894),
 ('her', 0.6327143311500549),
 ('female', 0.6251603960990906),
 ('herself', 0.6215280294418335),
 ('person', 0.6170896887779236),
 ('women', 0.604761004447937),
 ('wife', 0.5986992120742798)]

In [10]:
# 顯示最相近的字彙(附加反義詞)
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('queen', 0.6713277101516724),
 ('princess', 0.5432624220848083),
 ('throne', 0.5386104583740234),
 ('monarch', 0.5347574949264526),
 ('daughter', 0.498025119304657)]

In [11]:
# 挑選最不相同的字彙
model.wv.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [12]:
# 顯示字彙間的相似性
model.wv.similarity('woman', 'man')

0.6998663

In [13]:
# 顯示字彙的詞向量
model['computer']

array([-2.7628e-01,  1.3999e-01,  9.8519e-02, -6.4019e-01,  3.1988e-02,
        1.0066e-01, -1.8673e-01, -3.7129e-01,  5.9740e-01, -2.0405e+00,
        2.2368e-01, -2.6314e-02,  7.2408e-01, -4.3829e-01,  4.8886e-01,
       -3.5486e-03, -1.0006e-01, -3.0587e-01, -1.5621e-01, -6.8136e-02,
        2.1104e-01,  2.9287e-01, -8.8861e-02, -2.0462e-01, -5.7602e-01,
        3.4526e-01,  4.1390e-01,  1.7917e-01,  2.5143e-01, -2.2678e-01,
       -1.0103e-01,  1.4576e-01,  2.0127e-01,  3.1810e-01, -7.8907e-01,
       -2.2194e-01, -2.4833e-01, -1.5103e-02, -2.0050e-01, -2.6441e-02,
        1.8551e-01,  3.3782e-01, -3.3543e-01,  8.6117e-01, -4.7083e-02,
       -1.7009e-01,  3.0438e-01,  9.4119e-02,  3.2435e-01, -8.1171e-01,
        8.8966e-01, -3.9149e-01,  1.6828e-01,  1.4316e-01,  3.6339e-03,
       -6.4557e-02,  4.5777e-02, -3.2248e-01,  4.8943e-02,  1.6817e-01,
        6.8344e-02,  5.4227e-01,  1.2493e-01,  6.9742e-01, -3.7194e-02,
        3.3080e-01, -4.2194e-01,  3.3970e-01,  2.7646e-01, -1.60

# Step 2
- 觀察變更預訓練詞向量的效果

In [15]:
input_file_42b300d = '/content/word_vectors/glove.42B.300d.txt'

output_file = 'gensim_glove.42B.300d.txt'
glove2word2vec(input_file_42b300d, output_file)
model = KeyedVectors.load_word2vec_format(output_file, binary=False)

In [16]:
# 顯示最相近的字彙
model.most_similar(['woman'])

[('man', 0.8047993183135986),
 ('girl', 0.7628219127655029),
 ('women', 0.7070599794387817),
 ('she', 0.6970385313034058),
 ('lady', 0.687070369720459),
 ('mother', 0.6809671521186829),
 ('wife', 0.6668007969856262),
 ('female', 0.6592124700546265),
 ('her', 0.6492205262184143),
 ('person', 0.6446772217750549)]

In [17]:
# 顯示最相近的字彙(附加反義詞)
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('queen', 0.7833545804023743),
 ('prince', 0.5993281602859497),
 ('princess', 0.5823437571525574),
 ('elizabeth', 0.553723931312561),
 ('daughter', 0.544403076171875)]

In [18]:
# 挑選最不相同的字彙
model.wv.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [19]:
# 顯示字彙間的相似性
model.wv.similarity('woman', 'man')

0.8047992

In [20]:
# 顯示字彙的詞向量
model['computer']

array([-3.1564e-02,  1.2363e-01, -1.7034e-01,  1.7133e-01,  3.6865e-01,
       -7.9461e-01, -3.5579e+00,  1.7796e-01,  7.5441e-01,  5.8339e-01,
       -1.1223e-01, -4.2855e-02,  3.0289e-01,  7.3101e-02, -5.7529e-01,
        5.1862e-01, -6.3629e-02, -4.7917e-01, -2.7654e-01, -1.3803e-01,
       -2.3805e-01, -1.0256e-01,  1.9888e-01, -5.4682e-02, -2.4799e-01,
        9.8878e-02,  4.0410e-01,  2.0282e-01, -1.6771e-01, -5.6312e-01,
       -1.1060e-01, -9.1475e-02, -2.2039e-01, -3.4514e-01, -1.7298e-01,
        2.6311e-01, -2.1555e-01,  4.3417e-01,  1.7068e-01, -3.2459e-01,
       -3.0770e-01, -5.4136e-01, -2.1150e-01, -7.7317e-02,  1.9324e-02,
        2.3751e-01,  1.3064e-01,  3.1118e-01,  8.6216e-04, -1.7932e-01,
       -2.5800e-01,  3.5938e-01,  2.5808e-01,  2.1604e-01,  2.6730e-02,
       -2.1068e-01,  8.7728e-02, -1.6411e-01, -7.4360e-01, -1.6145e-01,
       -9.5305e-02,  3.0688e-01,  2.4271e-01, -3.9576e-01, -3.8444e-02,
        4.4282e-01, -2.4518e-01, -1.5674e-02, -1.2109e-01, -2.76

# Step 2
- 觀察變更預訓練詞向量的效果

In [21]:
input_file_840b300d = '/content/word_vectors/glove.840B.300d.txt'

output_file = 'gensim_glove.840B.300d.txt'
glove2word2vec(input_file_840b300d, output_file)
model = KeyedVectors.load_word2vec_format(output_file, binary=False)

In [22]:
# 顯示最相近的字彙
model.most_similar(['woman'])

[('lady', 0.7816672921180725),
 ('girl', 0.7700794339179993),
 ('man', 0.7401745319366455),
 ('women', 0.7178930044174194),
 ('mother', 0.6826746463775635),
 ('female', 0.6689871549606323),
 ('she', 0.6653776168823242),
 ('wife', 0.6570038795471191),
 ('pregnant', 0.6429666876792908),
 ('herself', 0.6276910901069641)]

In [23]:
# 顯示最相近的字彙(附加反義詞)
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('queen', 0.775162398815155),
 ('prince', 0.6123066544532776),
 ('princess', 0.6016970872879028),
 ('kings', 0.5996100902557373),
 ('queens', 0.565579891204834)]

In [24]:
# 挑選最不相同的字彙
model.wv.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [25]:
# 顯示字彙間的相似性
model.wv.similarity('woman', 'man')

0.7401744

In [26]:
# 顯示字彙的詞向量
model['computer']

array([-0.26657  , -0.13717  ,  0.23549  , -0.26712  ,  0.093324 ,
        0.17563  , -0.33309  , -0.81744  ,  0.52655  ,  1.588    ,
        0.078138 , -0.093094 , -0.27167  , -0.31207  , -0.35018  ,
        0.072898 ,  0.032007 ,  2.5085   , -0.35727  , -0.018104 ,
        0.26102  , -0.3157   , -0.46466  ,  0.029905 ,  0.576    ,
       -0.34603  , -0.1502   , -0.23481  ,  0.20626  ,  0.28202  ,
       -0.6897   , -0.17632  ,  0.40369  ,  0.40672  ,  0.13068  ,
       -0.059988 , -0.3405   ,  0.46314  , -0.21149  ,  0.033152 ,
        0.36526  ,  0.049521 , -0.096128 , -0.087093 , -0.12957  ,
        0.22673  , -0.50435  , -0.50732  ,  0.27695  ,  0.14523  ,
       -0.11455  , -0.13968  ,  0.38549  ,  0.26088  , -0.43217  ,
       -0.45305  ,  0.072362 , -0.53383  ,  0.23317  ,  0.10471  ,
        0.51278  , -0.05899  ,  0.06051  ,  0.22083  ,  0.63098  ,
       -0.03232  , -0.36192  ,  0.45576  ,  0.037796 ,  0.21651  ,
        0.44988  , -0.33074  ,  0.32553  , -0.30153  , -0.2041

# Step 2
- 觀察變更預訓練詞向量的效果

In [29]:
input_file_27b200d = '/content/word_vectors/glove.twitter.27B.200d.txt'

output_file = 'gensim_glove.27B.200d.txt'
glove2word2vec(input_file_27b200d, output_file)
model = KeyedVectors.load_word2vec_format(output_file, binary=False)

In [30]:
# 顯示最相近的字彙
model.most_similar(['woman'])

[('girl', 0.7817050218582153),
 ('women', 0.7705847024917603),
 ('guy', 0.7154314517974854),
 ('she', 0.7104362845420837),
 ('person', 0.7034647464752197),
 ('wife', 0.7029582858085632),
 ('female', 0.7000529766082764),
 ('mother', 0.6994998455047607),
 ('lady', 0.6945761442184448),
 ('who', 0.6705518960952759)]

In [31]:
# 顯示最相近的字彙(附加反義詞)
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('queen', 0.6820898056030273),
 ('prince', 0.5875527262687683),
 ('princess', 0.5620489120483398),
 ('royal', 0.5522865056991577),
 ('mother', 0.5362966656684875)]

In [32]:
# 挑選最不相同的字彙
model.wv.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [33]:
# 顯示字彙間的相似性
model.wv.similarity('woman', 'man')

0.6613607

In [34]:
# 顯示字彙的詞向量
model['computer']

array([ 1.8661e-01,  9.8299e-02,  3.9007e-02, -2.2238e-01, -5.9233e-02,
        1.1812e-01,  4.9759e-01, -2.8316e-01,  2.6596e-01, -4.2649e-01,
       -1.2116e-01,  5.7061e-01, -1.2749e+00, -1.0381e+00, -9.3964e-01,
       -5.2529e-01,  2.8880e-02,  2.8233e-01, -3.9453e-02, -5.2796e-01,
       -3.2264e-01,  9.4211e-02,  7.8902e-02,  4.8493e-01, -5.9542e-02,
        7.0031e-01,  5.1888e-01, -2.4485e-01, -1.0733e+00,  3.0477e-01,
        2.9591e-01, -8.7483e-02,  1.3233e-01, -3.9865e-01,  8.0554e-01,
       -5.8224e-01, -8.7507e-03,  2.7913e-01,  3.3062e-01,  2.3157e-01,
        1.3149e-01,  2.3234e-01, -7.5150e-01,  8.8665e-02, -1.8849e-01,
        5.8511e-01,  5.2439e-01,  3.2801e-01, -9.6899e-01,  6.8633e-01,
        2.9114e-02,  2.5451e-01, -3.2320e-01, -9.0662e-02,  2.5203e-01,
       -5.1843e-01, -8.3203e-02, -1.0830e-01, -5.9551e-02, -1.8399e-01,
        5.2316e-01,  5.7802e-02,  5.5980e-01,  4.6475e-02, -4.0023e-01,
       -4.5896e-01,  1.7354e-01, -2.0696e-01,  2.3390e-01, -4.37