In [4]:
from gensim.models import word2vec


# Settings
seed = 5201314
sg = 0
window_size = 25
vector_size = 300
min_count = 2
workers = 60
epochs = 100
batch_words = 400

train_data = word2vec.LineSentence('../training_data/process_content.txt')
model = word2vec.Word2Vec(
    train_data,
    min_count=min_count,
    vector_size=vector_size,
    workers=workers,
    epochs=epochs,
    window=window_size,
    sg=sg,
    seed=seed,
    batch_words=batch_words
)

model.save('word2vec.model')

In [5]:
model = word2vec.Word2Vec.load('word2vec.model')
for item in model.wv.most_similar('鏡頭'):
    print(item)

('冷調', 0.16146089136600494)
('便宜', 0.15765826404094696)
('名字', 0.15757183730602264)
('讚嘆', 0.15454351902008057)
('綠色', 0.1512092649936676)
('bionic', 0.15001274645328522)
('找到', 0.1460101455450058)
('方法', 0.1436985284090042)
('抽獎', 0.14083179831504822)
('神經', 0.14052189886569977)


In [6]:
import pandas as pd
score_list = []
raw_content = pd.read_csv('../training_data/Content_cut.csv')
content_list = eval(raw_content.loc[0,'content_cut'])
for i in content_list:
    try:
        score = model.wv.similarity('電池', i)
        score_list.append([score, i])
    except:
        continue
score_df = pd.DataFrame(data = score_list, columns=['Score', 'Word'])
score_df = score_df.drop_duplicates(subset=['Score', 'Word'], keep=False)
score_df.sort_values(by=['Score'], ascending=False)

Unnamed: 0,Score,Word
6583,0.049641,下訂
2072,0.048014,18日
4415,0.030655,cpu
6870,0.03006,soc
4113,0.027233,ipad
4473,0.013239,2016年
7070,0.008693,wifi6
5464,-0.014296,home
3270,-0.01468,俗
6261,-0.03352,mah


### using TMU model

In [13]:
import gensim
from gensim.models.word2vec import Word2Vec
model = gensim.models.KeyedVectors.load_word2vec_format('../../graduation_project_outfiles/tmunlp_1.6B_WB_300dim_2020v1.bin.gz', 
                                                        unicode_errors='ignore', 
                                                        binary=True)

In [20]:
import pandas as pd
score_list = []
raw_content = pd.read_csv('../training_data/Content_cut.csv')
content_list = eval(raw_content.loc[0,'content_cut'])
for i in content_list:
    try:
        score = model.similarity('電池', i)
        score_list.append([score, i])
    except:
        continue
score_df = pd.DataFrame(data = score_list, columns=['Score', 'Word'])
score_df = score_df.drop_duplicates(subset=['Score', 'Word'], keep=False)
score_df = score_df.sort_values(by=['Score'], ascending=False)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(score_df)

          Score                                               Word
22381  0.483037                                                 蓄電
1513   0.451125                                                 零件
15769  0.449044                                                記憶卡
27053  0.448280                                                 插頭
6131   0.447982                                                 副廠
16883  0.423784                                                 車架
8009   0.420089                                                 充飽
24610  0.412815                                                 天線
7007   0.399819                                                 外殼
16163  0.389789                                                電電池
8829   0.382184                                                 面板
19686  0.382049                                                mah
26148  0.381764                                                 備用
24104  0.377900                                               

### filter by Part of Speech

In [16]:
raw_content = pd.read_csv('../training_data/Content_cut.csv')
raw_content_list = []
for index, row in raw_content.iterrows():
    if(index<1):
        temp_list = row['content_cut']
        raw_content_list.extend(eval(temp_list))
        
raw_pos_list = []
for index, row in raw_content.iterrows():
    if(index<1):
        temp_pos_list = row['part of speech']
        raw_pos_list.extend(eval(temp_pos_list))
        
content_pos_list = pd.DataFrame(data = raw_content_list, columns=['Content'])
pos_list = pd.DataFrame(data = raw_pos_list, columns=['PoS'])
content_pos_list['PoS'] = pos_list['PoS']
content_pos_list

Unnamed: 0,Content,PoS
0,三月,Nd
1,中,Ng
2,買,VC
3,的,DE
4,SE3,FW
...,...,...
30284,HK,FW
30285,$,FW
30286,3699,Neu
30287,,WHITESPACE


In [22]:
import pandas as pd
score_list = []

for i in range(len(content_pos_list)):
    try:
        if content_pos_list['PoS'][i] in ['A']:
            score = model.similarity('電池', content_pos_list['Content'][i]) 
            score_list.append([score, content_pos_list['Content'][i]])
    except:
        continue
score_df_by_PoS = pd.DataFrame(data = score_list, columns=['Score', 'Word'])
score_df_by_PoS = score_df_by_PoS.drop_duplicates(subset=['Score', 'Word'], keep=False)
score_df_by_PoS = score_df_by_PoS.sort_values(by=['Score'], ascending=False)
score_df_by_PoS
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(score_df_by_PoS)

        Score Word
110  0.211798  高性能
34   0.190513   軍用
148  0.177425   現有
165  0.175725   高度
174  0.158923   雙頻
23   0.137169   新款
10   0.112052   多核
173  0.111237   彩色
176  0.099359   商用
133  0.090624   單層
126  0.073644   多工
14   0.071817   同級
40   0.061537   原本
2    0.059966   初步
144  0.053641   護眼
153  0.048136   二手
63   0.036843   微距
46   0.027470   既有
119  0.025214   非人
52   0.022232   中階
20   0.016112   人為
47   0.010208   新興
49   0.007524   重度
21   0.001362   特約
114 -0.005455   細部
107 -0.027878    主
115 -0.032614  一連串
6   -0.038119   中等
168 -0.047184   原來
89  -0.106170   純粹
