## 讀取中文語料庫檔案並轉換為繁體

In [1]:
from opencc import OpenCC
import json
import re

# Converter: Simplified Chinese -> Traditional Chinese (Taiwan, including
# idiomatic phrase conversion).
cc = OpenCC('s2twp')

# Compiled once so the per-article loop does not repeat the pattern lookup.
whitespace_re = re.compile(r'\s+')
non_cjk_re = re.compile('[^\u4e00-\u9fa5]+')

# Root directory holding the corpus sub-folders (AA, AB, ...).
wiki_root = '/Users/Joanna/Downloads/wiki_zh'

# One cleaned line of text per article, each terminated by '\n'.
jsontext = []

# Folders that each contain 100 files (wiki_00 .. wiki_99).
file = ['AA', 'AB', 'AC', 'AD', 'AE', 'AF', 'AG', 'AH', 'AI', 'AJ', 'AK', 'AL']

# (folder, number of files) pairs; the last folder, AM, holds only 74 files.
# Keeping them in one list lets a single loop replace the two duplicated ones
# while preserving the original processing order.
folders = [(name, 100) for name in file] + [('AM', 74)]

for filename, file_count in folders:
    for i in range(file_count):
        path = '{}/{}/wiki_{}'.format(wiki_root, filename, str(i).zfill(2))
        with open(path, 'r', encoding='utf-8') as f:
            # Each input line is parsed as one JSON object with a 'text' key.
            for line in f:
                temp_jsontext = json.loads(line)
                # Convert to Traditional Chinese, then drop all whitespace.
                x = whitespace_re.sub('', cc.convert(temp_jsontext['text']))
                # Collapse every run of characters outside U+4E00..U+9FA5
                # into a single space.
                jsontext.append(non_cjk_re.sub(' ', x) + '\n')

# Text file to be word-segmented later. It is written explicitly as UTF-8
# because it is read back as UTF-8; the platform default encoding may differ.
with open('json_output.txt', 'w', encoding='utf-8') as f:
    f.writelines(jsontext)

## 分詞

In [4]:
import pandas as pd
import jieba

jieba.set_dictionary('dict.txt.big')  # Use the Traditional Chinese dictionary

# Load the stop-word list, one entry per line with surrounding whitespace
# stripped. The context manager makes sure the file handle is closed.
with open('stopwords.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

# Set copy of the list so the per-token membership test is O(1) rather
# than a linear scan.
stopword_set = set(stopwords)

# Build the training text file: one segmented line per input line, with every
# kept token followed by a single space.
with open('training.txt', 'w', encoding='utf-8') as training_file, \
        open('json_output.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip('\n')
        words = jieba.cut(line, cut_all=False)
        kept = [word + ' ' for word in words if word not in stopword_set]
        training_file.write(''.join(kept) + '\n')

Building prefix dict from /Users/joanna/Desktop/new/dict.txt.big ...
Loading model from cache /var/folders/vf/krhn1yxd4qzc9qr3b221ndq40000gn/T/jieba.u6c244a92398b34d07b481a09629a76d6.cache
Loading model cost 1.084 seconds.
Prefix dict has been built successfully.


## 訓練

In [5]:
from gensim.models import word2vec

# Training hyper-parameters
seed = 666           # random seed passed to Word2Vec
sg = 0               # value forwarded as Word2Vec's `sg` argument
window_size = 10     # forwarded as `window`
vector_size = 100    # forwarded as `vector_size`
min_count = 1        # forwarded as `min_count`
workers = 8          # forwarded as `workers`
epochs = 5           # forwarded as `epochs`
batch_words = 10000  # forwarded as `batch_words`

# NOTE(review): gensim documents that a fixed seed only gives a fully
# reproducible run when training with a single worker thread; with
# workers = 8 results may vary between runs. Confirm whether that matters.

# Corpus reader over the segmented text produced by the previous step.
train_data = word2vec.LineSentence('training.txt')

# Collect the keyword arguments in one place, then train.
w2v_kwargs = {
    'seed': seed,
    'sg': sg,
    'window': window_size,
    'vector_size': vector_size,
    'min_count': min_count,
    'workers': workers,
    'epochs': epochs,
    'batch_words': batch_words,
}
model = word2vec.Word2Vec(train_data, **w2v_kwargs)

# Persist the trained model to disk.
model.save('word2vec.model')



## 模型輸出

In [9]:
import numpy as np
from gensim.models import word2vec

model = word2vec.Word2Vec.load("word2vec.model")  # Load the saved model

# Zero vector with the model's dimensionality; it is not used below here.
padding = np.zeros((model.vector_size,), dtype=np.float32)

# Print the 20 entries most similar to the query word. Only the lookup sits
# inside the try, and only KeyError is caught (what gensim raises for a key
# missing from the vocabulary), so unrelated errors are no longer reported
# as "word not found".
try:
    neighbours = model.wv.most_similar('小提琴', topn=20)
except KeyError:
    print("此詞彙不存在模型詞表")
else:
    for item in neighbours:
        print(item)

('鋼琴', 0.8783912658691406)
('大提琴', 0.860842764377594)
('長笛', 0.835762619972229)
('中提琴', 0.8118615746498108)
('鋼琴演奏', 0.7967939376831055)
('小提琴手', 0.7888822555541992)
('彈奏', 0.7861415147781372)
('單簧管', 0.7829861044883728)
('獨奏', 0.7828001976013184)
('低音提琴', 0.7813053131103516)
('帕格尼尼', 0.7773812413215637)
('鋼琴家', 0.7763457894325256)
('演奏', 0.7750102281570435)
('小號', 0.7736620306968689)
('小提琴家', 0.7733774781227112)
('演奏家', 0.7715063095092773)
('絃樂', 0.7621024250984192)
('長號', 0.7609114050865173)
('提琴', 0.7523770928382874)
('聲樂', 0.7518982291221619)
