In [3]:
import numpy as np
import pandas as pd
import jieba
import jieba.posseg as pseg
import re
import matplotlib.pyplot as plt
from pylab import rcParams
from gensim.models import word2vec
from gensim.models import Word2Vec

%matplotlib inline

In [2]:
# Plot defaults: figure size 10 by 5 and line width 2, then the ggplot style sheet
rcParams.update({
    'figure.figsize': (10, 5),
    'lines.linewidth': 2,
})
plt.style.use('ggplot')

# 給一個關鍵字 tag，並從中找到相像關鍵字

In [118]:
# Load the data; both files are read with the same two columns.
# Main posts
post_data = pd.read_csv(
    "data/big_data/楊俊瀚_post_0.csv",
    usecols=["POST_TITLE", "POST_CONTENT"],
    encoding="utf8",
)
# Replies / comments
comment_data = pd.read_csv(
    "data/big_data/楊俊瀚_comment_0.csv",
    usecols=["POST_TITLE", "POST_CONTENT"],
    encoding="utf8",
)

In [119]:
# (rows, columns) of the comment frame, then of the post frame, one per line
print(comment_data.shape, post_data.shape, sep="\n")

(101000, 2)
(101, 2)


In [120]:
# Ten longest posts, ranked by the character count of POST_CONTENT
(
    post_data["POST_CONTENT"]
    .str.len()
    .sort_values(ascending=False)
    .head(10)
)

100    8137
46     1234
49     1184
99     1118
96     1113
50     1070
47     1067
27     1041
48     1032
81     1023
Name: POST_CONTENT, dtype: int64

In [129]:
# Ten longest comments, ranked by the character count of POST_CONTENT
(
    comment_data["POST_CONTENT"]
    .str.len()
    .sort_values(ascending=False)
    .head(10)
)

47002    164
46001    164
2010     164
68111    164
43041    164
41130    164
1043     164
348      164
7677     160
23632    160
Name: POST_CONTENT, dtype: int64

In [121]:
def filter_outlier_by_article(df, std_num):
    """Drop rows whose ``POST_CONTENT`` is unusually long.

    Rows containing any missing value are removed first. A remaining row is
    kept only when its content length is strictly below
    ``mean + std_num * std`` of the remaining content lengths.

    Args:
        df: DataFrame with a ``POST_CONTENT`` column of strings.
        std_num: How many standard deviations above the mean length are
            still accepted.

    Returns:
        The filtered DataFrame. When the lengths have no positive spread
        (no rows, a single row, or every content equally long) nothing is
        treated as an outlier and all complete rows are returned.
    """
    df = df.dropna()
    if df.empty:
        return df
    # Measure the lengths once and reuse them for the statistics and the mask.
    lengths = df["POST_CONTENT"].str.len()
    std = lengths.std()
    # std is NaN for a single row and 0 when all lengths are equal; in both
    # cases the strict "<" comparison below would discard every row.
    if not std > 0:
        return df
    upper = lengths.mean() + std_num * std
    return df.loc[lengths < upper, :]

In [122]:
# Remove the outlier records (threshold: one standard deviation) from both frames
post_data, comment_data = (
    filter_outlier_by_article(frame, 1) for frame in (post_data, comment_data)
)
print(
    "post data length: {}, comment data length: {}".format(
        *map(len, (post_data, comment_data))
    )
)

post data length: 100, comment data length: 100279


In [124]:
# Keep only the comments whose POST_TITLE contains the string 楊俊瀚
comment_data = comment_data[comment_data["POST_TITLE"].str.contains("楊俊瀚")]

In [132]:
# Stack the posts and the filtered comments into one frame; row labels are kept as-is.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
raw_data = pd.concat([post_data, comment_data])

In [134]:
# Pre-processing: discard incomplete rows, then pull the text column out as a plain list
raw_data = raw_data.dropna()
content_list = raw_data["POST_CONTENT"].tolist()

In [136]:
# Number of documents that will be pre-processed
len(content_list)

26014

In [216]:
# Peek at the first five documents
content_list[:5]

['可惜了',
 '「銀」得可惜！我田徑好手楊俊瀚在雅加達亞運男子兩百公尺拿下銀牌，雖然成績與金牌得主日本選手小池祐貴同為廿秒二三，但經電腦判定落敗，楊俊瀚自認做到九十九分，「如果再好一點點，或許我就是以千分之一贏的那一個。」楊俊瀚激動落淚哭完說：上天給的結果楊俊瀚起跑出彎道後一度領先，但隔壁跑道的小池祐貴追上並列，雙方纏鬥到終點線，最終透過影像判定，算到小數點後三位才落敗，楊俊瀚激動落淚，不過哭完就恢復心情，他說：「如果有一點差距，代表我們不夠努力，輸千分位，就是上天給的結果。」等了64年我第一面男子200公尺獎牌楊俊瀚的教練陶武訓在一九九四年廣島亞運曾跑出第四名佳績，這次楊完成賽前向教練說「目標至少要打破你」的宣言，拿下中華隊自參加亞運以來，等了六十四年的第一面男子兩百公尺獎牌。但楊俊瀚卻心情複雜，「我辦到了，但只是九十九分，為什麼不做到一百分？」「如果再好一點點，或許我就是贏的那個」差了哪一分？楊俊瀚說：「如果心理調適再好一點、休息再好一點，什麼都能做好一點點，說不定我是贏他千分位的那一個。」楊俊瀚透露比賽前一天失眠，整晚幾乎沒睡，一路焦慮到賽前熱身。教練告訴楊俊瀚：「再怎麼準備也達不到百分百，不如把握好現在的狀態。」他雖然聽見了，卻進不到心裡，直到想起去年世大運也是狀況不好，最後在一百公尺破全國紀錄，楊俊瀚說：「當下心情就舒坦了，應該要相信教練。」加上楊俊瀚這面銀牌，中華代表團昨天總計進帳三銀一銅，黃亭茵在自由車個人全能賽摘銀，鄭竹玲在軟網女單不敵日本女將高橋乃綾拿下銀牌，楊勇緯則在柔道男子六十公斤級從敗部復活奪銅；中華拳擊女將林郁婷則逆轉打進四強，至少銅牌起跳。',
 '我國100公尺全國紀錄保持人楊俊瀚，26日晚間9點25出賽100公尺決賽，跑出和準決賽同樣的10秒17最終排名第五無緣獎牌，金牌則是跑出9秒92破大會的中國蘇炳添。100公尺預賽階段，楊俊瀚跑出10秒13列在分組第一，同時也是預賽第一、順利晉級下一輪，來到準決賽，楊俊瀚雖然開跑時稍微落後，不過後段加速，跑出10秒17分組第二、準決賽第三順利闖進決賽。決賽階段，楊俊瀚再度跑出10秒17，蘇炳添以9秒92摘金，卡達TosinOgunode跑出10秒00排第二，日本的山縣亮太則以同樣秒數排第三楊俊瀚去年在全大運、台北世大運先後刷新我國100公尺紀錄，接著今年的日本大學公開賽，先在資格賽中飆出10

In [217]:
# Trim each document, then remove the ASCII spaces left inside it
content_list = [
    text.strip().replace(' ', '')
    for text in content_list
]

In [218]:
# Given the part-of-speech flags to filter out, remove the matching items
def filter_part_speech(pos_list, part_speech_list):
    """Return the items of ``part_speech_list`` whose ``flag`` is not in ``pos_list``.

    Args:
        pos_list: Container of flag values to exclude.
        part_speech_list: Iterable of objects exposing a ``flag`` attribute.

    Returns:
        list: The surviving items, in their original order.
    """
    return [item for item in part_speech_list if item.flag not in pos_list]

In [219]:
def filter_not_chinese_word(document):
    """Keep only the characters in the range U+4E00-U+9FA5 of ``document``.

    Every run of characters in that range is extracted and the runs are
    joined together with nothing in between; all other characters (digits,
    Latin letters, punctuation, whitespace, ...) are dropped.

    Args:
        document: The text to clean.

    Returns:
        str: The retained characters. A non-string input (for example
        ``None`` or a float NaN) yields an empty string, so callers always
        get a string back. The function no longer relies on a global
        ``index`` variable to report such inputs.
    """
    if not isinstance(document, str):
        return ""
    return "".join(re.findall(r"[\u4e00-\u9fa5]+", document))

In [220]:
def tokenize_document(doc, stop_word_list):
    """Segment ``doc`` with ``jieba.cut`` and discard the stop words.

    Args:
        doc: Text to segment.
        stop_word_list: Container of tokens that must be removed.

    Returns:
        list: The remaining tokens, in the order ``jieba.cut`` produced them.
    """
    return [token for token in jieba.cut(doc) if token not in stop_word_list]

In [221]:
# Tokenising with part-of-speech tagging is very slow
def tokenize_document_by_pos(doc, stop_word_list, filter_pos):
    """Segment ``doc`` with ``pseg.cut`` and filter by stop word and flag.

    A tagged token is kept only when its ``word`` is not in
    ``stop_word_list`` and its ``flag`` is not in ``filter_pos``.

    Args:
        doc: Text to segment.
        stop_word_list: Container of words that must be removed.
        filter_pos: Container of flags to exclude, e.g.
            ['n', 'x', 'n', 'ng', 'nr', 'ns'].
            NOTE(review): these flags are removed, not kept. The example
            list looks like mostly noun tags, so confirm that excluding
            them is the intent.

    Returns:
        list: The ``word`` of every surviving token, in order.
    """
    return [
        tagged.word
        for tagged in pseg.cut(doc)
        if tagged.word not in stop_word_list and tagged.flag not in filter_pos
    ]

In [225]:
# Collects the token list of every document, in content_list order
preprocessed_documents = []
# Stop words, one per line. The encoding is spelled out (matching the
# read_csv calls) so the read does not depend on the platform's default
# text encoding.
with open("data/jieba_dict/stopwords.txt", encoding="utf8") as stop_words:
    stop_word_list = [stop_word.strip() for stop_word in stop_words]
# Main dictionary with better Traditional Chinese support
jieba.set_dictionary("data/jieba_dict/dict.txt.big")
# Additional user dictionaries
jieba.load_userdict("data/jieba_dict/中央機構.dict")
jieba.load_userdict("data/jieba_dict/名人錄.dict")
jieba.load_userdict("data/jieba_dict/專有名詞.dict")
jieba.load_userdict("data/jieba_dict/縣市區鄉鎮.dict")

for index, document in enumerate(content_list, 0):
    # Progress marker every 2000 documents
    if index % 2000 == 0:
        print("current document index:{}".format(index))
    # Keep Chinese characters only
    document = filter_not_chinese_word(document)
    # Alternative (much slower) part-of-speech based tokenisation:
    # preprocessed_document = tokenize_document_by_pos(document, stop_word_list, ['n', 'x', 'n', 'ng', 'nr', 'ns'])
    preprocessed_document = tokenize_document(document, stop_word_list)
    preprocessed_documents.append(preprocessed_document)

Building prefix dict from /Users/Mark1002/Desktop/project/python/nlp-experiment/data/jieba_dict/dict.txt.big ...
Loading model from cache /var/folders/dw/m2zgs87j3x19nl8mnfy3fs8c0000gn/T/jieba.ud2b054c4d13e51557150f7d36ba5f4d0.cache
Loading model cost 1.448 seconds.
Prefix dict has been built succesfully.


current document index:0
current document index:2000
current document index:4000
current document index:6000
current document index:8000
current document index:10000
current document index:12000
current document index:14000
current document index:16000
current document index:18000
current document index:20000
current document index:22000
current document index:24000
current document index:26000


In [257]:
# Inspect the tokens of the fifth document from the end
preprocessed_documents[-5]

['楊',
 '肩膀',
 '先過',
 '規則',
 '運動員',
 '抵達',
 '終點',
 '名次',
 '時應',
 '軀幹',
 '包括',
 '頭頸',
 '臂手',
 '腿',
 '腳',
 '部分',
 '到達',
 '終點']

In [245]:
# Build a Word2Vec model from the tokenised documents.
# NOTE(review): the `size` keyword presumably targets gensim < 4 (later
# releases call it `vector_size`) — confirm the installed version.
model = Word2Vec(
    preprocessed_documents, 
    size=250,
    min_count=3, 
    window=10,
)
# NOTE(review): per the gensim docs, passing a corpus to the constructor
# already trains the model, so this call presumably continues training for
# 10 more epochs — confirm that the extra pass is intended.
model.train(preprocessed_documents, total_examples=len(preprocessed_documents), epochs=10)

(872282, 1030490)

In [280]:
# Ten entries most similar to "最佳" in the model trained in this notebook
model.wv.most_similar("最佳", topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('紀錄', 0.8223675489425659),
 ('生涯', 0.7949206829071045),
 ('一口氣', 0.7843143939971924),
 ('繳出', 0.7825567722320557),
 ('卡達', 0.771132230758667),
 ('突破', 0.7706197500228882),
 ('超扯', 0.7374297380447388),
 ('全國紀錄', 0.7365154027938843),
 ('曼谷', 0.7339699268341064),
 ('打破', 0.7331404089927673)]

In [281]:
# Ten entries most similar to "夠強" in the model trained in this notebook
model.wv.most_similar("夠強", topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('要拿牌', 0.8486579656600952),
 ('第一名', 0.8454180955886841),
 ('破紀錄', 0.8288664817810059),
 ('逮丸隊', 0.8156543970108032),
 ('栽培', 0.8140100240707397),
 ('改善', 0.8105578422546387),
 ('節奏', 0.7979242205619812),
 ('抓好', 0.7961492538452148),
 ('共識', 0.7950407266616821),
 ('那間', 0.7841838598251343)]

# 載入預訓練好的模型

In [4]:
# Load a pre-trained model
# NOTE(review): Word2Vec.load presumably unpickles the file — only load
# model files that come from a trusted source.
wv_from_bin = Word2Vec.load("data/Word2Vec_v1.4/w2v.model.bin")

In [7]:
# Entries most similar to "喜歡" according to the pre-trained model (default result count)
expansion_list = wv_from_bin.wv.most_similar("喜歡")

  if np.issubdtype(vec.dtype, np.int):


In [8]:
# Keep only the words, dropping the score paired with each one
[word for word, _score in expansion_list]

['討厭', '我喜歡', '不喜歡', '很愛', '喜歡的', '有魅力', '不愛', '鍾愛', '愛看', '喜歡你']

## 資料集規模就是一切