In [262]:
import numpy as np
import pandas as pd
import jieba
import jieba.posseg as pseg
import re
import matplotlib.pyplot as plt
from pylab import rcParams
from gensim.models import word2vec

%matplotlib inline

In [252]:
# Customizing plots with style 
rcParams['figure.figsize'] = 10, 5
rcParams['lines.linewidth'] = 2
plt.style.use('ggplot')

In [406]:
# 資料載入
user_list = ["電腦王阿達", "笑波子", "Fixiphone-阿倫的愛瘋修"]
doc_types = ["post", "comment"]

comment_file_path = "data/big_data/pos/" + user_list[2] + "_" + doc_types[1] + "_1" + ".csv"
post_file_path = "data/big_data/pos/" + user_list[2] + "_" + doc_types[0] + "_1" + ".csv"

comment_data = pd.read_csv(comment_file_path, encoding="utf8")
post_data = pd.read_csv(post_file_path, encoding="utf8")

In [408]:
# 找到特定的作者，文章 title
comment_data = comment_data.loc[(comment_data["POST_AUTHOR"]=="Fixiphone-阿倫的愛瘋修") & (comment_data["POST_TITLE"].str.contains("iPhone")),:]

In [429]:
comment_data.shape

(29, 11)

In [441]:
post_data.loc[:,["POST_TITLE"]]

Unnamed: 0,POST_TITLE
0,今天天氣不好...我
1,好，512不存在 結
2,iPhone 8 P
3,iphone8 /
4,本人因為上午做重訓.
5,今日直播送獎品(不能
6,今天我累了一天，餓了
7,【我是胖胖
8,上禮拜的<<台灣好鯛
9,<<回憶的保存>>


In [379]:
post_data["POST_TITLE"].str.contains("iPhone").head()

0    False
1     True
2    False
3    False
4    False
Name: POST_TITLE, dtype: bool

In [383]:
(post_data["POST_AUTHOR"]=="電腦王阿達") & post_data["POST_TITLE"].str.contains('iphone')

0        False
1        False
2        False
3        False
4        False
5        False
6        False
7        False
8        False
9        False
10       False
11       False
12       False
13       False
14       False
15       False
16       False
17       False
18       False
19       False
20       False
21       False
22       False
23       False
24       False
25       False
26       False
27       False
28       False
29       False
         ...  
31867    False
31868    False
31869    False
31870    False
31871    False
31872    False
31873    False
31874    False
31875    False
31876    False
31877    False
31878    False
31879    False
31880    False
31881    False
31882    False
31883    False
31884    False
31885    False
31886    False
31887    False
31888    False
31889    False
31890    False
31891    False
31892    False
31893    False
31894    False
31895    False
31896    False
Length: 31897, dtype: bool

In [381]:
post_data["POST_TITLE"].str.contains('iphone').head()

0    False
1    False
2    False
3    False
4    False
Name: POST_TITLE, dtype: bool

In [272]:
# 主文長度排序
post_data["POST_CONTENT"].str.len().sort_values(ascending=False).head(5)

49    2574
48    2323
28    2208
19    1847
46     120
Name: POST_CONTENT, dtype: int64

In [273]:
# 資料前處理
post_data = post_data.dropna()

post_content_list = post_data["POST_CONTENT"].values.tolist()

In [331]:
# 指定詞性，並過濾掉其他詞
def filter_part_speech(pos_list, part_speech_list):
    return list(filter(lambda x: x.flag in pos_list, part_speech_list))

In [339]:
# 用來存放分詞後的結果
preprocessed_documents = []
# stopword
with open("data/jieba_dict/stopwords.txt") as stop_words:
    stop_word_list = [stop_word.strip() for stop_word in stop_words]
# 支援繁體中文較好的詞庫
jieba.set_dictionary("data/jieba_dict/dict.txt.big")
jieba.set_dictionary
for document in post_content_list:
    # 只取中文
    document = "".join(re.findall(r"[\u4e00-\u9fa5]+", document))
    part_speech_list = list(pseg.cut(document))
    # 去除保留字
    part_speech_list = list(filter(lambda x: x.word not in stop_word_list, part_speech_list))
    # 篩選字詞
    part_speech_list = filter_part_speech(['a', 'v'], part_speech_list)
    preprocessed_documents.append([part_speech.word for part_speech in part_speech_list])

Building prefix dict from /Users/Mark1002/Desktop/project/python/nlp-experiment/data/jieba_dict/dict.txt.big ...
Loading model from cache /var/folders/dw/m2zgs87j3x19nl8mnfy3fs8c0000gn/T/jieba.ud2b054c4d13e51557150f7d36ba5f4d0.cache
Loading model cost 1.518 seconds.
Prefix dict has been built succesfully.


In [341]:
model = word2vec.Word2Vec(preprocessed_documents, min_count=1, window=10, sg=1)

In [347]:
model.most_similar("輸")

  if __name__ == '__main__':
  if np.issubdtype(vec.dtype, np.int):


[('作用', 0.39627617597579956),
 ('排除', 0.3559700846672058),
 ('分散', 0.3325254023075104),
 ('無', 0.3190414011478424),
 ('搞', 0.30995699763298035),
 ('譽為', 0.3071236312389374),
 ('來自', 0.30221399664878845),
 ('迷', 0.29458457231521606),
 ('分成', 0.2933948040008545),
 ('最好', 0.2827370762825012)]

In [204]:
post_data.isnull().sum()

URL             0
POST_BOARD      0
POST_AUTHOR     0
POST_TITLE      0
PUSH_AUTHOR     0
PUSH_STATE      0
POST_CONTENT    0
POST_DATE       0
POST_TIME       0
POST_IP         0
PUSH_NO         0
dtype: int64