In [150]:
!python --version

Python 3.7.11


In [151]:
!pip install -q -U pip
!pip install -q numpy
!pip install -q pandas
!pip install -q ckiptagger
!pip install -q tqdm
!pip install -q tensorflow==1.14.0
!pip install -q ipywidgets



In [152]:
import pandas as pd
import numpy as np

from ckiptagger import WS, POS
from tqdm.notebook import tqdm

In [153]:
df_train = pd.read_csv('news_clustering_train.tsv', sep='\t')
df_test = pd.read_csv('news_clustering_test.tsv', sep='\t')

In [154]:
df_train

Unnamed: 0,index,class,title
0,0,體育,亞洲杯奪冠賠率：日本、伊朗領銜 中國竟與泰國並列
1,1,體育,9輪4球本土射手僅次武磊 黃紫昌要搶最強U23頭銜
2,2,體育,如果今年勇士奪冠，下賽季詹姆斯何去何從？
3,3,體育,超級替補！科斯塔本賽季替補出場貢獻7次助攻
4,4,體育,騎士6天里發生了啥？從首輪搶七到次輪3-0猛龍
...,...,...,...
1795,1795,遊戲,LOL：麻辣香鍋韓服Rank不合成打野刀？電刀巨魔新套路連勝中
1796,1796,遊戲,穩住，我們能贏！因為我們擁有這種強大的力量
1797,1797,遊戲,騰訊是怎樣毀掉《鬥戰神》這款可能成神的作品的？
1798,1798,遊戲,LOL你不知道的黑科技打法！


In [155]:
train_titles = {row['index']: row['title'] for _, row in df_train.iterrows()}
train_classes = {row['index']: row['class'] for _, row in df_train.iterrows()}

test_titles = {row['index']: row['title'] for _, row in df_test.iterrows()}
test_classes = {row['index']: row['class'] for _, row in df_test.iterrows()}

In [156]:
all_news_class = ['體育', '財經', '科技', '旅遊', '農業', '遊戲']

# 斷詞 + POS

In [157]:
from ckiptagger import data_utils
data_utils.download_data_gdown("./")

Downloading...
From: https://drive.google.com/uc?id=1efHsY16pxK0lBD2gYCgCTnv1Swstq771
To: /content/data.zip
1.88GB [00:14, 128MB/s]


In [158]:
ws = WS('./data/')
pos = POS('./data/')



In [159]:
train_title_cuts = {}
for index, title in tqdm(train_titles.items()):
    # YOUR CODE HERE
    word_s = ws([title], sentence_segmentation=True)
    word_p = pos(word_s)  # tag part of speech from the word_s variable
    # END YOUR CODE
    train_title_cuts[index] = list(zip(word_s[0], word_p[0]))

  0%|          | 0/1800 [00:00<?, ?it/s]

In [160]:
test_title_cuts = {}
for index, title in tqdm(test_titles.items()):
    # YOUR CODE HERE
    word_s = ws([title]) # pass list to ws to parse the entire title
    word_p = pos(word_s)
    # END YOUR CODE
    test_title_cuts[index] = list(zip(word_s[0], word_p[0]))

  0%|          | 0/600 [00:00<?, ?it/s]

In [161]:
train_title_cuts[120]

[('國腳', 'Na'),
 ('張呈棟', 'Nb'),
 ('：', 'COLONCATEGORY'),
 ('從', 'D'),
 ('沒', 'D'),
 ('想', 'VE'),
 ('過', 'Di'),
 ('自己', 'Nh'),
 ('會', 'D'),
 ('出', 'VC'),
 ('一', 'Neu'),
 ('本', 'Nf'),
 ('書', 'Na')]

In [162]:
train_title_cuts[120][1][1]

'Nb'

# Bag of Words (BOW)

In [189]:
word2index = {}
index2word = {}
# 產生字與index對應的關係
# YOUR CODE HERE
n = 0
for title in train_title_cuts:
  for word, pos in train_title_cuts[title]:
    if word in word2index:
      continue #avoid repeating words
    word2index[word] = n
    index2word[n] = word
    n += 1
# END YOUR CODE

In [190]:
word2index['溫暖']

1512

In [165]:
index2word[1520]

'許'

In [192]:
def get_bow_vector(pairs, word2index):
    # YOUR CODE HERE
    vector = np.zeros(len(word2index))
    for word, flag in pairs: # +1 at the dictionary entry where the word is present 
    # (and that calculating cosine similarity has to be arrays of the same length)
      if word in word2index:
        vector[word2index[word]] += 1
    # END YOUR CODE
    return vector

In [184]:
test = get_bow_vector(train_title_cuts[120], word2index)

In [185]:
test, len(test)

(array([0., 0., 0., ..., 0., 0., 0.]), 6662)

# 排除較無意義的詞性

In [186]:
pos_analysis = {}
for _, pairs in train_title_cuts.items():
    for word, flag in pairs:
        if flag not in pos_analysis:
            pos_analysis[flag] = set()
        pos_analysis[flag].add(word)

for flag, words in pos_analysis.items():
    print(flag, ':', list(words)[:100])
    print('=======================')

Nb : ['瓦基弗', '聯盟里', '克洛普', '黃紫昌', '萊昂納德', '騎士隊', '龔方雄', '·瞻', '奧恰洛夫', '詹姆斯', '網易', '碧生源', '奧運', '里皮', '劉集', '李盈瑩', '科勒·卡戴珊', '梅西', '傑森', '宇通', '寶二', '呂布', '萊萬', '楊智復', '朱', '纏中', '郭躍', '唐培科', '泰拳王西', '哪吒', '劉國梁', '奧拉朱旺', '康德', '馬里奧', '武當', '東決', '弗格森', '夢琪', '安徒恩比盧克', '姚振華', '西游', '德羅贊', '約翰塞納', '宋清輝', '猛', '卡帥', '微醫', '德甲', '阿里舜宇', '梅西納', '劉歡', 'Dota2', '德比', '萊萬特', '殺里', '沙特', '巴薩', '隆多', '丘誠', '胤祥', '亨德森', '何雯娜', '老詹', '多浪', '楊柳夏', '湯米', '切爾西', '武磊', '羅森', '皇馬', '許昕', '密子君', '小白', '何享健', '雅桑克萊', '微眾', '馬克', '韓信', '李大霄', '群里', '聯發科', '桑德羅', '朱嘯虎', '姚明', 'S11', '高通', '刷安徒恩', '力哥', '魅族', '哈登', '瑞和寶', '維亞利', '易', '伯克希爾', '李秋平', '李嘉誠', '石川佳純', '泰山', '藍星', '小詹皇']
Na : ['影響', '宿敵', '流氓', '風格', '增長點', '級頭', '帝王蟹', '柑橘', '馬桶', '夢淚', 'SWITCH版', '會計', '大V', '下場', '龍果', '龍', '小麥', '觀眾', '勝地', '島嶼', '賽點', '里皮', '環境', '季後賽', '山', '素能', '操盤手', '風險', '紅糖', '滿分', '牛皮菜', '青玉薩', '敗筆', '星陣', '優勢', '兒子', '大集', '殤', '戰神', '大胃王', '絕殺球', '蜂農', '所在', '子彈', '利潤率', '開心果', '手', '女乒', '消息', '自行車', '遭遇', '痢疾', '

|         Type        |     Description    |
|:-------------------:|:------------------:|
| A                   | 非謂形容詞         |
| Caa                 | 對等連接詞         |
| Cab                 | 連接詞，如：等等   |
| Cba                 | 連接詞，如：的話   |
| Cbb                 | 關聯連接詞         |
| D                   | 副詞               |
| Da                  | 數量副詞           |
| Dfa                 | 動詞前程度副詞     |
| Dfb                 | 動詞後程度副詞     |
| Di                  | 時態標記           |
| Dk                  | 句副詞             |
| DM                  | 定量式             |
| I                   | 感嘆詞             |
| Na                  | 普通名詞           |
| Nb                  | 專有名詞           |
| Nc                  | 地方詞             |
| Ncd                 | 位置詞             |
| Nd                  | 時間詞             |
| Nep                 | 指代定詞           |
| Neqa                | 數量定詞           |
| Neqb                | 後置數量定詞       |
| Nes                 | 特指定詞           |
| Neu                 | 數詞定詞           |
| Nf                  | 量詞               |
| Ng                  | 後置詞             |
| Nh                  | 代名詞             |
| Nv                  | 名物化動詞         |
| P                   | 介詞               |
| T                   | 語助詞             |
| VA                  | 動作不及物動詞     |
| VAC                 | 動作使動動詞       |
| VB                  | 動作類及物動詞     |
| VC                  | 動作及物動詞       |
| VCL                 | 動作接地方賓語動詞 |
| VD                  | 雙賓動詞           |
| VF                  | 動作謂賓動詞       |
| VE                  | 動作句賓動詞       |
| VG                  | 分類動詞           |
| VH                  | 狀態不及物動詞     |
| VHC                 | 狀態使動動詞       |
| VI                  | 狀態類及物動詞     |
| VJ                  | 狀態及物動詞       |
| VK                  | 狀態句賓動詞       |
| VL                  | 狀態謂賓動詞       |
| V_2                 | 有                 |
|                     |                    |
| DE                  | 的之得地           |
| SHI                 | 是                 |
| FW                  | 外文               |
|                     |                    |
| COLONCATEGORY       | 冒號               |
| COMMACATEGORY       | 逗號               |
| DASHCATEGORY        | 破折號             |
| DOTCATEGORY         | 點號               |
| ETCCATEGORY         | 刪節號             |
| EXCLAMATIONCATEGORY | 驚嘆號             |
| PARENTHESISCATEGORY | 括號               |
| PAUSECATEGORY       | 頓號               |
| PERIODCATEGORY      | 句號               |
| QUESTIONCATEGORY    | 問號               |
| SEMICOLONCATEGORY   | 分號               |
| SPCHANGECATEGORY    | 雙直線             |
| WHITESPACE          | 空白               |

In [169]:
def get_bow_vector_with_selection(pairs, word2index):
    excluded_flags = [
        # 根據以上列舉出來的文字以及詞性表，請列出想要排除的詞性
        # YOUR CODE HERE
        "COLONCATEGORY", "COMMACATEGORY", "DASHCATEGORY", "DOTCATEGORY", "ETCCATEGORY", "EXCLAMATIONCATEGORY", "PARENTHESISCATEGORY",
        "PAUSECATEGORY", "QUESTIONCATEGORY", "SEMICOLONCATEGORY", "SPCHANGECATEGORY", "WHITESPACE",
        "Caa", "Cab", "Cba", "Cbb",
        "Di", "Dk", "DM", 
        "I", "DE", "SHI", "P", "T",
        "Nep", "Ng", "Nh", "V_2"
        # END YOUR CODE
    ]
    vector = np.zeros(len(word2index))
    for word, flag in pairs:
        if word in word2index and flag not in excluded_flags:
            vector[word2index[word]] += 1
    return vector

# Cosine Similarity

In [193]:
import sklearn.metrics.pairwise
def cosine_similarity(bow1, bow2):
    # YOUR CODE HERE
    len_bow1 = (bow1 **2).sum() **(1/2)
    len_bow2 = (bow2 **2).sum() **(1/2)
    similarity = np.sum(bow1*bow2) / (len_bow1*len_bow2)
    # END YOUR CODE
    return similarity

In [194]:
bow1 = get_bow_vector(train_title_cuts[100], word2index)
bow2 = get_bow_vector(train_title_cuts[130], word2index)
cosine_similarity(bow1, bow2)

0.08703882797784893

In [173]:
train_title_cuts[100]

[('山東', 'Nc'),
 ('魯能', 'Nb'),
 ('有沒有', 'D'),
 ('可能', 'D'),
 ('拿到', 'VC'),
 ('今年', 'Nd'),
 ('的', 'DE'),
 ('中', 'A'),
 ('超', 'A'),
 ('冠軍', 'Na'),
 ('？', 'QUESTIONCATEGORY')]

In [174]:
train_title_cuts[130]

[('NBA', 'Nb'),
 ('和', 'Caa'),
 ('CBA', 'FW'),
 ('差距', 'Na'),
 ('在', 'P'),
 ('哪裡', 'Ncd'),
 ('？', 'QUESTIONCATEGORY'),
 ('6', 'Neu'),
 ('張', 'Nf'),
 ('圖', 'VF'),
 ('一目瞭然', 'VH'),
 ('！', 'EXCLAMATIONCATEGORY')]

# Group mean vector

In [175]:
group_vectors = {news_class: [] for news_class in all_news_class}
for index, pairs in sorted(train_title_cuts.items()):
    vector = get_bow_vector_with_selection(pairs, word2index)
    news_class = train_classes[index]
    group_vectors[news_class].append(vector)

group_mean_vector = {}
for news_class, vectors in group_vectors.items():
    group_mean_vector[news_class] = np.mean(vectors, axis=0)
group_mean_vector

{'旅遊': array([0., 0., 0., ..., 0., 0., 0.]),
 '科技': array([0., 0., 0., ..., 0., 0., 0.]),
 '財經': array([0., 0., 0., ..., 0., 0., 0.]),
 '農業': array([0., 0., 0., ..., 0., 0., 0.]),
 '遊戲': array([0.        , 0.        , 0.        , ..., 0.00333333, 0.00333333,
        0.00333333]),
 '體育': array([0.04      , 0.00333333, 0.        , ..., 0.        , 0.        ,
        0.        ])}

# Group mean vector: 測試

In [176]:
classification = {news_class: [] for news_class in all_news_class}
for index, pairs in sorted(test_title_cuts.items()):
    vector = get_bow_vector_with_selection(pairs, word2index)
    if np.sum(np.square(vector)) == 0:
        continue

    max_val = -2.0
    max_class = None
    for news_class, ref_vector in group_mean_vector.items():
        val = cosine_similarity(ref_vector, vector)
        if val > max_val:
            max_class = news_class
            max_val = val

    classification[max_class].append(index)

In [177]:
from collections import Counter

for group, ids in classification.items():
    counter = Counter([test_classes[id] for id in ids])
    print('predict', group, ': ', counter)

predict 體育 :  Counter({'體育': 72, '遊戲': 10, '財經': 6, '農業': 6, '科技': 4, '旅遊': 4})
predict 財經 :  Counter({'財經': 68, '科技': 20, '農業': 8, '遊戲': 6, '旅遊': 5, '體育': 4})
predict 科技 :  Counter({'科技': 59, '財經': 14, '體育': 12, '農業': 6, '遊戲': 5, '旅遊': 2})
predict 旅遊 :  Counter({'旅遊': 73, '農業': 8, '財經': 5, '科技': 2, '體育': 1, '遊戲': 1})
predict 農業 :  Counter({'農業': 68, '旅遊': 9, '科技': 6, '體育': 4, '財經': 4, '遊戲': 2})
predict 遊戲 :  Counter({'遊戲': 76, '科技': 8, '體育': 6, '旅遊': 5, '財經': 3, '農業': 3})
