# 1. Training Data

In [1]:
# import 需要的 package
import re
import math
import functools
import pandas as pd
from collections import Counter

In [2]:
# Read TrainingData and split it on newline characters, one article per entry.
# The context manager guarantees the file handle is closed once the content
# has been read (the original left it open for the rest of the session).
with open("TrainingData.txt","r") as fp:
    train = fp.read().split(sep='\n')

In [3]:
# Split every article on tab characters and collect the resulting field
# lists in new_train.
new_train = [article.split(sep='\t') for article in train]

In [4]:
# Separate the fields of every record in new_train into three parallel lists:
# position 0 = article ID, position 1 = article label, position 2 = article text.
trainID = [record[0] for record in new_train]

trainlabel = [record[1] for record in new_train]

traincorpus = [record[2] for record in new_train]

In [5]:
# Load the stopword and punctuation lists and merge them into Removewords.
# Both files are opened through context managers so their handles are closed
# as soon as the content has been read.
with open("stopword_chinese.txt","r") as stopword:
    stopwords = stopword.read()
sws = stopwords.split()

with open("punctuation.txt","r") as punctuation:
    punc = punctuation.read()
puc = punc.split()

sws.extend(puc)
Removewords = sws

# Build the removal pattern once instead of rebuilding it for every article.
# NOTE(review): the pattern is a character class, so every individual character
# that occurs in any stopword/punctuation entry is stripped -- not whole words.
# Kept as-is so training and test preprocessing stay identical.
remove_pattern = re.compile('[%s]' % re.escape(''.join(sws)))

# Lower-case each article and strip the unwanted characters with re.
traincorp = [remove_pattern.sub('', text.lower()) for text in traincorpus]

# Split each cleaned article on whitespace into tokens.
traincorpus_token = [text.split() for text in traincorp]

In [6]:
# Build the vocabulary of the training data: flatten the per-article token
# lists into one list, then keep each distinct token once.
train_term = [token for tokens in traincorpus_token for token in tokens]
vocabulary = list(set(train_term))

In [7]:
# Count the labels with Counter and derive P(c), the fraction of training
# articles labelled 'sports'; Pnotc is its complement.
c = Counter(trainlabel)
Pc = c['sports'] / len(trainlabel)
Pnotc = 1 - Pc

In [8]:
# Collect the per-article token lists of the sports and politics classes.
# The split is driven by the article labels rather than by hard-coded row
# ranges (0:2000 / 2000:3500), so it no longer depends on the ordering or the
# size of the training file. For a file laid out as the original slices
# assumed, the result is identical.
sports_token = [tokens for tokens, label in zip(traincorpus_token, trainlabel)
                if label == 'sports']
politics_token = [tokens for tokens, label in zip(traincorpus_token, trainlabel)
                  if label == 'politics']

In [9]:
# Flatten the per-article token lists into one token list per class
# (sports and politics).
sports_term = [token for tokens in sports_token for token in tokens]
politics_term = [token for tokens in politics_token for token in tokens]

In [10]:
# Count token occurrences per class with Counter, then store the counts as
# plain dictionaries.
s, p = Counter(sports_term), Counter(politics_term)

s_dict, p_dict = dict(s), dict(p)

# 2. Testing Data

In [11]:
# Read TestData and split it on newline characters, one article per entry.
# The context manager guarantees the file handle is closed once the content
# has been read (the original left it open).
with open("TestData.txt","r") as fp:
    test = fp.read().split(sep='\n')

In [12]:
# Split every test article on tab characters and collect the resulting field
# lists in new_test.
new_test = [article.split(sep='\t') for article in test]

In [13]:
# Separate the fields of every record in new_test into two parallel lists:
# position 0 = article ID, position 1 = article text.
testID = [record[0] for record in new_test]

testcorpus = [record[1] for record in new_test]

In [14]:
# Strip the Removewords characters with re.
# The pattern depends only on sws, so it is built once here instead of being
# re-escaped and re-assembled for every article inside the loop.
test_remove_pattern = re.compile('[%s]' % re.escape(''.join(sws)))
testcorp = [test_remove_pattern.sub('', text.lower()) for text in testcorpus]

# Split each cleaned article on whitespace into tokens.
testcorpus_token = [text.split() for text in testcorp]

In [15]:
# Scoring functions: every word of an article is scored with the probability
# estimated from the training counts. The probabilities are below 1, so their
# logs are used to keep a long product from shrinking toward zero.
def Pc_of_d(document):
    """Return the add-one smoothed log-probability of each token under the sports class.

    For each token in ``document`` the value is
    ``log((count + 1) / (len(sports_term) + len(vocabulary)))``, where ``count``
    is the token's entry in ``s_dict`` (0 when the token is absent).
    The result is a list with one float per token, in document order.
    """
    denominator = len(sports_term) + len(vocabulary)
    return [math.log((s_dict.get(token, 0) + 1) / denominator)
            for token in document]

def Pnotc_of_d(document):
    """Return the add-one smoothed log-probability of each token under the politics class.

    For each token in ``document`` the value is
    ``log((count + 1) / (len(politics_term) + len(vocabulary)))``, where
    ``count`` is the token's entry in ``p_dict`` (0 when the token is absent).
    The result is a list with one float per token, in document order.
    """
    denominator = len(politics_term) + len(vocabulary)
    return [math.log((p_dict.get(token, 0) + 1) / denominator)
            for token in document]

In [16]:
# For every test article, compute the per-token log-probabilities of
# belonging to sports (new_Pcd) and of not belonging to sports (new_Pnotcd).
new_Pcd = [Pc_of_d(tokens) for tokens in testcorpus_token]

new_Pnotcd = [Pnotc_of_d(tokens) for tokens in testcorpus_token]

In [17]:
# Sum the per-token log-probabilities of every article and add the log prior
# of the class (sports / not sports).
# The reduce calls get an explicit 0.0 initializer: without it an article that
# has no tokens left after cleaning raises TypeError. With it such an article
# is scored by the prior alone, and all other sums are unchanged.
Pc_d = [math.log(Pc) + functools.reduce(lambda x, y: x + y, token_logs, 0.0)
        for token_logs in new_Pcd]

Pnotc_d = [math.log(Pnotc) + functools.reduce(lambda x, y: x + y, token_logs, 0.0)
           for token_logs in new_Pnotcd]

In [18]:
# Decide the class of every article: 'sports' when its sports score is
# strictly greater than its not-sports score, otherwise 'politics'.
result_label = ['sports' if sports_score > other_score else 'politics'
                for sports_score, other_score in zip(Pc_d, Pnotc_d)]

In [19]:
# Write the predictions to a CSV file with pandas: one column with the
# article IDs and one with the predicted labels.
d = {
    'DocumentID': pd.Series(testID),
    'DocumentLabel': pd.Series(result_label),
}
df = pd.DataFrame(data=d)
df.to_csv('test_result.csv', header=True)