In [45]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the row shuffle below is repeatable.
np.random.seed(0)
df = pd.read_csv('./gepd.csv',encoding = "ISO-8859-1")  # decode the CSV as ISO-8859-1

df = df.reindex(np.random.permutation(df.index))  # shuffle the row order of df using np.random.permutation()

#df.head(3)

Unnamed: 0,article,rating
54,Defensive drivers look in their driving mirror...,2
146,Egypt's Antiquities Ministry announced that it...,2
63,Don't miss this once-in-a-lifetime opportunity...,2


In [47]:
# Bag-of-words model: sentence -> words -> bag of words (sparse matrix)
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()

In [48]:
# Compute TF-IDF weights, using L2 normalization
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
#print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

In [49]:
# Document cleaning
# (implemented with regular expressions)
import re
def preprocessor(text):
    """Normalise one raw document for bag-of-words tokenisation.

    Steps, in order:
      1. strip HTML/XML tags;
      2. collect emoticons such as ``:)``, ``;-(``, ``=D``, ``:P`` before the
         punctuation they are made of is thrown away;
      3. lower-case the text and collapse every run of non-word characters
         into a single space;
      4. append the collected emoticons (with the ``-`` nose removed) at the
         end, separated from the text by whitespace.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Cleaned text followed by the space-separated emoticons, if any.
    """
    text = re.sub(r'<[^>]*>', '', text)  # drop HTML tags
    # Collect emoticons first; step 3 would otherwise destroy them.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())  # non-word runs -> one space
    suffix = ' '.join(emoticons).replace('-', '')  # ":-)" -> ":)"
    # Without a separator the first emoticon fuses with the last word whenever
    # the cleaned text ends in a word character ("loved it" + ":)" -> "it:)"),
    # which corrupts whitespace tokenisation downstream.
    if suffix and cleaned and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + suffix

In [50]:
# All texts, before cleaning
#print(df['article'].head(20))

In [51]:
# All texts, after cleaning.
# NOTE: this overwrites df['article'] in place, so re-running the cell applies
# preprocessor() again to text that has already been cleaned.
df['article'] = df['article'].apply(preprocessor)
#print(df['article'].head(20))

54     defensive drivers look in their driving mirror...
146    egypt s antiquities ministry announced that it...
63     don t miss this once in a lifetime opportunity...
55     according to newspaper reports flooding in the...
125    in modern hospitals the most popular treatment...
100    some of curt wilder s books have been widely r...
7      elementary school students don t have as much ...
155    the european union s regulator sent microsoft ...
104    the digital age is dawning and that s good new...
89     in public service lectures fire fighters not o...
138    former nazi ss officer oskar groening known as...
143    sometimes being friendly and flashing a big sm...
5      the student raced out of the classroom and bum...
97     bruce read two film scripts he didn t enjoy on...
163    chiloe a beautiful island chain lies off the c...
93     the current economic recession is threatening ...
33     the babysitter has been with the child since t...
18     dear grandma and grandpa

In [52]:
# Convert documents into tokens
from nltk.stem.porter import PorterStemmer

# Shared stemmer instance used by tokenizer_porter() below.
porter = PorterStemmer()

def tokenizer(text):
    """Method 1: break ``text`` into tokens on runs of whitespace."""
    tokens = text.split()
    return tokens
def tokenizer_porter(text):
    """Method 2: split ``text`` on whitespace and stem each word.

    Every word is passed through the module-level ``porter`` stemmer; the
    stems are returned in their original order.
    """
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

In [53]:
# From a command prompt, run: pip install nltk
# Handle stop words (common but uninformative words: is, and, or, ...)
import nltk
nltk.download('stopwords')

# Use the English stop-word list
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [54]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to the
# old module only on installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the rows as the test set; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:,'article'], df.loc[:, 'rating'], test_size=0.2, random_state=0)

In [55]:
#X_train.shape
#y_train.shape
#X_test.shape
#y_test.shape

(35,)

In [83]:
import sys
import os
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

    
# Build TF-IDF features. The vocabulary and IDF weights are fitted on the
# training articles only; the held-out articles are then transformed with that
# already-fitted vectorizer.
# NOTE(review): `stop` holds unstemmed words while `tokenizer_porter` stems
# every token, so some stop words may no longer match their stemmed tokens --
# confirm this is intended.
vectorizer = TfidfVectorizer(
    tokenizer=tokenizer_porter,
    stop_words=stop,
    min_df=5,
    max_df=0.8,
    sublinear_tf=True,
    use_idf=True,
)
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

# timeit.default_timer selects a high-resolution clock; time.time() is too
# coarse for fits this short (the recorded run printed 0.000000s).
from timeit import default_timer


def _fit_predict_timed(classifier, X_fit, y_fit, X_eval):
    """Fit ``classifier``, predict on ``X_eval`` and time both steps.

    Parameters
    ----------
    classifier : object exposing ``fit(X, y)`` and ``predict(X)``
    X_fit, y_fit : training features and labels passed to ``fit``
    X_eval : features passed to ``predict``

    Returns
    -------
    tuple
        ``(fitted, predictions, train_seconds, predict_seconds)`` where
        ``fitted`` is whatever ``classifier.fit`` returned.
    """
    started = default_timer()
    fitted = classifier.fit(X_fit, y_fit)
    trained = default_timer()
    predictions = classifier.predict(X_eval)
    finished = default_timer()
    return fitted, predictions, trained - started, finished - trained


# SVM: svm.SVC() with its default kernel (RBF)
classifier_rbf = svm.SVC()
(fit_svm_rbf, prediction_rbf,
 time_rbf_train, time_rbf_predict) = _fit_predict_timed(
    classifier_rbf, train_vectors, y_train, test_vectors)

# SVM: svm.SVC with a linear kernel
classifier_linear = svm.SVC(kernel='linear')
(fit_svm_linear, prediction_linear,
 time_linear_train, time_linear_predict) = _fit_predict_timed(
    classifier_linear, train_vectors, y_train, test_vectors)

# Linear SVM: svm.LinearSVC (a separate estimator, not SVC(kernel='linear'))
classifier_liblinear = svm.LinearSVC()
(fit_svm_liblinear, prediction_liblinear,
 time_liblinear_train, time_liblinear_predict) = _fit_predict_timed(
    classifier_liblinear, train_vectors, y_train, test_vectors)

# Naive Bayes: multinomial variant
classifier_MultinomialNB = MultinomialNB(alpha=1.0, fit_prior=True)
(fit_MultinomialNB, prediction_MultinomialNB,
 time_MultinomialNB_train, time_MultinomialNB_predict) = _fit_predict_timed(
    classifier_MultinomialNB, train_vectors, y_train, test_vectors)

# Naive Bayes: Bernoulli variant (features binarised at 0.0)
classifier_BernoulliNB = BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
(fit_BernoulliNB, prediction_BernoulliNB,
 time_BernoulliNB_train, time_BernoulliNB_predict) = _fit_predict_timed(
    classifier_BernoulliNB, train_vectors, y_train, test_vectors)

# k-nearest neighbours with k=3
classifier_neigh = KNeighborsClassifier(n_neighbors=3)
(fit_KNN, prediction_KNN,
 time_KNN_train, time_KNN_predict) = _fit_predict_timed(
    classifier_neigh, train_vectors, y_train, test_vectors)

# Logistic regression
classifier_Logistic = LogisticRegression()
(Logistic_KNN, prediction_Logistic,
 time_Logistic_train, time_Logistic_predict) = _fit_predict_timed(
    classifier_Logistic, train_vectors, y_train, test_vectors)
# `Logistic_KNN` is a misnomer kept so existing references keep working;
# `fit_Logistic` follows the fit_* naming used by the other classifiers.
fit_Logistic = Logistic_KNN



# Report, for each classifier in the order it was trained: a heading, the
# measured fit/predict times, and the per-class precision/recall/F1 table.
_reports = (
    ("Results for SVC(kernel=rbf)", time_rbf_train, time_rbf_predict, prediction_rbf),
    ("Results for SVC(kernel=linear)", time_linear_train, time_linear_predict, prediction_linear),
    ("Results for LinearSVC()", time_liblinear_train, time_liblinear_predict, prediction_liblinear),
    ("Results for MultinomialNB", time_MultinomialNB_train, time_MultinomialNB_predict, prediction_MultinomialNB),
    ("Results for BernoulliNB", time_BernoulliNB_train, time_BernoulliNB_predict, prediction_BernoulliNB),
    ("Results for KNN", time_KNN_train, time_KNN_predict, prediction_KNN),
    ("Results for LogisticRegression", time_Logistic_train, time_Logistic_predict, prediction_Logistic),
)
for _heading, _train_secs, _predict_secs, _predicted in _reports:
    print(_heading)
    print("Training time: %fs; Prediction time: %fs" % (_train_secs, _predict_secs))
    print(classification_report(y_test, _predicted))
    print()

Results for SVC(kernel=rbf)
Training time: 0.006412s; Prediction time: 0.000000s
             precision    recall  f1-score   support

          1       0.00      0.00      0.00        10
          2       0.43      1.00      0.60        15
          3       0.00      0.00      0.00        10

avg / total       0.18      0.43      0.26        35

Results for SVC(kernel=linear)
Training time: 0.005343s; Prediction time: 0.001069s
             precision    recall  f1-score   support

          1       0.86      0.60      0.71        10
          2       0.60      1.00      0.75        15
          3       1.00      0.30      0.46        10

avg / total       0.79      0.69      0.65        35

Results for LinearSVC()
Training time: 0.001069s; Prediction time: 0.001069s
             precision    recall  f1-score   support

          1       0.86      0.60      0.71        10
          2       0.60      0.80      0.69        15
          3       0.50      0.40      0.44        10

avg / to

  'precision', 'predicted', average, warn_for)
