In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [2]:
#資料前處理
#讀csv檔
df = pd.read_csv("Reviews.csv")

# 取前 1 萬筆資料，僅保留 "Text"、"Score"兩個欄位
df = df[['Text', 'Score']].head(10000)

In [3]:
# 將 "Score" 欄位大於等於 4 的轉成 1，其他轉成 0
df['Score'] = df['Score'].apply(lambda x: 1 if x >= 4 else 0)

# 將 "Text" 欄位內的文字利用分割符號切割 (這裡以空白為分割符號，根據實際情況可以更改)
df['Text'] = df['Text'].apply(lambda x: x.split())

# 查看處理後的資料
print(df.head())

                                                Text  Score
0  [I, have, bought, several, of, the, Vitality, ...      1
1  [Product, arrived, labeled, as, Jumbo, Salted,...      0
2  [This, is, a, confection, that, has, been, aro...      1
3  [If, you, are, looking, for, the, secret, ingr...      0
4  [Great, taffy, at, a, great, price., There, wa...      1


In [4]:
#  去除停頓詞stop words 
#下載NLTK 的 stopwords 資料
nltk.download('stopwords')

# 從 NLTK 中載入英語停頓詞，並將其轉換為列表格式
stop_words = list(stopwords.words('english'))


# 初始化 CountVectorizer，其中 stop_words=stop_words 表示在向量化時自動過濾掉指定的停頓詞。
vectorizer = CountVectorizer(stop_words=stop_words)

# 將文本轉換為詞頻矩陣
X_counts = vectorizer.fit_transform(df['Text'].apply(lambda x: ' '.join(x)))


#確保去除停頓詞
print(vectorizer.vocabulary_)

# 查看詞彙矩陣
print(X_counts.shape)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(10000, 18654)


In [5]:
#將文字轉換成向量，請實作 tf-idf 及 word2vec 並進行比較
# 使用tf-idf
# 使用 TfidfVectorizer 來轉換文本為向量
tfidf_vectorizer = TfidfVectorizer(stop_words='english',ngram_range=(1,2), max_df=0.9, min_df=5)
X_tfidf = tfidf_vectorizer.fit_transform(df['Text'].apply(lambda x: ' '.join(x)))

print(X_tfidf.toarray())  # TF-IDF 矩陣
print(tfidf_vectorizer.get_feature_names_out())  # 特徵名稱
# 檢查 TF-IDF 矩陣的形狀
print(X_tfidf.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
['00' '00 box' '000' ... 'zucchini' 'zuke' 'zukes']
(10000, 12619)


In [6]:
#使用 Random Forest 進行分類
# 以 TF-IDF 為例

y = df['Score']

# 建模：使用 Random Forest 進行分類
classifier = RandomForestClassifier()

#使用 TF-IDF 向量化後的文本數據 (X_tfidf) 和標籤 (df['Score']）來訓練隨機森林模型。
classifier.fit(X_tfidf, df['Score'])


# 評估模型：進行 k-fold cross-validation 並計算 k=4 的 Accuracy
scores = cross_val_score(classifier, X_tfidf, y, cv=4,scoring='accuracy')

classifier.fit(X_tfidf, y)

#輸出交叉驗證的準確性
print("Cross-Validation Accuracy: ", scores)
#輸出交叉驗證的平均準確性
print("TF-IDF Mean Accuracy: ", scores.mean())

Cross-Validation Accuracy:  [0.806  0.808  0.8064 0.8112]
TF-IDF Mean Accuracy:  0.8079


In [7]:
#使用 Word2Vec

#下載停用詞
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# 去除停用詞
df['Text'] = df['Text'].apply(lambda x: [word for word in x if word not in stop_words])

# 訓練 Word2Vec 模型
w2v_model = Word2Vec(sentences=df['Text'], vector_size=500, window=20, min_count=1, workers=4)

# 計算每個文本的平均詞向量
def get_average_vector(text):
    vectors = [w2v_model.wv[word] for word in text if word in w2v_model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(w2v_model.vector_size)  # 若沒有有效詞，返回零向量

# 將所有 Tokenized_Text 列轉換為詞向量
df['Text_Vector'] = df['Text'].apply(get_average_vector)

# 構建輸入矩陣 X
X = np.stack(df['Text_Vector'].values)

# 構建輸出標籤 y (df['Score'] 已處理為 0 和 1)
y = df['Score'].values

# 構建隨機森林分類器
rf = RandomForestClassifier()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
#模型評估：k-fold cross-validation k=4 的交叉驗證
# 使用 Word2Vec 特徵進行 4-fold cross-validation
accuracy_scores = cross_val_score(rf, X, y, cv=4, scoring='accuracy')

#輸出交叉驗證的準確性分數
print(f'Accuracy for each fold: {accuracy_scores}')
#輸出交叉驗證的平均準確性分數
print(f'Word2Vec 平均準確率: {accuracy_scores.mean()}')

Accuracy for each fold: [0.7472 0.7508 0.7512 0.7416]
Word2Vec 平均準確率: 0.7477
