In [46]:
import pandas as pd
#1. 資料前處理
#a. 讀取csv檔僅保留"text"、"stars"兩個欄位，並將stars欄位內值大於等於4的轉成1，其餘轉成0，1: positive; 0: negative
df = pd.read_csv("./Data/yelp.csv")
df = df[["stars", "text"]]
df['stars'] = df['stars'].map(lambda x: 1 if x >= 4 else 0)  # 二元分類

In [47]:
#b.去除停頓詞stop words 
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# 去除停用詞
nltk.download('stopwords')
nltk_stopwords = nltk.corpus.stopwords.words('english')
stop_words = nltk_stopwords

# 文字轉向量：CountVectorizer 和去除停用詞
corpus = df["text"].tolist()
vectorizer = CountVectorizer(stop_words=stop_words, min_df=0.01)
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names_out()



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeweilin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [48]:
# 使用 TF-IDF 轉換文本
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
tfidf_vec = tfidf.toarray()

# 創建 DataFrame 包含 TF-IDF 向量及其標籤
tfidf_df = pd.DataFrame(tfidf_vec, columns=features)
tfidf_df['y'] = df['stars']

# 去除缺失值
tfidf_df = tfidf_df.dropna()

# 顯示 DataFrame
print(tfidf_df.head())

    00   10  100        11   12   15   20   25   30   40  ...  yeah  year  \
0  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
1  0.0  0.0  0.0  0.106818  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
2  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
3  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   
4  0.0  0.0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   

   years  yelp  yes  yet  young  yum  yummy  y  
0    0.0   0.0  0.0  0.0    0.0  0.0    0.0  1  
1    0.0   0.0  0.0  0.0    0.0  0.0    0.0  1  
2    0.0   0.0  0.0  0.0    0.0  0.0    0.0  1  
3    0.0   0.0  0.0  0.0    0.0  0.0    0.0  1  
4    0.0   0.0  0.0  0.0    0.0  0.0    0.0  1  

[5 rows x 1049 columns]


In [49]:
text_seg_binary_list = X.toarray()
text_seg_list = []

for row in text_seg_binary_list:
    temp_list = [features[j] for j in range(len(row)) if row[j] == 1]
    text_seg_list.append(temp_list)

In [50]:
from gensim.models import Word2Vec
import numpy as np
import pandas as pd

# 訓練 Word2Vec
vector_size = 250
model = Word2Vec(sentences=text_seg_list, vector_size=vector_size, epochs=10)

# word embedding
word2vec_vec = [] 
for i in range(len(text_seg_list)):
    vector_sum = np.zeros(vector_size)
    count = 0
    for j in range(len(text_seg_list[i])):
        word = text_seg_list[i][j]
        if word in model.wv:  # 檢查詞是否在模型的詞彙中
            vector_sum += model.wv[word]
            count += 1
    
    if count > 0:  # 確保不會除以零
        vector_average = vector_sum / count
    else:
        vector_average = np.zeros(vector_size)  # 若沒有有效詞向量，則使用零向量
    
    word2vec_vec.append(vector_average.tolist())

# 將結果放入 DataFrame
word2vec_df = pd.DataFrame(word2vec_vec)
word2vec_df['y'] = df['stars']
word2vec_df = word2vec_df.dropna()

# 顯示結果
print(word2vec_df.head())

          0         1         2         3         4         5         6  \
0 -0.055891  0.119856 -0.055028 -0.003073 -0.047791 -0.006065  0.058628   
1  0.022537 -0.009645  0.039787 -0.065693 -0.117083  0.080017  0.114434   
2  0.133588 -0.117555  0.243738 -0.074329 -0.337692  0.185905  0.294554   
3 -0.109145  0.077581  0.071527 -0.066897 -0.020368 -0.261418  0.243756   
4 -0.115342  0.146493 -0.168324 -0.078551 -0.047777 -0.153304  0.091618   

          7         8         9  ...       241       242       243       244  \
0  0.043664 -0.017931 -0.087957  ...  0.089980 -0.017116  0.094483  0.046779   
1  0.004489 -0.065123 -0.057993  ...  0.096067  0.034653  0.166542 -0.026478   
2 -0.172235 -0.144143 -0.564826  ...  0.480761 -0.034307  0.284486  0.115924   
3 -0.227156  0.234997 -0.065524  ...  0.152017 -0.064036  0.148587 -0.097497   
4 -0.142137  0.130455  0.099909  ...  0.130070  0.071522  0.116430 -0.077419   

        245       246       247       248       249  y  
0  0.048924

In [51]:
import random
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

def K_fold_CV(k, data, model):
    kf = KFold(n_splits=k, shuffle=True, random_state=123)  # 使用 KFold 進行資料分割
    Accuracy = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(data), 1):
        print(f"目前 test fold = {fold}", end=" ")
        
        train_x, test_x = data.iloc[train_idx].drop("y", axis=1), data.iloc[test_idx].drop("y", axis=1)
        train_y, test_y = data.iloc[train_idx]["y"], data.iloc[test_idx]["y"]
        
        model.fit(train_x, train_y)
        test_y_predicted = model.predict(test_x)

        acc = accuracy_score(test_y, test_y_predicted)  # 計算準確度
        Accuracy.append(acc)
        
        print(f", accuracy = {round(acc, 4)}")

    avg_acc = np.mean(Accuracy)
    print("---------------------------------------")
    print(f"Average accuracy: {round(avg_acc, 4)}")
    #return round(avg_acc, 4)


In [52]:
# 建立隨機森林模型
forest = RandomForestClassifier(n_estimators=200, criterion="entropy", random_state=123)
# 執行 4-fold CV
K_fold_CV(4, tfidf_df, forest)

目前 test fold = 1 , accuracy = 0.8012
目前 test fold = 2 , accuracy = 0.8088
目前 test fold = 3 , accuracy = 0.802
目前 test fold = 4 , accuracy = 0.8
---------------------------------------
Average accuracy: 0.803


In [53]:
# 建立 random forest 模型
forest = RandomForestClassifier(n_estimators = 200, criterion="entropy", random_state=123)
# word2vec 4-fold cv
K_fold_CV(4, word2vec_df, forest)

目前 test fold = 1 , accuracy = 0.7004
目前 test fold = 2 , accuracy = 0.6964
目前 test fold = 3 , accuracy = 0.7008
目前 test fold = 4 , accuracy = 0.7032
---------------------------------------
Average accuracy: 0.7002
