# 安裝必要的套件

In [None]:
!pip install jieba --quiet
!pip install tensorflow --quiet
!pip install pandas --quiet

# 步驟 1: 上傳檔案

In [None]:
from google.colab import files
# Upload the word lists through google.colab; select Cleaned_Positive_Words.xlsx
# and Negative_Words_List.xlsx.
# NOTE(review): the loading cell further down reads positive_words.xlsx and
# negative_words.xlsx under ./data/emotion_analysis/dataset/ — confirm the
# uploaded files are renamed/moved to match those paths.
uploaded = files.upload()

# Main Code

### Import packages

In [1]:
# 導入必要模組
import jieba
import numpy as np
import pandas as pd
from pathlib import Path

from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.src.models import Sequential
from keras.src.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

2025-02-07 16:20:25.500460: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-07 16:20:25.537393: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-07 16:20:25.548403: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-07 16:20:25.690322: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### define path

In [None]:
# Load the combined emotion dataset and split it into positive / negative rows.
# Rows containing any missing value are dropped from each half.
dataset_path = Path("./data/emotion_analysis/dataset/emotion_analyse_dataset.csv")
data_frame = pd.read_csv(dataset_path)

# assumes `is_positive` is a boolean column (it is used directly as a mask) — TODO confirm
is_positive_mask = data_frame['is_positive']
positive_words_csv = data_frame[is_positive_mask].dropna()
negative_words_csv = data_frame[~is_positive_mask].dropna()

In [2]:
# Step 2: locations of the positive and negative word-list workbooks
positive_data_path = './data/emotion_analysis/dataset/positive_words.xlsx'
negative_data_path = './data/emotion_analysis/dataset/negative_words.xlsx'

# Step 3: read both workbooks; header=None keeps every row as data and gives
# the columns integer labels (0, 1, ...)
positive_reviews = pd.read_excel(positive_data_path, header=None)
negative_reviews = pd.read_excel(negative_data_path, header=None)

# Take the second column of each sheet, skip its first row (the title row)
# and discard empty cells.
positive_column = positive_reviews[1]
negative_column = negative_reviews[1]
positive_words = positive_column.iloc[1:].dropna()
negative_words = negative_column.iloc[1:].dropna()

In [4]:
# Build the synthetic review dataset: every sentiment word is dropped into a
# fixed product-review template. f-strings are used (instead of `+`) so that a
# non-string cell read from the spreadsheet (e.g. a number) is formatted rather
# than raising TypeError.
positive_samples = [f"這是一個非常{word}的產品，值得推薦！" for word in positive_words]
negative_samples = [f"這是一個非常{word}的產品，完全不推薦！" for word in negative_words]

# Labels: 1 for every positive sample, 0 for every negative sample
positive_labels = [1] * len(positive_samples)
negative_labels = [0] * len(negative_samples)

# Concatenate samples and labels in the same order (positives first) so that
# texts[i] corresponds to labels[i].
texts = positive_samples + negative_samples
labels = positive_labels + negative_labels

In [5]:
# Step 4: word-segmentation helper
def preprocess_texts(texts):
    """Segment each text with jieba and join its tokens with single spaces.

    Args:
        texts: Iterable of strings to segment.

    Returns:
        A list with one space-separated token string per input text, in the
        same order as the input.
    """
    segmented = []
    for raw_text in texts:
        tokens = jieba.lcut(raw_text)
        segmented.append(" ".join(tokens))
    return segmented

# Segment every sample; `texts` is rebound to the space-separated token strings
texts = preprocess_texts(texts)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.482 seconds.
Prefix dict has been built successfully.


In [6]:
# Step 5: encode the texts as fixed-length integer sequences
num_words = 5000  # vocabulary size limit passed to the Tokenizer
maxlen = 100      # length every sequence is padded to

# Fit the vocabulary on the segmented texts, then map each text to word indices
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Features: padded index sequences; targets: the 0/1 labels as an array
x_data = pad_sequences(sequences, maxlen=maxlen)
y_data = np.array(labels)

# Hold out 20% of the samples as the test split (fixed seed for repeatability)
split_parts = train_test_split(x_data, y_data, random_state=42, test_size=0.2)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Step 6: build the LSTM classifier
# The deprecated `input_length` argument is no longer passed to Embedding:
# Keras 3 only warns "Argument `input_length` is deprecated. Just remove it."
# and the sequence length is taken from the data given to fit()/predict().
model = Sequential([
    Embedding(input_dim=num_words, output_dim=64),  # word index -> 64-dim vector
    LSTM(units=128, return_sequences=False),  # keep only the final LSTM output
    Dropout(0.5),  # dropout to reduce overfitting
    Dense(units=1, activation='sigmoid')  # single sigmoid unit: positive-class probability
])

# Compile for binary classification
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split is also used as validation data here, so the
# reported validation metrics are not an independent held-out estimate.
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test))

In [None]:
# Step 7: prepare one sample sentence for the model
sample_review = "這是一個非常感人的產品，值得推薦！"  # sample text to classify
# Reuse the helper applied to the training texts so that inference-time
# segmentation cannot drift from the training-time preprocessing.
sample_review = preprocess_texts([sample_review])[0]
sample_sequence = tokenizer.texts_to_sequences([sample_review])  # tokens -> word indices
sample_padded = pad_sequences(sample_sequence, maxlen=maxlen)  # same maxlen as the training data

In [None]:
# Run the model on the prepared sample and report the sigmoid output
prediction = model.predict(sample_padded)
positive_probability = prediction[0][0]
print("Prediction (Positive Sentiment Probability):", positive_probability)

# Interpret the score: above 0.5 is reported as positive, otherwise negative
verdict = "這是一條正面評價！" if positive_probability > 0.5 else "這是一條負面評價！"
print(verdict)