In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier
import logging

logging.basicConfig(level=logging.INFO, filename='03_Model_RandomForest_01.log', filemode='a', format='%(asctime)s %(levelname)s: %(message)s')

print("Program Start !!!")

# 讀取資料
data = pd.read_csv("01_DataPreFix.csv")
print("讀取資料 Finish !!!")

# 將 NaN 值替換為空字符串
data["ProcessedReview"].fillna("", inplace=True)
print("將 NaN 值替換為空字符串 Finish !!!")

# 特徵表示
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['ProcessedReview'])
y = data['IsInducing']
print("特徵表示 Finish !!!")

# 切分資料集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("切分資料集 Finish !!!")

# 選擇模型
model = RandomForestClassifier(random_state=42)
print("RandomForestClassifier Finish !!!")

# 訓練模型
model.fit(X_train, y_train)
print("訓練模型 Finish !!!")

# 預測
y_pred = model.predict(X_test)
print("預測 Finish !!!")

# 評估模型
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred)
avg_precision = average_precision_score(y_test, y_pred)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('AUC-ROC: ', auc_roc)
print('Avg Precision Score: ', avg_precision)

# 特徵重要性分析
feature_importances = model.feature_importances_
important_words = sorted(zip(vectorizer.get_feature_names_out(), feature_importances), key=lambda x: x[1], reverse=True)
print("特徵重要性分析 Finish !!!")

print("Important words:")
for word, importance in important_words[:10]:
    print(word, importance)


Program Start !!!
讀取資料 Finish !!!
將 NaN 值替換為空字符串 Finish !!!
特徵表示 Finish !!!
切分資料集 Finish !!!
RandomForestClassifier Finish !!!
訓練模型 Finish !!!
預測 Finish !!!
Accuracy: 0.9978010606648557
Precision: 0.8148148148148148
Recall: 0.25882352941176473
F1 Score: 0.3928571428571429
特徵重要性分析 Finish !!!
Important words:
打卡 0.22962589280270113
五星 0.03945079687037591
好吃 0.01682103887205561
打卡送 0.015407197389045287
薯條 0.013752596080369987
評論 0.01074377875152343
飲料 0.008577151881015713
好評 0.006575960619537161
小菜 0.006551376879215902
服務 0.006160834174905235


使用網格搜索進行超參數調整

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier
import logging

logging.basicConfig(level=logging.INFO, filename='03_Model_RandomForest_02.log', filemode='a', format='%(asctime)s %(levelname)s: %(message)s')

print("Program Start !!!")

# 讀取資料
data = pd.read_csv("01_DataPreFix.csv")
print("讀取資料 Finish !!!")

# 將 NaN 值替換為空字符串
data["ProcessedReview"].fillna("", inplace=True)
print("將 NaN 值替換為空字符串 Finish !!!")

# 特徵表示
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['ProcessedReview'])
y = data['IsInducing']
print("特徵表示 Finish !!!")

# 切分資料集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("切分資料集 Finish !!!")

# 選擇模型
model = RandomForestClassifier(random_state=42)
print("RandomForestClassifier Finish !!!")

# 設定網格搜索的參數範圍
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 執行網格搜索
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# 獲取最佳參數
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# 使用最佳參數重新訓練模型
best_model = RandomForestClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)

# 預測
y_pred = best_model.predict(X_test)
print("預測 Finish !!!")

# 評估模型
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred)
avg_precision = average_precision_score(y_test, y_pred)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('AUC-ROC: ', auc_roc)
print('Avg Precision Score: ', avg_precision)

# 特徵重要性分析
feature_importances = best_model.feature_importances_
important_words = sorted(zip(vectorizer.get_feature_names_out(), feature_importances), key=lambda x: x[1], reverse=True)
print("特徵重要性分析 Finish !!!")

print("Important words:")
for word, importance in important_words[:10]:
    print(word, importance)


Program Start !!!
讀取資料 Finish !!!
將 NaN 值替換為空字符串 Finish !!!
特徵表示 Finish !!!
切分資料集 Finish !!!
RandomForestClassifier Finish !!!
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
預測 Finish !!!
Accuracy: 0.9978010606648557
Precision: 0.8148148148148148
Recall: 0.25882352941176473
F1 Score: 0.3928571428571429
特徵重要性分析 Finish !!!
Important words:
打卡 0.22962589280270113
五星 0.03945079687037591
好吃 0.01682103887205561
打卡送 0.015407197389045287
薯條 0.013752596080369987
評論 0.01074377875152343
飲料 0.008577151881015713
好評 0.006575960619537161
小菜 0.006551376879215902
服務 0.006160834174905235
