In [None]:
!pip install datasets
!pip install scikit-learn
!pip install -U transformers --upgrade --force-reinstall
!pip install pandas

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting numpy>=1.17 (from transformers)
  Downloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting packaging>=20.0 (from transformers)
  Downloading packaging-25.0-py3-none-any.whl.metadata (3.3 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad



In [None]:

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd
import os
os.environ["WANDB_DISABLED"] = "true"

# Load the labelled message dataset.
df = pd.read_csv("final_fraud_and_normal_messages_1000.csv")

# Split into train / test sets, stratified on the label column so both splits keep
# the same class ratio. A fixed random_state makes the split (and therefore the
# reported metrics) reproducible across runs.
# NOTE(review): if the CSV contains duplicate messages they can land in both splits
# and inflate test accuracy — worth checking given the perfect score.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["是否詐騙"],
    random_state=42,
)

# Convert to Hugging Face Dataset objects (index reset so it is not carried along
# as an extra column).
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

def tokenize(example):
    """Tokenize the message text of a dataset example (or batch of examples).

    Reads the "留言內容" column and passes it to the module-level ``tokenizer``
    with truncation enabled and ``padding="max_length"``.

    Args:
        example: Mapping with a "留言內容" entry holding the message text(s).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    # NOTE(review): no explicit max_length is given, so every row is presumably
    # padded to the tokenizer's model maximum — confirm this is intended.
    messages = example["留言內容"]
    encoded = tokenizer(messages, truncation=True, padding="max_length")
    return encoded

# Tokenize each split in batches, then rename the target column to "label".
train_dataset = train_dataset.map(tokenize, batched=True).rename_column("是否詐騙", "label")
test_dataset = test_dataset.map(tokenize, batched=True).rename_column("是否詐騙", "label")

# Load pretrained Chinese BERT with a two-class sequence-classification head.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-chinese",
    num_labels=2,
)

# Training configuration.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch (`eval_strategy` is the current name
    # of the former `evaluation_strategy` argument).
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    # Log the training loss every epoch; with the default step-based logging the
    # run finishes before the first log, so the results table shows "No log".
    logging_strategy="epoch",
    # Disable external logging integrations explicitly; the WANDB_DISABLED
    # environment variable is reported as deprecated in favour of `report_to`.
    report_to="none",
    # Keep dataset columns the model does not consume (e.g. the raw text column).
    remove_unused_columns=False,
)

from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute classification accuracy from an evaluation prediction.

    Args:
        eval_pred: An object that unpacks into ``(logits, labels)``.

    Returns:
        A dict with a single key "accuracy": the accuracy of the arg-max class
        (taken over the last axis of the logits) against the labels.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1)
    return {"accuracy": accuracy_score(gold, predicted)}

# Build the Trainer from the model, the training configuration, both dataset
# splits and the accuracy metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune, then write the final model to ./results.
trainer.train()
trainer.save_model("./results")
# The tokenizer is not handed to the Trainer, so save it explicitly; this makes
# ./results a self-contained directory holding both model and tokenizer files.
tokenizer.save_pretrained("./results")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,8.7e-05,1.0
2,No log,4.8e-05,1.0
3,No log,4e-05,1.0


In [None]:
# Zip the entire model output folder. This includes every checkpoint-* subfolder
# together with its optimizer / scheduler state, as the listing printed below shows.
!zip -r bert_trained_model.zip ./results

# Download the zip file through the Colab file helper.
from google.colab import files
files.download("bert_trained_model.zip")


  adding: results/ (stored 0%)
  adding: results/checkpoint-300/ (stored 0%)
  adding: results/checkpoint-300/model.safetensors (deflated 7%)
  adding: results/checkpoint-300/scheduler.pt (deflated 56%)
  adding: results/checkpoint-300/rng_state.pth (deflated 24%)
  adding: results/checkpoint-300/config.json (deflated 54%)
  adding: results/checkpoint-300/training_args.bin (deflated 52%)
  adding: results/checkpoint-300/trainer_state.json (deflated 64%)
  adding: results/checkpoint-300/optimizer.pt (deflated 23%)
  adding: results/checkpoint-100/ (stored 0%)
  adding: results/checkpoint-100/model.safetensors (deflated 7%)
  adding: results/checkpoint-100/scheduler.pt (deflated 56%)
  adding: results/checkpoint-100/rng_state.pth (deflated 24%)
  adding: results/checkpoint-100/config.json (deflated 54%)
  adding: results/checkpoint-100/training_args.bin (deflated 52%)
  adding: results/checkpoint-100/trainer_state.json (deflated 57%)
  adding: results/checkpoint-100/optimizer.pt (deflate

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Save the trained model to ./results. The training cell already ends with this
# same call, so this cell only repeats it.
trainer.save_model("./results")


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the tokenizer from the base "bert-base-chinese" checkpoint and the
# freshly fine-tuned classification model from the local ./results directory.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = BertForSequenceClassification.from_pretrained("./results")

# Prediction helper
def predict(text):
    """Classify one message, or a batch of messages, as fraud or normal.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: A single message string, or a list/tuple of message strings.

    Returns:
        For a single string: a ``(label, confidence)`` tuple, where ``label`` is
        "詐騙留言" when the predicted class index is 1 and "正常留言" otherwise,
        and ``confidence`` is the softmax probability of the predicted class.
        For a list/tuple: a list of such tuples, one per message, in input order.
    """
    is_batch = isinstance(text, (list, tuple))
    if is_batch and len(text) == 0:
        return []

    inputs = tokenizer(
        list(text) if is_batch else text,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # Reduce over the class dimension only. A bare torch.argmax(probs) indexes the
    # flattened (rows x classes) tensor, which is only right when there is one row.
    labels = probs.argmax(dim=-1).tolist()
    confidences = probs.max(dim=-1).values.tolist()

    results = [
        ("詐騙留言" if label == 1 else "正常留言", confidence)
        for label, confidence in zip(labels, confidences)
    ]
    return results if is_batch else results[0]

In [None]:
# Sample listing: selling an iPad Mini, bank transfer first, then meet in person.
text = "出售iPad Mini，先匯款，面交"
result, score = predict(text)  # (label string, confidence of the predicted class)
print(f"結果：{result}（信心分數：{score:.2f}）")

結果：詐騙留言（信心分數：0.97）


In [None]:
# Sample listing: selling an iPad Mini, transfer first to reserve it, in-person
# handover possible in Taipei, details via private message.
text = "出售iPad Mini，先匯款保留，可於台北面交，私訊聊"
result, score = predict(text)  # (label string, confidence of the predicted class)
print(f"結果：{result}（信心分數：{score:.2f}）")

結果：詐騙留言（信心分數：1.00）


In [None]:
# Sample listing: selling an iPad Mini, payment made at the in-person handover.
text = "出售iPad Mini，含面交時付款"
result, score = predict(text)  # (label string, confidence of the predicted class)
print(f"結果：{result}（信心分數：{score:.2f}）")

結果：正常留言（信心分數：0.87）


In [None]:
# 準確度
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute classification accuracy from an evaluation prediction.

    This re-declares the function of the same name defined in the training cell;
    the existing ``trainer`` was constructed with that earlier function object.

    Args:
        eval_pred: An object that unpacks into ``(logits, labels)``.

    Returns:
        A dict with a single key "accuracy": the accuracy of the arg-max class
        (taken over the last axis of the logits) against the labels.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1)
    return {"accuracy": accuracy_score(gold, predicted)}

In [None]:
# Evaluate on the eval dataset given to the Trainer; the returned metrics dict is
# shown as the cell output.
trainer.evaluate()

{'eval_loss': 2.993955058627762e-05,
 'eval_accuracy': 1.0,
 'eval_runtime': 5.5576,
 'eval_samples_per_second': 35.987,
 'eval_steps_per_second': 4.498,
 'epoch': 3.0}

In [None]:
# Evaluate again and print only the accuracy, formatted to four decimal places.
results = trainer.evaluate()
print(f"模型準確率：{results['eval_accuracy']:.4f}")

模型準確率：1.0000


※ 如何在自己的電腦上使用🤔：


In [None]:
# Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Step 2: Copy the zip file from Drive into the Colab working directory and extract it.
# Change the source path below to wherever the zip was uploaded in your own Drive.
!cp "/content/drive/MyDrive/計算思維與人工智慧期末/bert_trained_model.zip" . #此處要改成zip在你雲端的上傳位置
# The archive entries already begin with "results/", so extracting into ./results
# produces ./results/results/... — the nested path used when the model is loaded.
!unzip bert_trained_model.zip -d ./results



In [None]:
# Step 3：安裝套件（只需執行一次）
!pip install -U transformers
!pip install torch


In [None]:
# Step 4：載入模型並定義預測函式
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the tokenizer from the base "bert-base-chinese" checkpoint and the
# fine-tuned model from the extracted checkpoint-300 folder.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = BertForSequenceClassification.from_pretrained("./results/results/checkpoint-300")

# Prediction helper
def predict(text):
    """Classify one message, or a batch of messages, as fraud or normal.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: A single message string, or a list/tuple of message strings.

    Returns:
        For a single string: a ``(label, confidence)`` tuple, where ``label`` is
        "詐騙留言" when the predicted class index is 1 and "正常留言" otherwise,
        and ``confidence`` is the softmax probability of the predicted class.
        For a list/tuple: a list of such tuples, one per message, in input order.
    """
    is_batch = isinstance(text, (list, tuple))
    if is_batch and len(text) == 0:
        return []

    inputs = tokenizer(
        list(text) if is_batch else text,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # Reduce over the class dimension only. A bare torch.argmax(probs) indexes the
    # flattened (rows x classes) tensor, which is only right when there is one row.
    labels = probs.argmax(dim=-1).tolist()
    confidences = probs.max(dim=-1).values.tolist()

    results = [
        ("詐騙留言" if label == 1 else "正常留言", confidence)
        for label, confidence in zip(labels, confidences)
    ]
    return results if is_batch else results[0]



In [None]:
# Step 5: Enter the message you want to test (sample: selling a PS5 console).
text = "出售PS5主機"
result, score = predict(text)  # (label string, confidence of the predicted class)
print(f"判斷結果：{result}（信心分數：{score:.2f}）")