- Run this notebook in Google Colab with a T4 GPU runtime.

In [None]:
!pip install simpletransformers
!pip install torchvision
!pip install stanza
!pip install -U ckip-transformers

In [None]:
import stanza
# Download Stanza's models for the language code 'zh'.
# NOTE(review): the original comment called this the Traditional Chinese model,
# but as far as I know 'zh' is Stanza's alias for Simplified Chinese ('zh-hans')
# and Traditional Chinese is 'zh-hant' — confirm against Stanza's model list.
stanza.download('zh')

In [None]:
from simpletransformers.classification import ClassificationModel
import pandas as pd
from sklearn.model_selection import train_test_split
from google.colab import drive
from ckip_transformers.nlp import CkipPosTagger
from sklearn.feature_extraction.text import TfidfVectorizer
import torch

In [None]:
# Mount Google Drive so files stored there are readable from this runtime.
drive.mount('/content/drive')

# Read the raw dataset and rename its columns to the 'text' / 'labels' names
# that the following cells use.
file_path = '/content/drive/MyDrive/Colab Notebooks/output.csv'
fb_label_df = pd.read_csv(file_path).rename(
    columns={'clean_content': 'text', 'label': 'labels'}
)

# 80/20 train/eval split, stratified on the label so both parts keep the same
# class distribution; the fixed random_state makes the split repeatable.
train_df, eval_df = train_test_split(
    fb_label_df,
    test_size=0.2,
    random_state=1,
    stratify=fb_label_df['labels'],
)

In [None]:
# Tools for annotating the training text with part-of-speech (POS) tags and
# dependency relations, ahead of the TF-IDF keyword extraction further down.

# CKIP POS tagger. The driver runs on CPU unless a device is given, so use GPU 0
# when CUDA is available and fall back to CPU (-1) otherwise.
pos_tagger = CkipPosTagger(device=0 if torch.cuda.is_available() else -1)

# Stanza dependency parser. The notebook targets Traditional Chinese, whose
# Stanza language code is 'zh-hant'; the bare code 'zh' resolves to Simplified
# Chinese. The download is idempotent and is repeated here so this cell does not
# depend on which models an earlier cell fetched.
stanza.download('zh-hant')
nlp = stanza.Pipeline(lang="zh-hant", processors="tokenize,pos,lemma,depparse")

def process_text_with_pos_dep(text_list, tagger=None, parser=None):
    """Rewrite each text as its POS-annotated tokens followed by its dependency triples.

    Every text is split on whitespace into tokens. The token lists are POS-tagged
    in one batch, and the raw text is dependency-parsed. The result for one text
    is ``"tok(POS) ... word(deprel,head) ..."``, where ``head`` is the text of the
    governing word, or ``ROOT`` for a sentence root.

    Args:
        text_list: List of strings to annotate.
        tagger: Callable that takes a list of token lists and returns one list of
            tags per token list. Defaults to the module-level ``pos_tagger``.
        parser: Callable that takes a string and returns an object exposing
            ``.sentences``; each sentence exposes ``.words`` whose items carry
            ``.text``, ``.deprel`` and a 1-based ``.head`` (0 for the root).
            Defaults to the module-level ``nlp``.

    Returns:
        List of annotated strings, one per input text, in input order.
    """
    if len(text_list) == 0:
        return []

    tagger = pos_tagger if tagger is None else tagger
    parser = nlp if parser is None else parser

    # The POS tagger works on pre-segmented input (a list of words per sentence).
    # Handing it raw strings makes it treat every character as a word, so its
    # tags would not line up with the whitespace tokens they are paired with below.
    token_lists = [text.split() for text in text_list]
    pos_result = tagger(token_lists)

    processed_texts = []
    for text, tokens, tags in zip(text_list, token_lists, pos_result):
        doc = parser(text)

        dep_info = []
        for sent in doc.sentences:
            words = sent.words
            for word in words:
                # head is 1-based within the sentence; 0 marks the root.
                head_text = words[word.head - 1].text if word.head > 0 else 'ROOT'
                dep_info.append(f"{word.text}({word.deprel},{head_text})")

        # Combine the POS part and the dependency part into one string.
        pos_str = " ".join(f"{token}({tag})" for token, tag in zip(tokens, tags))
        dep_str = " ".join(dep_info)
        processed_texts.append(f"{pos_str} {dep_str}")

    return processed_texts

# Replace the training text with its POS- and dependency-annotated form.
train_texts = train_df['text'].tolist()
train_df['text'] = process_text_with_pos_dep(train_texts)

# Fit a TF-IDF model, capped at 500 features, on the annotated training text.
# The fitted vectorizer is used below to pick keywords for each text.
vectorizer = TfidfVectorizer(max_features=500).fit(train_df['text'])

def get_top_tfidf_words(text, vectorizer, top_n=5):
    """Return the highest-weighted TF-IDF terms of ``text`` as one space-joined string.

    Args:
        text: The document to score.
        vectorizer: A fitted vectorizer providing ``transform`` and
            ``get_feature_names_out``.
        top_n: Maximum number of terms to return.

    Returns:
        Up to ``top_n`` terms ordered by descending weight, separated by single
        spaces. Terms whose weight is not positive are left out, so the result
        may be shorter than ``top_n`` or empty.
    """
    weights = vectorizer.transform([text]).toarray()[0]
    vocabulary = vectorizer.get_feature_names_out()

    # Ascending argsort, reversed, then cut to the top_n largest weights.
    ranked = weights.argsort()[::-1][:top_n]

    selected = []
    for idx in ranked:
        if weights[idx] > 0:
            selected.append(vocabulary[idx])
    return " ".join(selected)

def _with_keywords(doc):
    """Return ``doc`` followed by the keyword marker and its top TF-IDF terms."""
    return doc + " [重要詞] " + get_top_tfidf_words(doc, vectorizer)

# Append the TF-IDF keywords to every training text.
train_df['text'] = train_df['text'].map(_with_keywords)

In [None]:
# Training options passed to the classification model.
train_args = dict(
    num_train_epochs=10,
    train_batch_size=8,
    eval_batch_size=64,
    max_seq_length=512,             # sequence length
    weight_decay=0.01,
    gradient_accumulation_steps=2,  # enable gradient accumulation
    logging_steps=10,
    learning_rate=1.5e-5,
    warmup_ratio=0.05,
    fp16=True,                      # enable half-precision training
    overwrite_output_dir=True,
)

In [None]:
# Build the classifier from the 'ckiplab/bert-base-chinese' checkpoint on CUDA
# device 0 with the options in train_args, then train it on the training frame.
# NOTE(review): num_labels is not passed — confirm the library default matches
# the number of classes in the dataset.
model = ClassificationModel(
    'bert',
    'ckiplab/bert-base-chinese',
    args=train_args,
    use_cuda=True,
    cuda_device=0,
)
model.train_model(train_df)

In [None]:
# Model evaluation on the held-out split.

# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics` is loaded.
import sklearn.metrics

# The model was trained on text that had POS/dependency annotations and TF-IDF
# keywords added, so the evaluation text has to go through the same
# transformation; otherwise the model is scored on an input format it never saw.
# The vectorizer is reused as fitted on the training set (it is not refitted).
# A copy is used so eval_df keeps the raw text and this cell can be re-run.
eval_features_df = eval_df.copy()
eval_features_df['text'] = process_text_with_pos_dep(eval_features_df['text'].tolist())
eval_features_df['text'] = eval_features_df['text'].apply(
    lambda x: x + " [重要詞] " + get_top_tfidf_words(x, vectorizer)
)

result, model_outputs, wrong_predictions = model.eval_model(
    eval_features_df, acc=sklearn.metrics.accuracy_score
)
result

In [None]:
# 存到 google drive
torch.save(model.model.state_dict(), 'complete_model1.pt')
!cp complete_model1.pt '/content/drive/MyDrive/Colab Notebooks/'