In [1]:
import jieba
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import re
from transformers import BertModel, BertTokenizer

# set device to gpu
device = (
"cuda"
if torch.cuda.is_available()
else "mps"
if torch.backends.mps.is_available()
else "cpu"
)
device

  from .autonotebook import tqdm as notebook_tqdm


'mps'

In [2]:
def full_to_half(s):
    # 將字符串 s 中的全形字符和標點符號轉換為半形。
    n = []
    for char in s:
        code = ord(char)
        if code == 0x3000:  # 全形空格直接轉換
            code = 32
        elif 0xFF01 <= code <= 0xFF5E:  # 全形字符（除空格）轉換成半形
            code -= 0xFEE0
        n.append(chr(code))
    return ''.join(n)

In [3]:
with open('../Data/stopwords_tc.txt', encoding='utf-8', mode='r') as f:
    stop_words = []
    for l in f:
        stop_words.append(l.strip())

In [4]:
ptt_food_post_df = pd.read_csv('../Data/Ptt/ptt_food_post_list.csv', index_col='Pid')
gpt_food_post_df = pd.read_csv('../Data/ChatGPT/chatgpt_generated_articles.csv')

In [5]:
def remove_urls_and_phones(text):
    """
    移除文字中的網址和電話號碼。
    """
    # 正則表達式匹配網址
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    text = re.sub(url_pattern, '', text)

    # 正則表達式匹配電話號碼（適用於多種常見格式）
    phone_pattern = r'(\d{2,4}[-.\s]??\d{3,4}[-.\s]??\d{3,4}|\(\d{2,4}\)\s*\d{3,4}[-.\s]??\d{3,4}|\d{10,11})'
    text = re.sub(phone_pattern, '', text)

    return text

In [6]:
def remove_english(text):
    """
    移除文字中的所有英文字符。
    """
    # 正則表達式匹配所有英文字母和英文單詞
    pattern = r'[A-Za-z]+'
    cleaned_text = re.sub(pattern, '', text)

    return cleaned_text

In [7]:
def remove_emojis(text):
    emoji_pattern = re.compile(
        "[" 
        "\U0001F600-\U0001F64F"  # 表情符號
        "\U0001F300-\U0001F5FF"  # 符號和圖案
        "\U0001F680-\U0001F6FF"  # 交通和符號
        "\U0001F700-\U0001F77F"  # 藝術符號
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r'', text)

In [8]:
def load_dictionary(file_path):
    with open(file_path, encoding='utf-8', mode='r') as f:
        dictionary = [l.strip() for l in f]
    return dictionary

In [9]:
def remove_stopwords(text, stopwords):
    # 使用列表推導式過濾掉停用詞
    filtered_words = remove_urls_and_phones(text)
    filtered_words = remove_english(filtered_words)
    filtered_words = remove_emojis(filtered_words)
    filtered_words = "".join(c for c in filtered_words if c not in ('；','，','。','！','：','「','」','…','、','？','【','】','.',':','?',';','!','~','`','+','-','<','>','/','[',']','{','}',"'",'"','\\', ' ', '‧','・','◢','◤','\n','★','☆','◆','◇','◎','○','●','◐','◑','▲','▼','△','▽','◢','◣','◥','◤','▷','◁','▶','◀','♠','♣','♥','♦','♨','⊙','⊕','▨','▧','▦','▥','▤','▣','▢','□','■'))
    filtered_words = "".join(word for word in filtered_words if word not in stopwords)
    filtered_words = filtered_words.replace(u'\u3000', u' ')
    # 將過濾後的單詞列表重新組合成字符串
    return filtered_words

In [10]:
ptt_contents = ptt_food_post_df.loc[:, 'content']
gpt_contents = gpt_food_post_df.loc[:, 'content']
stop_words = load_dictionary('../Data/stopwords_tc.txt')
ptt_remove_stopword_contents = []
gpt_remove_stopword_contents = []
for c in ptt_contents:
    ptt_remove_stopword_contents.append(remove_stopwords(c, stop_words))

for c in gpt_contents:
    gpt_remove_stopword_contents.append(remove_stopwords(c, stop_words))

# 添加標記並合併 DataFrame
ptt_contents_df = pd.DataFrame(ptt_remove_stopword_contents, columns=['remove_stopword_content'])
ptt_contents_df['source'] = 'ptt'

gpt_contents_df = pd.DataFrame(gpt_remove_stopword_contents, columns=['remove_stopword_content'])
gpt_contents_df['source'] = 'gpt'

combined_df = pd.concat([ptt_contents_df, gpt_contents_df], ignore_index=True)
combined_df

Unnamed: 0,remove_stopword_content,source
0,餐廳名稱片消費時間年月電話址台南市仁德區空路號營業時間日圖文版片位台南空眷村家低調披蕯店原平...,ptt
1,餐廳名稱辰壽司割烹消費時間年月址台北市松山區敦化北路號營業時間平價位圖文月底結束前朋友揪局覺...,ptt
2,鍋物前線金鋤壽喜燒烤鍋物新莊店午茶消費日期年月圖真相圖文茂版點新北市新莊區幸福路號電話價位動...,ptt
3,餐廳名稱林口井放鬆心情吃港式點點心消費時間址新北市林口區文化路段號樓電話營業時間週週週週週日...,ptt
4,餐廳名稱夯魯肉飯消費時間年月台北市信義區松山路號圖文網誌分數低評破位數裝潢實文青說真點詹記感...,ptt
...,...,...
2008,食記屏東潮州牛福屏東牛肉料理家次家分享屏東潮州區家美味牛肉料理餐廳牛福裡品嚐道美味牛肉料理回...,gpt
2009,食記評肉次夯餐廳名稱肉次消費時間址台北市安區復興南路段號電話營業時間週週日進入肉次空間刻溫暖...,gpt
2010,食記桃園龍潭糧園茶藝客家館餐廳名稱糧園茶藝客家館消費時間址桃園市龍潭區路號電話營業時間週週日...,gpt
2011,食記台北信義安吉頌丹麥專賣店消費時間年月址台北市信義區松仁路號電話營業時間週週日天台北信義區...,gpt


In [11]:
# 使用 Jieba 進行切詞
def jieba_cut(text):
    stop_words = {'食記', '網誌'}
    words = jieba.lcut(text)
    filtered_words = [word for word in words if word not in stop_words]
    return filtered_words

In [12]:
combined_df['tokenized_text'] = combined_df['remove_stopword_content'].apply(jieba_cut)
tokens = combined_df['tokenized_text'].tolist()

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/rh/7g0n3djn3wj8dlnjv7plmzcr0000gn/T/jieba.cache
Loading model cost 0.144 seconds.
Prefix dict has been built succesfully.


In [13]:
# 初始化BERT模型和Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese')

# 定义解码器
class Decoder(nn.Module):
    def __init__(self, hidden_size, vocab_size):
        super(Decoder, self).__init__()
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        return self.linear(x)

# 定义数据集
class TextDataset(Dataset):
    def __init__(self, tokens, tokenizer):
        self.tokens = tokens
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        tokens = self.tokens[idx]
        encoded = self.tokenizer(tokens, is_split_into_words=True, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
        return {key: val.squeeze(0) for key, val in encoded.items()}

# 初始化BERT模型和Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese')
decoder = Decoder(hidden_size=768, vocab_size=tokenizer.vocab_size)

model.to(device)
decoder.to(device)

# 定义优化器和混合精度缩放器
optimizer = torch.optim.AdamW(list(model.parameters()) + list(decoder.parameters()), lr=5e-5)
scaler = GradScaler()

# 数据加载
train_dataset = TextDataset(tokens, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)  # 较小的批量大小

# 训练循环
accumulation_steps = 4  # 梯度累积步数
model.train()
decoder.train()
optimizer.zero_grad()
for epoch in range(5):  # 训练多个epoch
    for step, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}  # 将数据移动到设备
        with autocast():
            outputs = model(**batch)
            last_hidden_states = outputs.last_hidden_state
            reconstructed = decoder(last_hidden_states)
            loss = F.cross_entropy(reconstructed.view(-1, reconstructed.size(-1)), batch['input_ids'].view(-1))
        scaler.scale(loss).backward()

        if (step + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
        print(f"Epoch: {epoch}, Step: {step}, Loss: {loss.item()}")

    # 清理缓存
    torch.cuda.empty_cache()

# 保存模型和解码器状态字典
torch.save(model.state_dict(), '../Data/Model/bert_model_mps.pth')
torch.save(decoder.state_dict(), '../Data/Model/decoder_mps.pth')



Epoch: 0, Step: 0, Loss: 9.936542510986328
Epoch: 0, Step: 1, Loss: 10.012861251831055
Epoch: 0, Step: 2, Loss: 9.924552917480469
Epoch: 0, Step: 3, Loss: 9.912943840026855
Epoch: 0, Step: 4, Loss: 9.398380279541016
Epoch: 0, Step: 5, Loss: 8.884424209594727
Epoch: 0, Step: 6, Loss: 9.082435607910156
Epoch: 0, Step: 7, Loss: 8.94564437866211
Epoch: 0, Step: 8, Loss: 7.557916641235352
Epoch: 0, Step: 9, Loss: 8.460612297058105
Epoch: 0, Step: 10, Loss: 7.973822593688965
Epoch: 0, Step: 11, Loss: 8.108795166015625
Epoch: 0, Step: 12, Loss: 8.114551544189453
Epoch: 0, Step: 13, Loss: 6.870382785797119
Epoch: 0, Step: 14, Loss: 9.815645217895508
Epoch: 0, Step: 15, Loss: 7.4393439292907715
Epoch: 0, Step: 16, Loss: 7.413608551025391
Epoch: 0, Step: 17, Loss: 8.73604965209961
Epoch: 0, Step: 18, Loss: 5.902551651000977
Epoch: 0, Step: 19, Loss: 5.912508964538574
Epoch: 0, Step: 20, Loss: 7.631427764892578
Epoch: 0, Step: 21, Loss: 4.874484062194824
Epoch: 0, Step: 22, Loss: 6.01130437850952