In [1]:
import pandas as pd
import jieba
import re
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
import gensim
import numpy as np
import pickle

# set device to gpu
device = (
"cuda"
if torch.cuda.is_available()
else "mps"
if torch.backends.mps.is_available()
else "cpu"
)
device

  from .autonotebook import tqdm as notebook_tqdm


'mps'

In [2]:
def full_to_half(s):
    # 將字符串 s 中的全形字符和標點符號轉換為半形。
    n = []
    for char in s:
        code = ord(char)
        if code == 0x3000:  # 全形空格直接轉換
            code = 32
        elif 0xFF01 <= code <= 0xFF5E:  # 全形字符（除空格）轉換成半形
            code -= 0xFEE0
        n.append(chr(code))
    return ''.join(n)

In [3]:
with open('../Data/stopwords_tc.txt', encoding='utf-8', mode='r') as f:
    stop_words = []
    for l in f:
        stop_words.append(l.strip())

In [4]:
breeze_food_post_df = pd.read_csv('../Data/LocalLLM/breeze_generated_articles.csv')
breeze_food_post_df

Unnamed: 0,title,content,prompt_system,prompt_user
0,[食記] 台南 來一片。一周只賣三天的美味披蕯店,【食記】台南 來一片。一周只賣三天的美味披薩店\n\n各位大德大家好，今天要來跟大家分享一家...,你是美食熱愛者,寫一篇「[食記] 台南 來一片。一周只賣三天的美味披蕯店」為標題的文章，風格：PTT發文風格...
1,[食記] 台北 辰壽司割烹~大閘蟹宴,【食記】台北 辰壽司割烹~大閘蟹宴\n\n在美食論壇發表食記，先提供餐廳名稱、消費時間(20...,你是美食部落客,在美食論壇發表食記，先提供餐廳名稱、消費時間(2022/11以前)、地址、電話、營業時間，引...
2,[食記] 新北 金大鋤壽喜燒烤鍋物新莊店下午茶,【食記】新北 金大鋤壽喜燒鍋物新莊店下午茶\n\n🌟🌟🌟🌟🌟（5/5）\n\n各位大大好，今...,你是美食評論家,寫一篇「[食記] 新北 金大鋤壽喜燒烤鍋物新莊店下午茶」為標題的文章，風格：PTT發文風格，...
3,[食記] 新北林口｜林口三井的點點心,【食記】新北林口｜林口三井的點點心\n\n消費時間：2018年3月\n地址：新北市林口區文化...,你是美食評論家,發表食記，先提供餐廳名稱、消費時間、地址、電話、營業時間，消費時間隨機選2022年以前的月份...
4,[食記] 台北松山 夯・魯肉飯 原本的五分埔魯肉飯,【食記】台北松山 夯・魯肉飯 原本的五分埔魯肉飯\n\n在美食論壇發表食記，先提供餐廳名稱、...,你是美食熱愛者,在美食論壇發表食記，先提供餐廳名稱、消費時間(2022/11以前)、地址、電話、營業時間，引...
...,...,...,...,...
1002,[食記] 台北 中山區 欣葉台菜創始店,【食記】台北 中山區 欣葉台菜創始店\n\n各位大德大家好，今天要來跟大家分享一下我最近去的...,你是美食熱愛者,寫一篇「[食記] 台北 中山區 欣葉台菜創始店」為標題的文章，風格：PTT發文風格，要有餐廳...
1003,[食記] 台北信義-寓所咖啡Utroo cafe-咖啡廳,【食記】台北信義 - 寓所咖啡 Utroo Cafe - 咖啡廳\n\n消費時間：2018年...,你是美食熱愛者,發表食記，先提供餐廳名稱、消費時間、地址、電話、營業時間，消費時間隨機選2022年以前的月份...
1004,[食記] 屏東潮州-牛大福。屏東牛肉料理,【食記】屏東潮州-牛大福。屏東牛肉料理\n\n今天要跟大家分享的是位於屏東潮州的「牛大福」，...,你是美食部落客,以PTT美食版發文風格，寫一篇「[食記] 屏東潮州-牛大福。屏東牛肉料理」為標題的文章，要包...
1005,[食記] 個人評比 肉次方VS夯下去,【食記】個人評比 肉次方VS夯下去\n\n消費時間：2018年3月\n地址：台北市萬華區中華...,你是美食熱愛者,發表食記，先提供餐廳名稱、消費時間、地址、電話、營業時間，消費時間隨機選2022年以前的月份...


### Stopwords Removal  

In [5]:
def remove_urls_and_phones(text):
    """
    移除文字中的網址和電話號碼。
    """
    # 正則表達式匹配網址
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    text = re.sub(url_pattern, '', text)

    # 正則表達式匹配電話號碼（適用於多種常見格式）
    phone_pattern = r'(\d{2,4}[-.\s]??\d{3,4}[-.\s]??\d{3,4}|\(\d{2,4}\)\s*\d{3,4}[-.\s]??\d{3,4}|\d{10,11})'
    text = re.sub(phone_pattern, '', text)

    return text

In [6]:
def remove_english(text):
    """
    移除文字中的所有英文字符。
    """
    # 正則表達式匹配所有英文字母和英文單詞
    pattern = r'[A-Za-z]+'
    cleaned_text = re.sub(pattern, '', text)

    return cleaned_text

In [7]:
def remove_emojis(text):
    emoji_pattern = re.compile(
        "[" 
        "\U0001F600-\U0001F64F"  # 表情符號
        "\U0001F300-\U0001F5FF"  # 符號和圖案
        "\U0001F680-\U0001F6FF"  # 交通和符號
        "\U0001F700-\U0001F77F"  # 藝術符號
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r'', text)

In [8]:
def load_dictionary(file_path):
    with open(file_path, encoding='utf-8', mode='r') as f:
        dictionary = [l.strip() for l in f]
    return dictionary

In [9]:
def remove_stopwords(text, stopwords):
    # 使用列表推導式過濾掉停用詞
    filtered_words = remove_urls_and_phones(text)
    filtered_words = remove_english(filtered_words)
    filtered_words = remove_emojis(filtered_words)
    filtered_words = "".join(c for c in filtered_words if c not in ('；','，','。','！','：','「','」','…','、','？','【','】','.',':','?',';','!','~','`','+','-','<','>','/','[',']','{','}',"'",'"','\\', ' ', '‧','・','◢','◤','\n','★','☆','◆','◇','◎','○','●','◐','◑','▲','▼','△','▽','◢','◣','◥','◤','▷','◁','▶','◀','♠','♣','♥','♦','♨','⊙','⊕','▨','▧','▦','▥','▤','▣','▢','□','■'))
    filtered_words = "".join(word for word in filtered_words if word not in stopwords)
    filtered_words = filtered_words.replace(u'\u3000', u' ')
    # 將過濾後的單詞列表重新組合成字符串
    return filtered_words

### Test

In [10]:
stop_words = load_dictionary('../Data/stopwords_tc.txt')
breeze_remove_stopword_contents = []
for c in breeze_food_post_df['content']:
    breeze_remove_stopword_contents.append(remove_stopwords(c, stop_words))

# 添加標記並合併 DataFrame
breeze_food_post_df = pd.DataFrame(breeze_remove_stopword_contents, columns=['remove_stopword_content'])
breeze_food_post_df['source'] = 'breeze'
breeze_food_post_df

Unnamed: 0,remove_stopword_content,source
0,食記台南片周賣天美味披薩店位德家天家分享家位台南美食天堂片家店周周開天想吃注意時間店家資訊店...,breeze
1,食記台北辰壽司割烹閘蟹宴美食壇發表食記先提供餐廳名稱消費時間前址電話營業時間引言招呼量詳細介...,breeze
2,食記新北金鋤壽喜燒鍋物新莊店午茶位天分享新北新莊金鋤壽喜燒鍋物新莊店吃午茶心間壽喜燒次嘗試家...,breeze
3,食記新北林口林口井點點心消費時間年月址新北市林口區文化路段號林口井電話營業時間週週日林口井點...,breeze
4,食記台北松山夯魯肉飯原分埔魯肉飯美食壇發表食記先提供餐廳名稱消費時間前址電話營業時間引言招呼...,breeze
...,...,...
1002,食記台北中山區欣葉台菜創始店位德家天家分享家位台北市中山區欣葉台菜創始店果喜歡吃道台灣料理錯...,breeze
1003,食記台北信義寓咖啡咖啡廳消費時間年月址台北市信義區忠孝東路段號樓電話營業時間週週日午晚天朋友...,breeze
1004,食記屏東潮州牛福屏東牛肉料理天家分享位屏東潮州牛福家店牛肉料理聞名次食指動店家資訊餐廳名稱牛...,breeze
1005,食記評肉次夯消費時間年月址台北市萬華區中華路段巷號肉次台北市安區信義路段號夯電話肉次夯營業時...,breeze


### Vectorlize

In [11]:
remove_stopword_contents = breeze_food_post_df['remove_stopword_content']
remove_stopword_contents_list = remove_stopword_contents.to_list()
remove_stopword_contents_list

['食記台南片周賣天美味披薩店位德家天家分享家位台南美食天堂片家店周周開天想吃注意時間店家資訊店名片址台南市東區中華東路段號電話營業時間週期公休建議先電聯價位約造訪年月首先片位置東區中華東路段附少停車場停店外裝潢義利風情進門充期入座服務生馬送菜裡披薩類超義利麵燉飯主食點份海鮮總匯披薩份牛肉肉醬燉飯海鮮總匯披薩桌鮮蝦干貝魷魚花枝超澎湃餅皮薄脆料口吃海味牛肉肉醬燉飯遑燉飯粒粒分明牛肉軟嫩入味搭配濃郁醬真太幸福飲料部分點杯檸檬愛玉酸酸甜甜解膩片甜點提拉米蘇咖啡香十足配綿密鮮奶油完美收尾整體說片披薩燉飯非美味份量足值高次台南訪記注意營業時間',
 '食記台北辰壽司割烹閘蟹宴美食壇發表食記先提供餐廳名稱消費時間前址電話營業時間引言招呼量詳細介紹道菜價格食感想標題食記台北辰壽司割烹閘蟹宴消費時間年月餐廳名稱辰壽司割烹址台北市安區信義路段號電話營業時間引言辰壽司割烹家提供閘蟹宴日料理店位台北市安區信義路次年月造訪品嚐季節限閘蟹饗宴前菜刺身拼盤夾鮭魚鮪魚旗魚新鮮刺身搭配醬油山葵味道鮮美茶碗蒸柔軟蒸蛋中加入鮮蝦干貝口感豐富海茶泡飯蝦仁香菇雞蛋食材製成茶泡飯滋味濃郁主菜閘蟹隻肉質鮮美彈牙閘蟹搭配醋飲風味佳松葉蟹肉卷松葉蟹肉蔬菜包裹壽司捲口感清爽蟹肉玉子燒柔軟蛋皮中包鮮美蟹肉滋味豐富蟹肉炒烏龍麵蟹肉蔬菜烏龍麵拌炒成味道濃郁蟹肉炒飯粒粒分明炒飯中加入鮮甜蟹肉風味特蟹肉味噌湯閘蟹殼熬製味噌湯滋味醇厚甜點抹茶紅豆麻糬軟糯麻糬搭配香滑抹茶醬香甜紅豆泥味道諧水果盤季新鮮水果清甜爽口總結辰壽司割烹閘蟹宴鮮美食材精湛廚藝特色度難忘餐體驗閘蟹肉質鮮美彈牙搭配醋飲風味佳配菜刺身茶碗蒸海茶泡飯表現俗甜點抹茶紅豆麻糬水果盤餐畫圓句號推薦指數',
 '食記新北金鋤壽喜燒鍋物新莊店午茶位天分享新北新莊金鋤壽喜燒鍋物新莊店吃午茶心間壽喜燒次嘗試家參考址新北市新莊區新豐街號樓電話⏰營業時間價位午茶時段約記加成服務費造訪年月首先金鋤壽喜燒鍋物新莊店位置新莊新豐街樓附停車場停裝潢走日式風格氣氛溫馨次品嚐午茶時段提供壽喜燒套餐兩式選點式沙拉壽喜燒肉飯味噌湯甜點飲料首先沙拉生菜新鮮醬汁味道錯壽喜燒肉部分肉片少肉質軟嫩沾醬搭飯現煮粒粒分明搭配壽喜燒肉吃飯味噌湯喝清爽料甜點抹茶奶酪口感滑茶味濃郁飲料選綠茶解膩回甘整體說金鋤壽喜燒鍋物新莊店壽喜燒午茶表現錯食材新鮮味道符合胃口次機想試試晚餐附菜家參考餐心希家幫助',
 '食記新北林

In [12]:
def jieba_cut(text):
    stop_words = {'食記', '網誌'}
    words = jieba.cut(text)
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_vectorlize(posts):

    # 將每篇文章進行切詞
    cut_posts = [jieba_cut(post) for post in posts]

    with open('../Data/Model/tfidf_vectorizer.pkl', 'rb') as f:
        loaded_vectorizer = pickle.load(f)

    # 將切詞後的文章內容轉換為 TF-IDF 矩陣
    tfidf_matrix = loaded_vectorizer.transform(cut_posts)

    # 獲取特徵名稱（詞彙）
    feature_names = loaded_vectorizer.get_feature_names_out()

    # 將 TF-IDF 矩陣轉換為 DataFrame
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=range(len(posts)), columns=feature_names)

    return tfidf_df

In [14]:
# 提取文章內容並進行 TF-IDF 向量化
tfidf_df = tfidf_vectorlize(breeze_food_post_df['remove_stopword_content'])

# 將標記欄位添加回 TF-IDF DataFrame 中
tfidf_df['source'] = breeze_food_post_df['source']
tfidf_df.to_csv('../Data/breeze_tfidf_vector.csv')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/rh/7g0n3djn3wj8dlnjv7plmzcr0000gn/T/jieba.cache
Loading model cost 0.150 seconds.
Prefix dict has been built succesfully.


In [15]:
def word2vec_vectorlize(posts):
    model = gensim.models.KeyedVectors.load_word2vec_format('../IgnoreFile/tmunlp_1.6B_WB_50dim_2020v1.bin.gz', 
                                                        unicode_errors='ignore', 
                                                        binary=True)
    
    # 將每篇文章進行切詞
    result = []
    cut_posts = [jieba_cut(post) for post in posts]
    for cut_post in cut_posts:
        words = cut_post.split()
        vector_size = model.vector_size
        sentence_vector = np.zeros(vector_size)
    
        count = 0
        for word in words:
            if word in model:
                sentence_vector += model[word]
                count += 1
        if count > 0:
            sentence_vector /= count
        result.append(sentence_vector)
    
    return result

In [16]:
word2vec_vecotrs = word2vec_vectorlize(breeze_food_post_df['remove_stopword_content'])
word2vec_df = pd.DataFrame(word2vec_vecotrs, columns=[f'v{i}' for i in range(len(word2vec_vecotrs[0]))])
word2vec_df['source'] = breeze_food_post_df['source']
word2vec_df.to_csv('../Data/breeze_word2vec_vector.csv')

In [17]:
# 加载词向量
word_embeddings = np.load('word_embeddings.npy')

# 构建词汇表
def build_vocab(posts):
    sentences = [jieba_cut(post) for post in posts]
    vocab = set(word for sentence in sentences for word in sentence.split())
    word_to_idx = {word: i for i, word in enumerate(vocab)}
    idx_to_word = {i: word for word, i in word_to_idx.items()}
    return word_to_idx, idx_to_word

# 获取词向量的函数
def get_word_vector(word, word_to_idx, word_embeddings):
    idx = word_to_idx.get(word)
    if idx is not None:
        return word_embeddings[idx]
    else:
        return np.zeros(word_embeddings.shape[1])  # 返回全零向量

# 修改word2vec_vectorlize函数
def word2vec_vectorlize(posts, word_to_idx, word_embeddings):
    # 将每篇文章进行切词
    result = []
    cut_posts = [jieba_cut(post) for post in posts]
    for cut_post in cut_posts:
        words = cut_post.split()
        vector_size = word_embeddings.shape[1]
        sentence_vector = np.zeros(vector_size)
    
        count = 0
        for word in words:
            word_vector = get_word_vector(word, word_to_idx, word_embeddings)
            if np.any(word_vector):  # 检查是否为全零向量
                sentence_vector += word_vector
                count += 1
        if count > 0:
            sentence_vector /= count
        result.append(sentence_vector)
    
    return result

In [18]:
# 构建词汇表
word_to_idx, idx_to_word = build_vocab(breeze_food_post_df['remove_stopword_content'])

# 生成向量
vectors = word2vec_vectorlize(breeze_food_post_df['remove_stopword_content'], word_to_idx, word_embeddings)

# 创建新的DataFrame
word2vec_df = pd.DataFrame(vectors, columns=[f'v{i}' for i in range(len(vectors[0]))])
word2vec_df['source'] = breeze_food_post_df['source']
word2vec_df.to_csv('../Data/breeze_word2vec_vector_selftrain.csv')

In [19]:
def bert_vectorlize(posts):
    # 初始化 BERT 分詞器和模型
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    model = BertModel.from_pretrained('bert-base-chinese')

    # BERT to gpu
    model.to(device)

    # 將每篇文章進行切詞
    result = []
    cut_posts = [jieba_cut(post) for post in posts]

    for cut_post in cut_posts:
        inputs = tokenizer(cut_post, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        sentence_embedding = outputs.last_hidden_state.mean(dim=1)
        result.append(sentence_embedding)
    return result

In [20]:
bert_vectors = bert_vectorlize(breeze_food_post_df['remove_stopword_content'])
# Create a DataFrame
bert_df = pd.DataFrame(torch.cat(bert_vectors).cpu().numpy(), columns=[f'v{i}' for i in range(len(bert_vectors[0][0]))])
bert_df['source'] = breeze_food_post_df['source']

# Save to CSV
bert_df.to_csv('../Data/breeze_bert_vector.csv', index=False)



In [21]:
class Decoder(nn.Module):
    def __init__(self, hidden_size, vocab_size):
        super(Decoder, self).__init__()
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        return self.linear(x)

In [22]:
# 使用 Jieba 進行切詞
def jieba_cut(text):
    stop_words = {'食記', '網誌'}
    words = jieba.lcut(text)
    filtered_words = [word for word in words if word not in stop_words]
    return filtered_words

# 初始化BERT模型和Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese')
decoder = Decoder(hidden_size=768, vocab_size=tokenizer.vocab_size)

# 加载模型和解码器状态字典
model.load_state_dict(torch.load('../Data/Model/bert_model_mps.pth'))
decoder.load_state_dict(torch.load('../Data/Model/decoder_mps.pth'))
model.eval()  # 模型设置为评估模式
decoder.eval()

model.to(device)
decoder.to(device)

# 定义将分词后的文本转换为向量的函数
def tokens_to_vector(posts):
    results = []
    cut_posts = [jieba_cut(post) for post in posts]
    for cut_post in cut_posts:
        inputs = tokenizer(cut_post, is_split_into_words=True, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}  # 将数据移动到GPU
        with torch.no_grad():
            outputs = model(**inputs)
            # 提取[CLS]标记的向量
            sentence_vector = outputs.last_hidden_state[:, 0, :]  # 使用最后一层隐藏状态的[CLS]标记向量
        results.append(sentence_vector)
    return results



In [23]:
# 获取BERT向量
bert_vectors = tokens_to_vector(breeze_food_post_df['remove_stopword_content'])
# 将BERT向量转换为DataFrame
bert_vectors_cat = torch.cat(bert_vectors, dim=0)  # 拼接张量列表
bert_df = pd.DataFrame(bert_vectors_cat.cpu().numpy(), columns=[f'v{i}' for i in range(bert_vectors_cat.shape[1])])
bert_df['source'] = breeze_food_post_df['source']
# Save to CSV
bert_df.to_csv('../Data/breeze_bert_vector_self_train.csv', index=False)