# RNN聊天機器人 - 混合回應系統

基於PyTorch的序列到序列模型，結合智能混合回應系統。

In [None]:
# 環境設置
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import random
import re
import itertools
import numpy as np
import jieba
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
import warnings
warnings.filterwarnings('ignore')

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"使用設備: {device}")

# Seed Python's and PyTorch's random number generators with a fixed value
random.seed(42)
torch.manual_seed(42)
if device.type == "cuda":
    # Only reached when CUDA was detected above
    torch.cuda.manual_seed(42)

print("環境設置完成")

  re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
  re_skip_default = re.compile("(\r\n|\s)", re.U)
  re_skip = re.compile("([a-zA-Z0-9]+(?:\.\d+)?%?)")


使用設備: cuda
環境設置完成


In [None]:
# Install the Python dependencies (the leading "!" is an IPython shell escape,
# so this cell only runs inside a notebook)
# NOTE(review): the setup cell imports jieba/tqdm and defines `plt`; this cell
# installs those packages yet also needs `plt` — confirm the intended cell order.
!pip install jieba matplotlib tqdm

# Install a CJK font so that matplotlib can render Chinese text
!apt-get update -qq
!apt-get install -y fonts-noto-cjk -qq

# Prefer the Noto CJK font and fall back to DejaVu Sans
# NOTE(review): matplotlib may need its font cache refreshed before a font
# installed in the same session becomes visible — confirm.
plt.rcParams['font.family'] = ['Noto Sans CJK TC', 'DejaVu Sans']
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph
plt.rcParams['axes.unicode_minus'] = False

print("依賴安裝完成")

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package fonts-noto-cjk.
(Reading database ... 126666 files and directories currently installed.)
Preparing to unpack .../fonts-noto-cjk_1%3a20220127+repack1-1_all.deb ...
Unpacking fonts-noto-cjk (1:20220127+repack1-1) ...
Setting up fonts-noto-cjk (1:20220127+repack1-1) ...
Processing triggers for fontconfig (2.13.1-4.2ubuntu5) ...
依賴安裝完成


In [None]:
# Hyper-parameters and runtime settings shared by the rest of the notebook
CONFIG = dict(
    iterations=800,
    batch_size=8,
    learning_rate=0.0001,
    hidden_size=64,
    n_layers=1,
    dropout=0.1,
    print_every=50,
    clip=50.0,
    max_length=6,
    confidence_threshold=0.3,
)

# Echo the settings, one "  key: value" line per entry
print("配置完成")
print("\n".join(f"  {name}: {value}" for name, value in CONFIG.items()))

配置完成
  iterations: 800
  batch_size: 8
  learning_rate: 0.0001
  hidden_size: 64
  n_layers: 1
  dropout: 0.1
  print_every: 50
  clip: 50.0
  max_length: 6
  confidence_threshold: 0.3


In [None]:
# 數據和詞彙表
PAD_token = 0  # padding
SOS_token = 1  # start of sentence
EOS_token = 2  # end of sentence


class Voc:
    """Vocabulary that maps words to integer indexes and back.

    Indexes 0-2 are reserved for the PAD/SOS/EOS markers; every new word
    receives the next free index. ``word2count`` tracks how often each word
    was added. ``UNK_token`` stays ``None`` until ``add_unknown_token`` runs.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # the three reserved markers above
        self.UNK_token = None

    def addSentence(self, sentence):
        """Tokenise ``sentence`` and add each non-blank token to the vocabulary.

        Sentences containing at least one CJK ideograph (U+4E00-U+9FFF) are
        segmented with jieba; all others are split on whitespace.
        """
        if re.search(r'[\u4e00-\u9fff]', sentence):
            words = jieba.cut(sentence)
        else:
            words = sentence.split()
        for word in words:
            # Whitespace-only tokens are skipped
            if word.strip():
                self.addWord(word)

    def addWord(self, word):
        """Register ``word`` if it is new, otherwise increment its count."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    def add_unknown_token(self):
        """Ensure an "UNK" entry exists and record its index in ``UNK_token``.

        Calling this more than once is harmless. If "UNK" was already added as
        an ordinary word, its existing index is reused so ``UNK_token`` is
        never left as ``None`` after this call.
        """
        if "UNK" not in self.word2index:
            self.word2index["UNK"] = self.num_words
            self.index2word[self.num_words] = "UNK"
            self.num_words += 1
        self.UNK_token = self.word2index["UNK"]

# Training data: each entry is a [question, answer] pair
sample_data = [
    ["你好", "你好"],
    ["hi", "Hello"],
    ["什麼是RNN", "循環神經網絡"],
    ["LSTM是什麼", "長短期記憶"],
    ["GRU特點", "門控循環單元"],
    ["什麼是AI", "人工智能技術"],
    ["人工智能", "模擬人類智能"],
    ["機器學習", "自動學習方法"],
    ["深度學習", "多層神經網絡"],
    ["謝謝", "不客氣"],
    ["再見", "再見"]
]

def normalizeString(s):
    """Return a lower-cased, whitespace-normalised copy of ``s``.

    ``s`` is converted with ``str()``, lower-cased and trimmed. Each ``.``,
    ``!`` or ``?`` is then padded with spaces so it stands apart from the
    neighbouring text, and every run of whitespace is collapsed to a single
    space.
    """
    text = str(s)
    text = text.lower()
    text = text.strip()
    padded = re.sub(r"([.!?])", r" \1 ", text)
    collapsed = re.sub(r"\s+", r" ", padded)
    return collapsed.strip()

# Build the vocabulary and the list of normalised [question, answer] pairs
voc = Voc("chatbot")
pairs = [[normalizeString(q), normalizeString(a)] for q, a in sample_data]

# Feed every normalised sentence to the vocabulary, question before answer
for q, a in pairs:
    voc.addSentence(q)
    voc.addSentence(a)

voc.add_unknown_token()
print(f"詞彙表大小: {voc.num_words}")
print(f"訓練對數: {len(pairs)}")

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.837 seconds.
DEBUG:jieba:Loading model cost 0.837 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


詞彙表大小: 37
訓練對數: 11


In [None]:
# 簡化的模型定義
class SimpleEncoder(nn.Module):
    """Embedding layer followed by a single GRU, used to encode the input.

    Args:
        vocab_size: number of rows in the embedding table.
        hidden_size: embedding width, which is also the GRU hidden width.
    """

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=hidden_size
        )
        self.gru = nn.GRU(
            input_size=hidden_size, hidden_size=hidden_size, batch_first=True
        )

    def forward(self, input_seq):
        """Embed ``input_seq`` and run it through the GRU.

        Returns the ``(output, hidden)`` pair produced by the GRU. The GRU is
        built with ``batch_first=True``, so batched input is laid out as
        (batch, sequence).
        """
        return self.gru(self.embedding(input_seq))

class SimpleDecoder(nn.Module):
    """Embedding, single GRU and linear projection that decode one step.

    Args:
        vocab_size: number of rows in the embedding table and width of the
            output distribution.
        hidden_size: embedding width, which is also the GRU hidden width.
    """

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=hidden_size
        )
        self.gru = nn.GRU(
            input_size=hidden_size, hidden_size=hidden_size, batch_first=True
        )
        self.out = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, input_step, hidden):
        """Advance the decoder by the tokens in ``input_step``.

        Returns ``(probabilities, hidden)``. The first element has already
        been passed through softmax over dimension 2, so it holds
        probabilities rather than raw scores; a loss that applies log-softmax
        itself should not be given this tensor directly.
        """
        rnn_out, next_hidden = self.gru(self.embedding(input_step), hidden)
        scores = self.out(rnn_out)
        return F.softmax(scores, dim=2), next_hidden


print("模型定義完成")

模型定義完成


In [None]:
# 混合回應系統
class HybridSystem:
    """Rule-based fallback that complements the neural model's replies.

    ``responses`` maps a regular-expression pattern to a list of canned
    replies. Patterns are tried in insertion order and the first one that
    matches the (lower-cased) user input wins.
    """

    def __init__(self):
        # Short ASCII keywords must not fire inside longer ASCII words
        # ("hi" in "machine", "ai" in "email"). "\b" cannot be used for this
        # because CJK characters count as word characters, so "\bai\b" would
        # miss "什麼是ai". The guards below look at ASCII letters/digits only.
        lb = r'(?<![a-z0-9])'  # not preceded by an ASCII letter or digit
        rb = r'(?![a-z0-9])'   # not followed by an ASCII letter or digit
        self.responses = {
            rf'你好|{lb}hi{rb}|hello': ["您好！我是AI助手。"],
            rf'{lb}rnn|循環': ["RNN是處理序列數據的神經網絡。"],
            rf'{lb}lstm|長短期': ["LSTM解決了梯度消失問題。"],
            rf'{lb}grus?{rb}|門控': ["GRU是LSTM的簡化版本。"],
            rf'{lb}ai{rb}|人工智能|人工智慧': ["AI（人工智能）是讓機器模擬人類智能的技術，包括學習、推理、感知等能力。"],
            r'機器學習|machine learning': ["機器學習是AI的一個分支，讓機器從數據中自動學習和改進。"],
            r'深度學習|deep learning': ["深度學習使用多層神經網絡來模擬人腦的學習過程。"],
            r'謝謝|thank': ["不客氣！"],
            r'再見|bye': ["再見！"]
        }

    def get_rule_response(self, user_input):
        """Return a canned reply for ``user_input``.

        The input is lower-cased and matched against each pattern in turn; a
        random reply from the first matching entry is returned. When nothing
        matches, a generic reply is returned instead.
        """
        user_input = user_input.lower()
        for pattern, responses in self.responses.items():
            if re.search(pattern, user_input, re.IGNORECASE):
                return random.choice(responses)
        return "這是一個有趣的問題。"

    def get_response(self, user_input, rnn_response=None):
        """Choose between the neural reply and the rule-based reply.

        ``rnn_response`` is used when it is non-empty and longer than two
        characters after stripping; otherwise the rule-based reply for
        ``user_input`` is returned.
        """
        if rnn_response and len(rnn_response.strip()) > 2:
            return rnn_response
        return self.get_rule_response(user_input)


hybrid_system = HybridSystem()
print("混合系統初始化完成")

混合系統初始化完成


In [None]:
# 訓練函數
def sentence_to_indexes(voc, sentence):
    """Convert ``sentence`` into a list of vocabulary indexes ending in EOS.

    Tokenisation mirrors ``Voc.addSentence``: sentences containing a CJK
    ideograph (U+4E00-U+9FFF) are segmented with jieba, all others are split
    on whitespace, and whitespace-only tokens are dropped. Words missing from
    ``voc.word2index`` map to ``voc.UNK_token``, or to ``PAD_token`` when the
    vocabulary has no unknown-word entry.
    """
    if re.search(r'[\u4e00-\u9fff]', sentence):
        words = jieba.cut(sentence)
    else:
        words = sentence.split()

    # Compare against None explicitly so a valid index is never mistaken for "unset"
    fallback = voc.UNK_token if voc.UNK_token is not None else PAD_token

    # jieba yields the spaces between tokens as tokens of their own; the
    # vocabulary never stores them, so skip them instead of emitting UNK
    indexes = [
        voc.word2index.get(word, fallback) for word in words if word.strip()
    ]
    return indexes + [EOS_token]

def train_model():
    """Train a fresh encoder/decoder pair on ``pairs`` and return them.

    Each iteration draws one random (question, answer) pair, encodes the
    question, then decodes the answer token by token with teacher forcing
    (the ground-truth token is fed as the next decoder input). The per-token
    losses are summed and both networks are updated with Adam.

    Returns:
        The trained ``(encoder, decoder)`` tuple.
    """
    hidden_size = CONFIG['hidden_size']

    encoder = SimpleEncoder(voc.num_words, hidden_size).to(device)
    decoder = SimpleDecoder(voc.num_words, hidden_size).to(device)

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=CONFIG['learning_rate'])
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=CONFIG['learning_rate'])

    # The decoder already returns softmax probabilities. CrossEntropyLoss
    # would apply log-softmax a second time, which flattens the gradients and
    # keeps the loss near ln(vocab_size) per token. NLLLoss on the log of the
    # probabilities is the matching criterion.
    criterion = nn.NLLLoss()

    # Maximum gradient norm; clipping is skipped when the key is absent/falsy
    clip = CONFIG.get('clip')

    print("開始訓練...")

    for iteration in range(CONFIG['iterations']):
        # Draw one random training pair
        input_sentence, target_sentence = random.choice(pairs)

        # Convert both sentences to index lists (each ends with EOS)
        input_indexes = sentence_to_indexes(voc, input_sentence)
        target_indexes = sentence_to_indexes(voc, target_sentence)

        # Batch of one for the encoder; flat vector of targets for the loss
        input_tensor = torch.LongTensor([input_indexes]).to(device)
        target_tensor = torch.LongTensor(target_indexes).to(device)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        # Encode; only the final hidden state is passed to the decoder
        _, encoder_hidden = encoder(input_tensor)

        # Decode, starting from the SOS marker
        decoder_input = torch.LongTensor([[SOS_token]]).to(device)
        decoder_hidden = encoder_hidden

        loss = 0
        for i in range(len(target_indexes)):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            step_target = target_tensor[i:i+1]
            # Clamp before the log so a zero probability cannot produce -inf
            log_probs = torch.log(decoder_output.squeeze(0).clamp_min(1e-10))
            loss += criterion(log_probs, step_target)
            # Teacher forcing: feed the ground-truth token as the next input
            decoder_input = step_target.unsqueeze(0)

        # Back-propagate, clip, and update both networks
        loss.backward()
        if clip:
            nn.utils.clip_grad_norm_(encoder.parameters(), clip)
            nn.utils.clip_grad_norm_(decoder.parameters(), clip)
        encoder_optimizer.step()
        decoder_optimizer.step()

        if iteration % CONFIG['print_every'] == 0:
            print(f"迭代 {iteration}, 損失: {loss.item():.4f}")

    print("訓練完成！")
    return encoder, decoder


encoder, decoder = train_model()

開始訓練...
迭代 0, 損失: 7.2160
迭代 50, 損失: 14.4192
迭代 100, 損失: 14.4162
迭代 150, 損失: 7.1508
迭代 200, 損失: 14.2855
迭代 250, 損失: 14.0396
迭代 300, 損失: 13.6954
迭代 350, 損失: 13.5993
迭代 400, 損失: 13.6348
迭代 450, 損失: 13.6211
迭代 500, 損失: 6.2851
迭代 550, 損失: 6.3339
迭代 600, 損失: 13.5605
迭代 650, 損失: 13.5958
迭代 700, 損失: 9.9422
迭代 750, 損失: 13.4824
訓練完成！


In [None]:
# 評估和對話
def generate_response(sentence):
    """Produce a reply to ``sentence`` using the model plus the rule system.

    The sentence is normalised, encoded, and greedily decoded for at most
    ``CONFIG['max_length']`` steps or until EOS is produced. The decoded text
    is handed to ``hybrid_system.get_response``, which decides whether to use
    it or a rule-based reply. If anything in the neural path raises, the
    error is logged and the rule-based reply is returned instead.
    """
    try:
        sentence = normalizeString(sentence)
        input_indexes = sentence_to_indexes(voc, sentence)
        input_tensor = torch.LongTensor([input_indexes]).to(device)

        # Marker tokens must never appear as literal words ("PAD", "SOS",
        # "UNK") in the reply text
        hidden_tokens = {PAD_token, SOS_token}
        if voc.UNK_token is not None:
            hidden_tokens.add(voc.UNK_token)

        with torch.no_grad():
            _, encoder_hidden = encoder(input_tensor)

            decoder_input = torch.LongTensor([[SOS_token]]).to(device)
            decoder_hidden = encoder_hidden

            decoded_words = []
            for _ in range(CONFIG['max_length']):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                # Greedy decoding: keep only the most likely token
                _, topi = decoder_output.topk(1)
                token = topi.item()
                if token == EOS_token:
                    break

                # Feed the prediction back in as a (1, 1) input for the next step
                decoder_input = topi.view(1, 1)

                if token not in hidden_tokens and token in voc.index2word:
                    decoded_words.append(voc.index2word[token])

        # Chinese replies are joined without spaces, others with spaces
        has_cjk = any(re.search(r'[\u4e00-\u9fff]', w) for w in decoded_words)
        separator = "" if has_cjk else " "
        rnn_response = separator.join(decoded_words)
        return hybrid_system.get_response(sentence, rnn_response)

    except Exception as exc:
        # Deliberate best-effort: the chatbot should still answer when the
        # neural path fails, but the failure is recorded rather than hidden
        import logging
        logging.getLogger(__name__).warning(
            "neural reply failed, using rule-based fallback: %r", exc
        )
        return hybrid_system.get_rule_response(str(sentence))

# Smoke test: print the reply for a handful of known questions
test_inputs = [
    "你好",
    "什麼是RNN",
    "LSTM是什麼",
    "什麼是AI",
    "人工智能",
    "機器學習",
    "謝謝",
    "再見",
]

print("測試結果:")
for test_input in test_inputs:
    print(f"輸入: {test_input} -> 回應: {generate_response(test_input)}")

測試結果:
輸入: 你好 -> 回應: 您好！我是AI助手。
輸入: 什麼是RNN -> 回應: RNN是處理序列數據的神經網絡。
輸入: LSTM是什麼 -> 回應: LSTM解決了梯度消失問題。
輸入: 什麼是AI -> 回應: AI（人工智能）是讓機器模擬人類智能的技術，包括學習、推理、感知等能力。
輸入: 人工智能 -> 回應: AI（人工智能）是讓機器模擬人類智能的技術，包括學習、推理、感知等能力。
輸入: 機器學習 -> 回應: 機器學習是AI的一個分支，讓機器從數據中自動學習和改進。
輸入: 謝謝 -> 回應: 不客氣！
輸入: 再見 -> 回應: 再見！


In [None]:
# 互動對話
def start_chat():
    """Run an interactive console chat until the user quits.

    The loop ends when the user types ``quit``, ``exit`` or ``退出``
    (case-insensitive), presses Ctrl-C, or when standard input is closed.
    Blank lines are ignored. A failure while generating a reply prints an
    apology and the conversation continues.
    """
    print("混合回應系統聊天機器人已準備就緒！")
    print("輸入 'quit' 結束對話")
    print("=" * 30)

    while True:
        try:
            user_input = input("\n你: ").strip()
        except (KeyboardInterrupt, EOFError):
            # EOFError means stdin is exhausted; retrying would fail in the
            # same way forever, so leave the loop just like on Ctrl-C
            print("\n機器人: 再見！")
            break

        if user_input.lower() in ['quit', 'exit', '退出']:
            print("機器人: 再見！")
            break

        if not user_input:
            continue

        try:
            response = generate_response(user_input)
        except KeyboardInterrupt:
            print("\n機器人: 再見！")
            break
        except Exception:
            # Keep the session alive when a single reply fails
            print("機器人: 抱歉，出現問題。")
            continue

        print(f"機器人: {response}")

print("系統準備完成！")
# Launch the interactive session; this call blocks while it waits for input
start_chat()

系統準備完成！
混合回應系統聊天機器人已準備就緒！
輸入 'quit' 結束對話

你: 什麼是RNN
機器人: RNN是處理序列數據的神經網絡。

你: 什麼是LSTM
機器人: LSTM解決了梯度消失問題。

你: 什麼是AI
機器人: AI（人工智能）是讓機器模擬人類智能的技術，包括學習、推理、感知等能力。

你: 再見
機器人: 再見！

機器人: 再見！
