# 自然语言rnn

In [16]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


class CharRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers):
        super(CharRNN, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        self.rnn = nn.RNN(input_size, hidden_size,n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self,x,hidden):
        output, hidden = self.rnn(x,hidden)
        output = self.fc(output)
        return output, hidden
    
    def init_hidden(self, batch_size):
        return torch.zeros(self.n_layers, batch_size, self.hidden_size)

def process_data(text):
    chars = sorted(list(set(text)))
    char_to_idx = {
        ch: i for i, ch in enumerate(chars)
    }
    idx_to_char = {
        i: ch for i, ch in enumerate(chars)
    }
    return chars, char_to_idx, idx_to_char

def create_sequences(text, char_to_idx, seq_length):
    x = []
    y = []
    
    for i in range(0, len(text) - seq_length):
        sequence = text[i:i+seq_length]
        target = text[i+1:i+seq_length+1]
        
        x.append([char_to_idx[char] for char in sequence])
        y.append([char_to_idx[char] for char in target])
        
    x = torch.tensor(x)
    y = torch.tensor(y)
    return x, y

def train_model(model, data, targets, criterion, optimizer, vocab_size, batch_size=32):
    model.train()
    total_loss = 0
    
    # 修正：正确计算批次数量
    n_batches = len(data) // batch_size
    if n_batches == 0:  # 确保至少有一个批次
        n_batches = 1
        batch_size = len(data)
    
    for i in range(n_batches):
        start = i * batch_size
        end = min(start + batch_size, len(data))  # 防止越界
        batch_data = data[start:end]
        batch_targets = targets[start:end]
        
        current_batch_size = len(batch_data)  # 获取实际批次大小
        
        # 使用实际批次大小创建张量
        x = torch.zeros(current_batch_size, len(batch_data[0]), vocab_size)
        for i, sequence in enumerate(batch_data):
            for t, char_idx in enumerate(sequence):
                x[i, t, char_idx] = 1
                
        hidden = model.init_hidden(current_batch_size)
        
        output, hidden = model(x, hidden)
        
        loss = criterion(output.view(-1, vocab_size), batch_targets.view(-1))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / n_batches

def generate_text(model, initial_str, char_to_idx, idx_to_char, vocab_size, pred_length=100):
    model.eval()
    current_str = initial_str
    hidden = model.init_hidden(1)
    
    with torch.no_grad():
        for _ in range(pred_length):
            x = torch.zeros(1, 1, vocab_size)
            x[0,0,char_to_idx[current_str[-1]]] = 1
            
            output, hidden = model(x,hidden)
            
            probs = torch.softmax(output[0,-1], dim=0).detach().numpy()
            char_idx = torch.multinomial(torch.tensor(probs), num_samples=1).item()
            
            current_str += idx_to_char[char_idx]
            
    return current_str
    
if __name__ == "__main__":
    text = """
    春眠不觉晓，
    处处闻啼鸟。
    夜来风雨声，
    花落知多少。
    """
    
    chars, char_to_idx, idx_to_char = process_data(text)
    
    seq_length = 10
    
    x,y = create_sequences(text,char_to_idx=char_to_idx, seq_length=seq_length)
    
    input_size = len(chars)
    hidden_size = 128
    n_layers = 2
    
    # 创建模型
    model = CharRNN(input_size=input_size, hidden_size=hidden_size, output_size=input_size, n_layers=n_layers)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    n_epochs = 100
    vocab_size = len(chars)
    
    for epoch in range(n_epochs):
        loss = train_model(model,x,y,criterion,optimizer,vocab_size)
        if(epoch + 1)%10 == 0:
            print(f"Epoch {epoch+1}, Loss: {loss:.4f}")
            
    print("训练完成")
    
    initial_str = "春眠不觉晓"
    generated_text = generate_text(model, initial_str, char_to_idx, idx_to_char, vocab_size)
    print(f"生成的文本:\n{generated_text}")


Epoch 10, Loss: 2.5392
Epoch 20, Loss: 2.1224
Epoch 30, Loss: 1.6201
Epoch 40, Loss: 1.2897
Epoch 50, Loss: 1.0433
Epoch 60, Loss: 0.8437
Epoch 70, Loss: 0.6600
Epoch 80, Loss: 0.5161
Epoch 90, Loss: 0.4118
Epoch 100, Loss: 0.3368
训练完成
生成的文本:
春眠不觉晓，
    处眠闻啼鸟。
    夜来风雨声，
    夜来风雨声。
    花落知多鸟。   来处
声，
    夜落知觉声，
    花来风雨声，
    夜落知声，，
    花闻，
 
  夜
