## 合并数据集

In [None]:
import os
import pandas as pd

def merge_csv():
    path = 'cleaned/'
    files = os.listdir(path)
    files_csv = [f for f in files if f.endswith('.csv')]
    df = pd.DataFrame()
    for file in files_csv:
        df = pd.concat([df, pd.read_csv(path + file)], axis=0)
    df.to_csv('novel.csv',  sep='|', index=False)

merge_csv()

In [None]:
import pandas as pd
import shutil
#df = pd.read_csv('small.csv', sep=',')
df.to_csv('small.csv', sep='|',index=False) )

In [1]:
import pandas as pd

# 假设您的CSV文件名为 'translations.csv' 且位于当前目录中
file_name = 'novel.csv'

# 读取CSV文件
df = pd.read_csv(file_name, names=['en', 'cn'])  # 我们指定列名，因为CSV没有头

# 创建一个新的列，包含翻译的字典
df['translation'] = df.apply(lambda row: {'en': row['en'], 'cn': row['cn']}, axis=1)

# 选择需要写入的列
output_df = df[['translation']]

# 写入到新的CSV文件中
output_df.to_csv('formatted_novel.csv', index=True, index_label='**translation**', header=True, encoding='utf-8')

print("文件已保存为 'formatted_novel.csv'.")

文件已保存为 'formatted_novel.csv'.


In [4]:
import pandas as pd

# 示例数据框
df = pd.DataFrame({
    'en': ["Hello, world!", "I love to travel around the world.", "Today is a beautiful day.", "Can you help me with my homework?", "The river flows gently through the valley."],
    'cn': ["你好，世界！", "我喜欢环游世界。", "今天是个美丽的一天。", "你能帮我做家庭作业吗？", "河水轻轻地流过山谷。"]
})

# 使用管道符 | 作为分隔符写入 CSV 文件
df.to_csv('./eval.csv', sep='|', index=False)

## 模型和简单的训练一下

## 分词

In [None]:
import pandas as pd
import jieba
import spacy
from tqdm import tqdm
import os

file_list = ["novel.csv"]
for file in file_list:
    df = pd.read_csv(file)
    
    # 处理中文
    cn_text = ' '.join(df['cn'].astype(str))
    cn_words = jieba.cut(cn_text)
    cn_unique_words = list(set(cn_words))
    
    # 将中文唯一词汇保存到文件
    with open('word_dict/' + file.replace('.csv', '_cn_dict.txt'), 'w', encoding='utf-8') as f:
        for word in cn_unique_words:
            f.write(word + '\n')
    
    # 处理英文
    nlp = spacy.load("en_core_web_sm")
    nlp.max_length = 1000000000
    en_text = ' '.join(df['en'].astype(str))
    doc = nlp(en_text)
    
    en_unique_words = set()
    with tqdm(total=len(doc), desc="Tokenizing") as pbar:
        for token in doc:
            if not token.is_stop and token.is_alpha:
                en_unique_words.add(token.text.lower())
            pbar.update(1)
    
    # 将英文唯一词汇保存到文件
    with open('word_dict/' + file.replace('.csv', '_en_dict.txt'), 'w', encoding='utf-8') as f:
        for word in en_unique_words:
            f.write(word + '\n')

## Dataset

In [4]:
import jieba
from torchtext.data.utils import get_tokenizer

# 定义一个包装函数来使用jieba
def jieba_tokenizer(text):
    return list(jieba.cut(text))

# 现在使用这个函数来创建分词器
token_transform = {}
token_transform['en'] = get_tokenizer('spacy', language='en_core_web_sm')
token_transform['cn'] = jieba_tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
import random
random_index = [155,963,1167]
for random_data in random_index:
    print(data[random_data])

['"Flory’s house was at the top of the maidan', ' close to the edge of the jungle. From the gate the maidan sloped sharply down', ' scorched and khaki-coloured', ' with half a dozen dazzling white bungalows scattered round it. All quaked', ' shivered in the hot air. There was an English cemetery within a white wall half-way down the hill', ' and near by a tiny tin-roofed church. Beyond that was the European Club', ' and when one looked at the Club — a dumpy one-storey wooden building — one looked at the real centre of the town. In any town in India the European Club is the spiritual citadel', ' the real seat of the British power', ' the Nirvana for which native officials and millionaires pine in vain. It was doubly so in this case', ' for it was the proud boast of Kyauktada Club that', ' almost alone of Clubs in Burma', ' it had never admitted an Oriental to membership. Beyond the Club', ' the Irrawaddy flowed huge and ochreous glittering like diamonds in the patches that caught the su

In [None]:
def ensure_two_elements(row):
    """
    Ensure each row has exactly two elements, source and target.
    
    Args:
    - row: A list containing source and target sentences.
    
    Returns:
    - A tuple with the first two elements if the row has at least two elements, 
      otherwise None.
    """
    if len(row) >= 2:
        return (row[0], row[1])  # Take first two elements
    else:
        return None  # Or handle missing data as needed


# Filter data to keep only rows with two or more elements
filtered_data = [ensure_two_elements(row) for row in data if ensure_two_elements(row) is not None]

# Define text_to_indices function
def text_to_indices(text, lang, vocab):
    """
    Convert text to indices using the provided vocabulary.
    
    Args:
    - text: The input text to be converted.
    - lang: The language of the text ('en' or 'cn').
    - vocab: The vocabulary object for the language.
    
    Returns:
    - A list of indices representing the text.
    """
    indices = [vocab[token] if token in vocab else vocab['<unk>'] for token in token_transform[lang](text)]
    return indices

# Convert texts to indices
indexed_data = [(text_to_indices(src, 'en', vocab_transform['en']),
                 text_to_indices(tgt, 'cn', vocab_transform['cn'])) for src, tgt in filtered_data]


In [37]:
len(indexed_data)

11665

In [38]:
random_index = [155,963,116]
for random_data in random_index:
    print(indexed_data[random_data])

([4, 140, 23, 279, 12, 32, 6, 686, 8, 6, 2043], [4, 1140, 4, 12, 4, 6, 4, 1470, 4, 9, 4, 6, 4, 1154, 5, 4, 1061, 4, 6, 4, 1148, 4, 6, 4, 4432, 4, 6827, 4, 2127, 4, 108])
([4, 140, 19, 174, 115, 879, 5, 111, 28, 174, 879, 6, 9192, 147, 54, 10736, 992, 5, 66, 2335, 77, 9, 3113, 8, 1799, 19, 348, 166, 30, 4135, 5, 475, 627, 432, 19, 501, 13, 6, 1777, 8, 6, 1446, 5, 68, 296, 36, 41, 177, 10051], [4, 411, 4, 40, 4, 65, 4, 66, 4, 1322, 4, 3421, 4, 3236, 5, 8])
([4, 27, 102, 10, 6, 1840, 10, 6689, 50, 9, 6571, 3360, 8, 5460], [4, 10, 4, 110, 4, 306, 4, 12, 4, 965, 4, 6, 4, 1389, 4, 33, 4, 626, 4, 1181])


In [40]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# 定义一个collate函数来处理填充
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    
    # 使用clone().detach()来避免警告
    src_batch = pad_sequence([seq.clone().detach() for seq in src_batch], padding_value=PAD_IDX)
    tgt_batch = pad_sequence([seq.clone().detach() for seq in tgt_batch], padding_value=PAD_IDX)
    
    # 创建掩码
    src_mask = (src_batch != PAD_IDX).unsqueeze(1)
    tgt_mask = (tgt_batch != PAD_IDX).unsqueeze(1)
    
    # 确保目标序列的掩码形状与源序列一致
    tgt_mask = tgt_mask[:, :-1, :-1]  # 这里我们去掉最后一个<eos>标记及其对应的掩码
    
    return src_batch, tgt_batch, src_mask, tgt_mask

BATCH_SIZE = 16
train_dataset = TranslationDataset(indexed_data)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
len(train_dataset),len(train_dataloader),train_dataset[:3]

(11665,
 730,
 [([17, 3655, 3859, 8], [1193, 22, 890, 4867]),
  ([2948, 3005], [1193, 22, 890]),
  ([92, 6984, 110], [143, 8098, 22, 8108, 187])])

In [42]:
import math
import torch
import torch.nn as nn
from torch.autograd import Variable

class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model
    
    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)

# 初始化嵌入层
d_model = 512  # 模型维度
src_embed = Embeddings(d_model, len(vocab_transform['en']))
tgt_embed = Embeddings(d_model, len(vocab_transform['cn']))

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import math

# Assuming vocab_transform, token_transform, src_embed, tgt_embed, and data are already defined

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0., max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0., d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + Variable(self.pe[:, :x.size(1)], requires_grad=False)

d_model = 512  # Or whatever dimension you're using for your model
pos_encoding = PositionalEncoding(d_model)

# Define special token indices
SOS_IDX = vocab_transform['en']['<sos>']
EOS_IDX = vocab_transform['en']['<eos>']
UNK_IDX = vocab_transform['en']['<unk>']

# Convert text to indices
def text_to_indices(text, lang, vocab):
    indices = [vocab[token] if token in vocab else UNK_IDX for token in token_transform[lang](text)]
    return torch.LongTensor([SOS_IDX] + indices + [EOS_IDX])

# Process data into indexed tensors, ensuring only pairs are used
indexed_data = []
for row in data:
    if len(row) >= 2:
        src, tgt = row[:2]  # Take only the first two elements
        indexed_data.append((text_to_indices(src, 'en', vocab_transform['en']),
                             text_to_indices(tgt, 'cn', vocab_transform['cn'])))
    # Optionally, you can log or handle rows with less than 2 elements

# Process data and add positional encoding
cnt = 0
for src_indices, tgt_indices in indexed_data:
    src_tensor = src_indices.unsqueeze(0)  # Add batch dimension
    tgt_tensor = tgt_indices.unsqueeze(0)
    
    # Get embeddings
    src_embedded = src_embed(src_tensor)
    tgt_embedded = tgt_embed(tgt_tensor)
    
    # Add positional encoding
    src_embedded_with_pos = pos_encoding(src_embedded)
    tgt_embedded_with_pos = pos_encoding(tgt_embedded)
    
    cnt += 1
    if cnt < 10:
        print(f"Source sentence embedding shape: {src_embedded_with_pos.shape}")
        print(f"Target sentence embedding shape: {tgt_embedded_with_pos.shape}")

Source sentence embedding shape: torch.Size([5000, 6, 512])
Target sentence embedding shape: torch.Size([5000, 6, 512])
Source sentence embedding shape: torch.Size([5000, 4, 512])
Target sentence embedding shape: torch.Size([5000, 5, 512])
Source sentence embedding shape: torch.Size([5000, 5, 512])
Target sentence embedding shape: torch.Size([5000, 7, 512])
Source sentence embedding shape: torch.Size([5000, 3, 512])
Target sentence embedding shape: torch.Size([5000, 3, 512])
Source sentence embedding shape: torch.Size([5000, 4, 512])
Target sentence embedding shape: torch.Size([5000, 3, 512])
Source sentence embedding shape: torch.Size([5000, 4, 512])
Target sentence embedding shape: torch.Size([5000, 5, 512])
Source sentence embedding shape: torch.Size([5000, 6, 512])
Target sentence embedding shape: torch.Size([5000, 5, 512])
Source sentence embedding shape: torch.Size([5000, 6, 512])
Target sentence embedding shape: torch.Size([5000, 5, 512])
Source sentence embedding shape: torch.S

## 示例

In [None]:
# 打印词汇表大小
print(f"English vocabulary size: {len(en_word_to_idx)}")
print(f"Chinese vocabulary size: {len(cn_word_to_idx)}")

# 定义模型参数
d_model = 512  # 模型维度
max_len = 1000  # 假设最大序列长度是1000

# 初始化嵌入层和位置编码层
src_embedding = Embeddings(d_model, len(en_word_to_idx))
tgt_embedding = Embeddings(d_model, len(cn_word_to_idx))
pos_encoding = PositionalEncoding(d_model, max_len)

# 示例：将一个句子转换为嵌入向量
# 假设我们有一句话"Hello, how are you?"
src_sentence = "Hello, how are you?"
tgt_sentence = "你好，你好吗？"

# 将句子转换为索引
src_indices = [en_word_to_idx.get(token, en_word_to_idx['<unk>']) for token in token_transform['en'](src_sentence)]
tgt_indices = [cn_word_to_idx.get(token, cn_word_to_idx['<unk>']) for token in token_transform['cn'](tgt_sentence)]

# 转换为张量
src_tensor = torch.LongTensor(src_indices).unsqueeze(0)  # 添加批次维度
tgt_tensor = torch.LongTensor(tgt_indices).unsqueeze(0)

# 获取嵌入并添加位置编码
src_embedded = pos_encoding(src_embedding(src_tensor))
tgt_embedded = pos_encoding(tgt_embedding(tgt_tensor))

print("Source sentence embedding shape:", src_embedded.shape)
print("Target sentence embedding shape:", tgt_embedded.shape)

# 假设你要使用这些嵌入向量进行进一步的处理，例如传递给Transformer模型的编码器或解码器

## 训练

In [25]:
def attention(query, key, value, mask=None):
    d_k = query.size(-1)

    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)

    p_attn = F.softmax(scores, dim=-1)

    return torch.matmul(p_attn, value), p_attn

class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model):
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        query, key, value = [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
                             for l, x in zip(self.linears, (query, key, value))]
        x, self.attn = attention(query, key, value, mask=mask)
        x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)


class LayerNorm(nn.Module):
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)

        return self.a_2 * (x - mean) / torch.sqrt(std ** 2 + self.eps) + self.b_2


class SublayerConnection(nn.Module):
    def __init__(self, size):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)

    def forward(self, x, sublayer):
        return x + sublayer(self.norm(x))


def clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.w_2(F.relu(self.w_1(x)))


class Encoder(nn.Module):
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)


class EncoderLayer(nn.Module):
    def __init__(self, size, self_attn, feed_forward):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size), 2)
        self.size = size

    def forward(self, x, mask):
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)


class Decoder(nn.Module):
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)


class DecoderLayer(nn.Module):
    def __init__(self, size, self_attn, src_attn, feed_forward):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        m = memory

        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)


class Transformer(nn.Module):
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(Transformer, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)


class Generator(nn.Module):
    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)


def init_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8):
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff)
    position = PositionalEncoding(d_model)
    return Transformer(
        Encoder(EncoderLayer(d_model, c(attn), c(ff)), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff)), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

In [53]:
model = init_model(len(vocab_transform['en']), len(vocab_transform['cn']))

In [54]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)  # 忽略填充标记
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [61]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

num_epochs = 1  # 或任何你认为合适的训练轮数

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for src, tgt, src_mask, tgt_mask in train_dataloader:
        src, tgt = src.to(device), tgt.to(device)
        src_mask, tgt_mask = src_mask.to(device), tgt_mask.to(device)
        
        # 编码
        memory = model.encode(src, src_mask)
        
        # 解码
        output = model.decode(memory, src_mask, tgt[:, :-1], tgt_mask[:, :-1, :-1])
        
        # 生成
        output = model.generator(output)
        
        # 计算损失
        # 目标序列去掉第一个<sos>标记
        loss = criterion(output.view(-1, output.size(-1)), tgt[:, 1:].contiguous().view(-1))
        
        # 反向传播
        optimizer.zero_grad()
        loss.backward()
        
        # 梯度裁剪
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        
        optimizer.step()
        
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(data_loader):.4f}")

  src_batch = pad_sequence([torch.tensor(seq) for seq in src_batch], padding_value=PAD_IDX)
  tgt_batch = pad_sequence([torch.tensor(seq) for seq in tgt_batch], padding_value=PAD_IDX)


RuntimeError: The size of tensor a (660) must match the size of tensor b (661) at non-singleton dimension 0