## 欢迎进入 ModelWhale Notebook  

这里你可以编写代码，文档  

### 关于文件目录  


**project**：project 目录是本项目的工作空间，可以把将项目运行有关的所有文件放在这里，目录中文件的增、删、改操作都会被保留  


**input**：input 目录是数据集的挂载位置，所有挂载进项目的数据集都在这里，未挂载数据集时 input 目录被隐藏  


**temp**：temp 目录是临时磁盘空间，训练或分析过程中产生的不必要文件可以存放在这里，目录中的文件不会保存  


In [9]:
# 试试这个经典示例
print ("hello ModelWhale")

hello ModelWhale


In [10]:
# 查看个人持久化工作区文件
!ls /home/mw/project/

cn-eng-train.txt	    translated_test1.txt       translation_LSTM.txt
cn-eng.txt		    translated_test_BIGRU.txt  validation_set
cn-eng-val.txt		    translated_testGRU.txt
formatted_translations.txt  translated_test.txt


In [11]:
# 查看当前挂载的数据集目录
!ls /home/mw/input/

cna8958


Reading lines...
Read 90000 sentence pairs
Trimmed to 68898 sentence pairs
Indexing words...
['這是個很重要的會議 ', 'this is a very important meeting .']


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x1500 and 2000x10)

In [12]:
import unicodedata
import string
import re
import random
import time
import math

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

USE_CUDA = True

SOS_token = 0
EOS_token = 1



UNK_token = 2  # Index for the unknown word placeholder






In [13]:
import jieba

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {"UNK": UNK_token}
        self.word2count = {"UNK": 0}
        self.index2word = {0: "SOS", 1: "EOS", UNK_token: "UNK"}
        self.n_words = 3  # Count SOS, EOS, and UNK

    def index_words(self, sentence):
        if self.name == 'cn':
            for word in jieba.cut(sentence):
                self.index_word(word)
        else:
            for word in sentence.split(' '):
                self.index_word(word)
    
        # 将单词加入词典并更新索引
    def index_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1



# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z\u4e00-\u9fa5.!?，。？]+", r" ", s)
    return s


def read_langs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('%s-%s.txt' % (lang1, lang2)).read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

MAX_LENGTH = 10

def filter_pair(p):
    return len(p[1].split(' ')) < MAX_LENGTH

def filter_pairs(pairs):
    return [pair for pair in pairs if filter_pair(pair)]


def prepare_data(lang1_name, lang2_name, reverse=False):
    input_lang, output_lang, pairs = read_langs(lang1_name, lang2_name, reverse)
    print("Read %s sentence pairs" % len(pairs))

    pairs = filter_pairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))

    print("Indexing words...")
    for pair in pairs:
        input_lang.index_words(pair[0])
        output_lang.index_words(pair[1])

    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepare_data('cn', 'eng', False)

# Print an example pair
print(random.choice(pairs))

# Return a list of indexes, one for each word in the sentence
def indexes_from_sentence(lang, sentence):
    if lang.name == 'cn':
        return [lang.word2index.get(word, UNK_token) for word in sentence]
    else:
        return [lang.word2index.get(word, UNK_token) for word in sentence.split(' ')]
        
def variable_from_sentence(lang, sentence):
    indexes = indexes_from_sentence(lang, sentence)
    indexes.append(EOS_token)
    var = torch.LongTensor(indexes).view(-1, 1)
    if USE_CUDA: var = var.cuda()
    return var

def variables_from_pair(pair):
    input_variable = variable_from_sentence(input_lang, pair[0])
    target_variable = variable_from_sentence(output_lang, pair[1])
    return (input_variable, target_variable)



Reading lines...
Read 90000 sentence pairs
Trimmed to 68898 sentence pairs
Indexing words...
['他不喝咖啡。', 'he doesn t drink coffee .']


In [14]:
class BioEncoderLSTM(nn.Module):
    """
    双向LSTM编码器

    Args:
        input_size (int): 输入数据的大小
        hidden_size (int): 隐藏层大小
        n_layers (int): LSTM层数，默认为1
    """

    def __init__(self, input_size, hidden_size, n_layers=1):
        super(BioEncoderLSTM, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, hidden_size)
        # 设置双向LSTM
        self.lstm = nn.LSTM(hidden_size, hidden_size, n_layers, bidirectional=True)

    def forward(self, word_inputs, hidden):
        seq_len = len(word_inputs)
        # 将输入的词嵌入表示
        embedded = self.embedding(word_inputs).view(seq_len, 1, -1)
        # 使用LSTM处理嵌入表示
        output, hidden = self.lstm(embedded, hidden)
        return output, hidden

    def init_hidden(self):
        # num_directions 设置为2
        num_directions = 2
        hidden = (torch.zeros(self.n_layers * num_directions, 1, self.hidden_size),
                  torch.zeros(self.n_layers * num_directions, 1, self.hidden_size))
        if USE_CUDA:
            hidden = (hidden[0].cuda(), hidden[1].cuda())
        return hidden





In [15]:
class DecoderLSTM(nn.Module):
    """
    LSTM解码器

    Args:
        hidden_size (int): 隐藏层大小
        output_size (int): 输出数据的大小
        n_layers (int): LSTM层数，默认为1
        dropout_p (float): dropout概率，默认为0.1
    """

    def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1):
        super(DecoderLSTM, self).__init__()

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p

        self.embedding = nn.Embedding(output_size, self.hidden_size)
        self.rnn = nn.LSTM(self.hidden_size, self.hidden_size, n_layers, dropout=dropout_p)
        self.out = nn.Linear(self.hidden_size, output_size)

    def forward(self, word_input, last_hidden):
        # 将输入的词嵌入表示
        word_embedded = self.embedding(word_input).view(1, 1, -1)
        # 使用LSTM处理嵌入表示
        rnn_output, hidden = self.rnn(word_embedded, last_hidden)

        rnn_output = rnn_output.squeeze(0)
        # 输出通过线性层并进行log softmax
        output = F.log_softmax(self.out(rnn_output), dim=1)

        return output, hidden

In [16]:




teacher_forcing_ratio = 0.5
clip = 5.0


def train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,
          max_length=MAX_LENGTH):
    # Zero gradients of both optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0  # Added onto for each word

    # Get size of input and target sentences
    input_length = input_variable.size()[0]
    target_length = target_variable.size()[0]

    # Run words through encoder
    encoder_hidden = encoder.init_hidden()
    encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)

    # Prepare input and output variables
    decoder_input = torch.LongTensor([[SOS_token]])
    decoder_hidden = encoder_hidden  # Use last hidden state from encoder to start decoder
    if USE_CUDA:
        decoder_input = decoder_input.cuda()

    # Choose whether to use teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio
    if use_teacher_forcing:

        # Teacher forcing: Use the ground-truth target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_variable[di])
            decoder_input = target_variable[di]  # Next target is next input

    else:
        # Without teacher forcing: use network's own prediction as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_variable[di])

            # Get most likely word index (highest value) from output
            topv, topi = decoder_output.data.topk(1)
            ni = topi[0][0]

            decoder_input = torch.LongTensor([[ni]])  # Chosen word is next input
            if USE_CUDA: decoder_input = decoder_input.cuda()

            # Stop at end of sentence (not necessary when using known targets)
            if ni == EOS_token: break

    # Backpropagation
    loss.backward()
    torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
    torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length





def as_minutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def time_since(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (as_minutes(s), as_minutes(rs))


hidden_size = 500
n_layers = 2
dropout_p = 0.05


# Initialize models
encoder = BioEncoderLSTM(input_lang.n_words, hidden_size, n_layers)
decoder = DecoderLSTM(hidden_size, output_lang.n_words, n_layers*2, dropout_p=dropout_p)


if USE_CUDA:
    encoder.cuda()
    decoder.cuda()

# Initialize optimizers and criterion
learning_rate = 0.0001
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()


# Configuring training
n_epochs = 100000
plot_every = 200
print_every = 1000

# Keep track of time elapsed and running averages
start = time.time()
plot_losses = []
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every


# Begin!
for epoch in range(1, n_epochs + 1):

    # Get training data for this cycle
    training_pair = variables_from_pair(random.choice(pairs))
    input_variable = training_pair[0]
    target_variable = training_pair[1]

    # Run the train function
    loss = train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)

    # Keep track of loss
    print_loss_total += loss
    plot_loss_total += loss

    if epoch == 0: continue

    if epoch % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print_summary = '%s (%d %d%%) %.4f' % (
        time_since(start, epoch / n_epochs), epoch, epoch / n_epochs * 100, print_loss_avg)
        print(print_summary)

    if epoch % plot_every == 0:
        plot_loss_avg = plot_loss_total / plot_every
        plot_losses.append(plot_loss_avg)
        plot_loss_total = 0



import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
%matplotlib inline

def show_plot(points):
    plt.figure()
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.2) # put ticks at regular intervals
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

show_plot(plot_losses)

def evaluate(sentence, max_length=MAX_LENGTH):
    input_variable = variable_from_sentence(input_lang, sentence)
    input_length = input_variable.size()[0]

    # Run through encoder
    encoder_hidden = encoder.init_hidden()
    encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)

    # Create starting vectors for decoder
    decoder_input = torch.LongTensor([[SOS_token]])  # SOS
    if USE_CUDA:
        decoder_input = decoder_input.cuda()

    decoder_hidden = encoder_hidden

    decoded_words = []
    decoder_attentions = torch.zeros(max_length, max_length)

    # Run through decoder
    for di in range(max_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        # Choose top word from output
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        if ni == EOS_token:
            decoded_words.append('<EOS>')
            break
        else:
            decoded_words.append(output_lang.index2word[ni.item()])

        # Next input is chosen word
        decoder_input = torch.LongTensor([[ni]])
        if USE_CUDA: decoder_input = decoder_input.cuda()

    return decoded_words



def evaluate_randomly():
    pair = random.choice(pairs)

    output_words = evaluate(pair[0])
    output_sentence = ' '.join(output_words)

    print('>', pair[0])
    print('=', pair[1])
    print('<', output_sentence)
    print('')

evaluate_randomly()




  torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
  torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)


0m 34s (- 56m 33s) (1000 1%) 5.3144
1m 9s (- 56m 36s) (2000 2%) 4.9474
1m 44s (- 56m 24s) (3000 3%) 4.7195
2m 20s (- 56m 3s) (4000 4%) 4.6429
2m 55s (- 55m 34s) (5000 5%) 4.5332
3m 30s (- 55m 3s) (6000 6%) 4.4511
4m 6s (- 54m 31s) (7000 7%) 4.4353
4m 41s (- 54m 1s) (8000 8%) 4.2858
5m 17s (- 53m 30s) (9000 9%) 4.2991
5m 52s (- 52m 55s) (10000 10%) 4.1819
6m 28s (- 52m 23s) (11000 11%) 4.1250
7m 3s (- 51m 47s) (12000 12%) 4.1247
7m 39s (- 51m 12s) (13000 13%) 4.0659
8m 14s (- 50m 38s) (14000 14%) 4.0445
8m 50s (- 50m 3s) (15000 15%) 3.9777
9m 25s (- 49m 28s) (16000 16%) 3.9517
10m 0s (- 48m 53s) (17000 17%) 3.9625
10m 35s (- 48m 17s) (18000 18%) 3.9045
11m 11s (- 47m 42s) (19000 19%) 3.8509
11m 46s (- 47m 6s) (20000 20%) 3.8515
12m 22s (- 46m 31s) (21000 21%) 3.7655
12m 57s (- 45m 57s) (22000 22%) 3.8324
13m 33s (- 45m 22s) (23000 23%) 3.7790
14m 8s (- 44m 46s) (24000 24%) 3.7115
14m 43s (- 44m 11s) (25000 25%) 3.7121
15m 18s (- 43m 35s) (26000 26%) 3.6285
15m 54s (- 43m 0s) (27000 27%)

<Figure size 640x480 with 0 Axes>

In [17]:
def sample_test_dataset(size=100):
    with open('cn-eng-test.txt', 'w+') as f:
        f.write('\n'.join(['\t'.join(pair) for pair in random.sample(pairs, k=size)]))

sample_test_dataset()


In [20]:
import collections
from torchtext.data.metrics import bleu_score


# 读取测试数据集
with open('/home/mw/project/cn-eng-test.txt') as f:
    lines = f.read().strip().split('\n')
    
    test_pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]



test_pairs_dict = collections.defaultdict(lambda : [])

for pair in test_pairs:
    test_pairs_dict[pair[0]].append(pair[1].split(' '))


def evaluate_bleu_score():
    candicates = []
    references = []

    for i, pair in enumerate(test_pairs_dict.items(), start=1):
        candicate = evaluate(pair[0])
        if candicate[-1] == '<EOS>':
            candicate.pop(-1)
        candicates.append(candicate)
        references.append(pair[1])
    
    score = bleu_score(candicates, references)
    return score

print('test dataset bleu score: %s' % evaluate_bleu_score())


test dataset bleu score: 0.15587388613826067


In [23]:
def translate_test_file(test_file_path, output_file_path):
    # 读取测试文件
    with open(test_file_path, 'r', encoding='utf-8') as test_file:
        lines = test_file.readlines()
    
    # 准备输出文件
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        # 对每一行（句子）进行翻译
        for line in lines:
            line = line.strip()  # 去除可能的前后空格
            if not line:  # 跳过空行
                continue
            # 使用模型进行翻译
            output_words = evaluate(normalize_string(line))
            output_sentence = ' '.join(output_words[:-1])  # 去除<EOS>标记
            # 写入原句和翻译结果
            output_file.write(f'{line}\n{output_sentence}\n\n')
            
# 调用函数，传入测试文件路径和输出文件路径
translate_test_file('/home/mw/input/cna8958/test.txt', '/home/mw/project/translated_test_LSTM.txt')