In [2]:
import requests
import zipfile

url = 'https://www.manythings.org/anki/fra-eng.zip'
r = requests.get(url)

with open('fra-eng.zip', 'wb') as f:
    f.write(r.content)

# with zipfile.ZipFile('fra-eng.zip', 'r') as zip_ref:
#     zip_ref.extractall('.')


BadZipFile: ignored

In [17]:
!pip install -q torchtext spacy==3
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.9/107.9 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.8/82.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25h  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for spacy [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for spacy (pyproject.toml) .

In [4]:
!zip /content/fra-eng.zip


zip error: Zip file structure invalid (/content/fra-eng.zip)


In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import re

# 读入数据集
data_path = "./fra.txt"
with open(data_path, "r", encoding="utf-8") as f:
    lines = f.read().split('\n')

# 清洗数据
def preprocess_sentence(sentence):
    """
    Preprocesses a sentence by converting to lowercase, removing punctuation and non-alphabetic characters,
    and standardizing whitespace.
    """
    sentence = sentence.lower().strip()
    sentence = re.sub(r"[^a-zA-ZÀ-ÿ\s']", "", sentence)
    sentence = re.sub(r"\s+", " ", sentence)
    sentence = sentence.strip()
    return sentence

# 划分训练和测试集
def split_train_test(pair_list, test_size=0.1):
    """
    Splits a list of sentence pairs into train and test sets.
    """
    data = pd.DataFrame(pair_list, columns=["eng", "fra"])
    train, test = train_test_split(data, test_size=test_size)

    eng_train = list(train["eng"].values)
    fra_train = list(train["fra"].values)
    eng_test = list(test["eng"].values)
    fra_test = list(test["fra"].values)

    return eng_train, fra_train, eng_test, fra_test

# 建立英语和法语的词汇表
def build_tokenizer(sentence_list):
    """
    Builds a tokenizer and returns the tokenizer and the size of the vocabulary.
    """
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', lower=True)
    tokenizer.fit_on_texts(sentence_list)
    vocab_size = len(tokenizer.word_index) + 1
    return tokenizer, vocab_size

# 将文本转换为数字
def texts_to_sequences(texts, tokenizer):
    """
    Converts a list of texts to a padded matrix of sequences.
    """
    seqs = tokenizer.texts_to_sequences(texts)
    maxlen = max(len(seq) for seq in seqs)
    return tf.keras.preprocessing.sequence.pad_sequences(seqs, padding='post', maxlen=maxlen)

# 定义序列到序列模型
def seq2seq_model(input_vocab_size, output_vocab_size, hidden_units):
    # 定义编码器
    encoder_inputs = tf.keras.layers.Input(shape=(None,))
    encoder_embeddings = tf.keras.layers.Embedding(input_vocab_size, hidden_units)(encoder_inputs)
    encoder_lstm = tf.keras.layers.LSTM(hidden_units, return_state=True)
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_embeddings)
    encoder_states = [state_h, state_c]

    # 定义解码器
    decoder_inputs = tf.keras.layers.Input(shape=(None,))
    decoder_embeddings = tf.keras.layers.Embedding(output_vocab_size, hidden_units)(decoder_inputs)
    decoder_lstm = tf.keras.layers.LSTM(hidden_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embeddings, initial_state=encoder_states)
    decoder_dense = tf.keras.layers.Dense(output_vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # 定义整个模型
    model = tf.keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs)

    return model

# 模型评估和推理
def evaluate_model(model, eng_tokenizer, fra_tokenizer, sentence):
    """
    Applies the model to translate a given sentence from English to French.
    """
    sentence = preprocess_sentence(sentence)
    seqs = texts_to_sequences([sentence], eng_tokenizer)
    input_seq = seqs[0]
    states = model.encoder_model.predict(input_seq.reshape(1,-1))
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = fra_tokenizer.word_index['<start>']
    eos = fra_tokenizer.word_index['<end>']
    output_sentence = ""
    while True:
        output_tokens, h, c = model.decoder_model.predict([target_seq] + states)
        # 获取预测的下一个词
        idx = np.argmax(output_tokens[0, 0, :])
        # 结束条件
        if eos == idx or len(output_sentence) > 15:
            break
        word = ""
        # 根据序号转换成词
        if idx > 0:
            for w, i in fra_tokenizer.word_index.items():
                if i == idx:
                    word = w
                    break
        # 添加到输出语句中
        if len(word) > 0:
            if len(output_sentence) > 0:
                output_sentence += " "
            output_sentence += word
        # 更新解码器状态
        target_seq[0, 0] = idx
        states = [h, c]
    return output_sentence

# 主程序
if __name__ == '__main__':
    # 预处理数据
    pairs = []
    for line in lines:
        parts = line.split('\t')
        if len(parts) != 2:
            continue
        eng, fra = parts
        eng = preprocess_sentence(eng)
        fra = preprocess_sentence(fra)
        pairs.append((eng, fra))

    # 划分训练和测试集
    eng_train, fra_train, eng_test, fra_test = split_train_test(pairs, test_size=0.1)

    # 建立英语和法语的词汇表
    eng_tokenizer, eng_vocab_size = build_tokenizer(eng_train)
    fra_tokenizer, fra_vocab_size = build_tokenizer(fra_train)

    # 将文本转换为数字
    eng_train_seqs = texts_to_sequences(eng_train, eng_tokenizer)
    fra_train_seqs = texts_to_sequences(fra_train, fra_tokenizer)
    eng_test_seqs = texts_to_sequences(eng_test, eng_tokenizer)
    fra_test_seqs = texts_to_sequences(fra_test, fra_tokenizer)

    # 模型设置
    hidden_units = 256
    batch_size = 64
    num_epochs = 30
    val_split = 0.2

    # 构建并训练模型
    model = seq2seq_model(eng_vocab_size, fra_vocab_size, hidden_units)
    model.compile(optimizer=tf.keras.optimizers.Adam(), loss='sparse_categorical_crossentropy')
    model.fit(x=[eng_train_seqs, fra_train_seqs[:, :-1]], y=fra_train_seqs[:, 1:], batch_size=batch_size, epochs=num_epochs, validation_split=val_split)

    # 在测试集上评估模型
    loss = model.evaluate(x=[eng_test_seqs, fra_test_seqs[:, :-1]], y=fra_test_seqs[:, 1:])
    print("Test loss:", loss)

    # 进行翻译
    print(evaluate_model(model, eng_tokenizer, fra_tokenizer, "I am a student."))
    print(evaluate_model(model, eng_tokenizer, fra_tokenizer, "What is your name?"))
    print(evaluate_model(model, eng_tokenizer, fra_tokenizer, "How are you doing?"))


ValueError: ignored

In [14]:
# 读入数据集
data_path = "./fra.txt"
with open(data_path, "r", encoding="utf-8") as f:
    lines = f.read().split('\n')

import re

def preprocess_sentence(sentence):
    """
    Preprocesses a sentence by converting to lowercase, removing punctuation and non-alphabetic characters,
    and standardizing whitespace.
    """
    sentence = sentence.lower().strip()
    sentence = re.sub(r"[^a-zA-ZÀ-ÿ\s']", "", sentence)
    sentence = re.sub(r"\s+", " ", sentence)
    sentence = sentence.strip()
    return sentence

def preprocess_pairs(pair_list):
    """
    Preprocesses a list of sentence pairs.
    """
    preprocessed_pairs = []
    for pair in pair_list:
        eng, fra = pair.split("\t")
        preprocessed_eng = preprocess_sentence(eng)
        preprocessed_fra = preprocess_sentence(fra)
        preprocessed_pairs.append((preprocessed_eng, preprocessed_fra))
    return preprocessed_pairs

# Example usage
pair_list = ["Go.\tVa !", "Go.\tMarche.", "Go.\tEn route !"]
preprocessed_pairs = preprocess_pairs(pair_list)
print(preprocessed_pairs)


# 划分训练和测试集
data = pd.DataFrame(preprocessed_pairs, columns=["eng", "fra"])

[('go', 'va'), ('go', 'marche'), ('go', 'en route')]


In [15]:
data

Unnamed: 0,eng,fra
0,go,va
1,go,marche
2,go,en route
