In [35]:
import tensorflow as tf

text_file = tf.keras.utils.get_file(
    fname = 'fra-eng.zip',
    origin = "http://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip",
    extract = True,
)

In [37]:
import pathlib 

text_file = pathlib.Path(text_file).parent / 'fra.txt'

In [39]:
print(text_file)


C:\Users\HP\.keras\datasets\fra.txt


In [41]:
import os

# Path to the extracted folder
extracted_path = os.path.join(os.path.expanduser('~'), '.keras', 'datasets', 'fra-eng_extracted')

# List files inside that folder
print("Files in extracted folder:", os.listdir(extracted_path))


Files in extracted folder: ['fra.txt', '_about.txt']


In [43]:
file_path = os.path.join(extracted_path, 'fra.txt')

with open(file_path, encoding='utf-8') as fp:
    text_pairs = [line.strip() for line in fp]

# Optional: show a sample
print("Example:", text_pairs[0])


Example: Go.	Va !


In [45]:
eng_sentences = []
fra_sentences = []

for line in text_pairs:
    eng, fra = line.split('\t')
    eng_sentences.append(eng)
    fra_sentences.append(fra)

print("English:", eng_sentences[:5])
print("French:", fra_sentences[:5])


English: ['Go.', 'Hi.', 'Run!', 'Run!', 'Who?']
French: ['Va !', 'Salut !', 'Cours\u202f!', 'Courez\u202f!', 'Qui ?']


In [54]:
import unicodedata
import re

def normalize(text):
    text = unicodedata.normalize("NFD", text)
    text = text.encode("ascii", "ignore").decode("utf-8")  # remove accents
    text = text.lower()
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


In [56]:
with open(file_path, encoding='utf-8') as fp:
    text_pairs = [normalize(line) for line in fp]


In [60]:
import random

# Assuming eng_sentences and fra_sentences are already created
test_pair = list(zip(eng_sentences, fra_sentences))

for _ in range(5):
    eng, fra = random.choice(test_pair)
    print(f"English: {eng}")
    print(f"French:  {fra}")
    print('-' * 30)


English: Asia is much larger than Australia.
French:  L'Asie est beaucoup plus grande que l'Australie.
------------------------------
English: I thought you went home.
French:  Je pensais que vous étiez allé chez vous.
------------------------------
English: She suddenly kissed me.
French:  Elle m'embrassa subitement.
------------------------------
English: Do you know why Tom doesn't like Mary?
French:  Sais-tu pourquoi Tom n'aime pas Mary ?
------------------------------
English: You're not safe here.
French:  Vous n'êtes pas en sécurité, ici.
------------------------------


In [70]:
eng_tokens, fre_tokens = set(), set()
eng_maxlen, fre_maxlen = 0, 0

for line in text_pairs:
    parts = line.split('\t')
    if len(parts) != 2:
        continue  # skip malformed lines

    eng, fre = parts
    eng_words = eng.split()
    fre_words = fre.split()

    # Add tokens to vocab sets
    eng_tokens.update(eng_words)
    fre_tokens.update(fre_words)

    # Update max lengths
    eng_maxlen = max(eng_maxlen, len(eng_words))
    fre_maxlen = max(fre_maxlen, len(fre_words))

# ✅ Final output
print(f"Total tokens in English: {len(eng_tokens)}")
print(f"Total tokens in French: {len(fre_tokens)}")
print(f"Maximum English sentence length: {eng_maxlen}")
print(f"Maximum French sentence length: {fre_maxlen}")


Total tokens in English: 0
Total tokens in French: 0
Maximum English sentence length: 0
Maximum French sentence length: 0


In [72]:
import pickle

with open("text-pair.pickel" , 'wb') as fp:
    pickle.dump(text_pairs, fp)