In [7]:
import datasets
from datasets import load_dataset, inspect_dataset, load_dataset_builder, VerificationMode

In [9]:
builder = load_dataset_builder(
    './wmt14/wmt_utils.py',
    language_pair = ('fr', 'en'),
    subsets = {
        datasets.Split.TRAIN: ["europarl_v7"],
        datasets.Split.VALIDATION: ['newstest2013'],
        datasets.Split.TEST: ['newstest2014']
    },
    cache_dir = '/data2/brian/.cache/dataset'
)
# datasets.DatasetBuilder class

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [10]:
builder.download_and_prepare(verification_mode=VerificationMode.NO_CHECKS)

In [11]:
dataset = builder.as_dataset()

In [26]:
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 2002756
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3003
    })
})

In [33]:
dataset['train'][0]['translation']

{'en': 'Resumption of the session', 'fr': 'Reprise de la session'}

In [50]:
import random
prompt = """Translate the following French sentence to English\n"""
template = ""
for i in range(5):
    random_num = random.randint(0, len(dataset['train']))
    random_sample = dataset['train'][random_num]['translation']
    template += prompt + f"French: {random_sample['fr']}\nEnglish: {random_sample['en']}\n"

In [51]:
print(template)

Translate the following French sentence to English
French: L’introduction de la durée limitée de validité permettra de mettre un terme à la dernière obstruction à la libre circulation en ce domaine.
English: The introduction of the limited period of validity will enable us to remove the final obstacle to free movement in this area.
Translate the following French sentence to English
French: Au sein de l'Union européenne, nous sommes arrivés à un stade où nous pouvons nous dire: "Au travail! Il faut recommencer!
English: We are now at a time in the European Union when we can say 'back to work, Europe'; let us go back, let us work, let us operate.
Translate the following French sentence to English
French: Le clonage thérapeutique, ou plus précisément l'utilisation à des fins thérapeutiques de cellules souches embryonnaires obtenues par les techniques de clonage, est un exemple de questions éthiques soulevées par les rapides avancées scientifiques dans les sciences de la vie.
English: Ther

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-v0.2')

In [None]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f""""""
        output_texts.append(text)
    return output_texts

In [1]:
with open('doc-data/europarl-v7.fr-en.en', 'r') as f:
    en_text = f.readlines()
with open('doc-data/europarl-v7.fr-en.fr', 'r') as f:
    fr_text = f.readlines()

In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bigscience/bloomz-7b1', cache_dir = '/data2/brian/.cache')

In [6]:
en_text[:19]

['Resumption of the session\n',
 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.\n',
 "Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.\n",
 'You have requested a debate on this subject in the course of the next few days, during this part-session.\n',
 "In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.\n",
 "Please rise, then, for this minute' s silence.\n",
 "(The House rose and observed a minute' s silence)\n",
 'Madam President, on a point of order.\n',
 'You will be aware from the press and television th

In [42]:
sent_data = Dataset.from_dict({'en': en_text, 'fr': fr_text})

In [45]:
sent_data.save_to_disk('fr-en/hf')

Saving the dataset (2/2 shards): 100%|██████████| 2007723/2007723 [00:01<00:00, 1632848.03 examples/s]


In [28]:
import os
file_names = list(map(os.path.basename, os.listdir('French-English/English')))

In [29]:
en_path = lambda x: f"French-English/English/{x}"
fr_path = lambda x: f"French-English/French/{x}"

In [30]:
with open(en_path(file_names[2])) as f:
    text = f.read()

In [24]:
from bs4 import BeautifulSoup
import unicodedata
cleantext = BeautifulSoup(text, "lxml").text
cleantext = unicodedata.normalize('NFKC', cleantext)
cleantext = cleantext.replace(u'\\u0027', "'")
cleantext

'Look East To Save Europe\'s Social Market\n\n\nAs expansion of the EU approaches, many Europeans see in it only things to be feared: masses of economic migrants, and poor countries demanding subsidies.\nBut Europe\'s new eastern members can also act as a beacon for the Union, as Jacques Rupnik suggests.\n\nIt is often argued that Continental Europe\'s social and economic model, which seeks to combine competitiveness with solidarity, is the glue that binds the European Union together, as well as distinguishing Europe from the American (or Anglo-Saxon) free-market model.\nClearly, Europe\'s answer to globalization is that certain spheres of social life-say, healthcare, education, the environment, or culture-cannot be left to the whip of the market.\n\nOn the surface it seems that Europe\'s steady integration proceeded in parallel with the development of the welfare state.\nBut this is misleading: the European social model is, in fact, part and parcel of the identity of the EU member sta

In [33]:
import re
text_preprocessed = re.sub('\n+', ' ', re.sub('\<.*\>', '', text))
text_preprocessed

'Look East To Save Europe\\u0027s Social Market As expansion of the EU approaches, many Europeans see in it only things to be feared: masses of economic migrants, and poor countries demanding subsidies. But Europe\'s new eastern members can also act as a beacon for the Union, as Jacques Rupnik suggests. It is often argued that Continental Europe\'s social and economic model, which seeks to combine competitiveness with solidarity, is the glue that binds the European Union together, as well as distinguishing Europe from the American (or Anglo-Saxon) free-market model. Clearly, Europe\'s answer to globalization is that certain spheres of social life-say, healthcare, education, the environment, or culture-cannot be left to the whip of the market. On the surface it seems that Europe\'s steady integration proceeded in parallel with the development of the welfare state. But this is misleading: the European social model is, in fact, part and parcel of the identity of the EU member states more 

In [34]:
def read_and_clean_doc(path):
    with open(path, 'r') as f:
        text = f.read()
    text = BeautifulSoup(text, "lxml").text
    text = unicodedata.normalize('NFKC', text)
    text = text.replace(u'\\u0027', "'")
    text = re.sub('\n+', ' ', re.sub('\<.*\>', '', text))
    return text

In [8]:
read_and_clean_doc(fr_path(file_names[0]))

NameError: name 'unicodedata' is not defined

In [32]:
read_and_clean_doc(fr_path(file_names[1]))

NameError: name 're' is not defined

In [48]:
en_doc = [read_and_clean_doc(en_path(path)) for path in file_names]
fr_doc = [read_and_clean_doc(fr_path(path)) for path in file_names]

In [51]:
doc_data = Dataset.from_dict({'en': en_doc, 'fr': fr_doc})

In [53]:
doc_data.save_to_disk('French-English/hf')

Saving the dataset (1/1 shards): 100%|██████████| 4794/4794 [00:00<00:00, 92718.51 examples/s]


# Test Set

In [19]:
with open('doc-data/news.2013.fr.shuffled', 'r') as f: 
    val_data = f.readlines()
with open('doc-data/news.2014.fr.shuffled.v2', 'r') as f: 
    test_data = f.readlines()

["Au pied des pyramides de Gizeh, l'air est exactement le même.\n",
 "L'un des effets collatéraux du cancer est la détresse psychologique.\n",
 "Les Thrashers, qui avaient le deuxième choix, ont donc accepté d'échanger ce choix aux Canucks en retour du tout premier.\n",
 "Selon lui, le président des États-Unis, Barack Obama et la présidente de la Commission d'enquête sur l'octroi et la gestion des contrats publics dans l'industrie de la construction, la juge France Charbonneau, sont deux des trois personnalités qui ont occupé l'actualité, aux côtés de la première ministre du Québec, Pauline Marois.\n",
 'Une seule de ses plumes tombée à terre, avait, dit-on, suffi à colorer le monde... Mais comment trouver sa majesté ?\n',
 "En plus de cette rémunération de très nombreux avantages viendront s'y ajouter (Participation et Intéressement, CE, .....)\n",
 "Depuis cet été, cependant, de belles choses semblent poindre à l'horizon pour le jeune homme de 19 ans.\n",
 '"Notre but à tous les deux

# Model Load Test

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [3]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/phi-2', cache_dir = '/data2/brian/.cache')
model = AutoModelForCausalLM.from_pretrained('microsoft/phi-2', cache_dir = '/data2/brian/.cache', trust_remote_code = True,)

tokenizer_config.json: 100%|██████████| 7.34k/7.34k [00:00<00:00, 12.9MB/s]
vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 1.05MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.18MB/s]
tokenizer.json: 100%|██████████| 2.11M/2.11M [00:00<00:00, 11.9MB/s]
added_tokens.json: 100%|██████████| 1.08k/1.08k [00:00<00:00, 2.63MB/s]
special_tokens_map.json: 100%|██████████| 99.0/99.0 [00:00<00:00, 247kB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
config.json: 100%|██████████| 863/863 [00:00<00:00, 2.15MB/s]
configuration_phi.py: 100%|██████████| 9.26k/9.26k [00:00<00:00, 18.4MB/s]
A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- configuration_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
modeling_phi.py: 100%|██████████| 62.7k/62.7k [00:00<00:0

KeyboardInterrupt: 

In [6]:
from datasets import load_dataset, Dataset

In [7]:
Dataset.load_from_disk('/data2/brian/personal/translation/fr-en/hf')

Dataset({
    features: ['en', 'fr'],
    num_rows: 2007723
})