Data preprocessing

In [34]:
with open('endata.txt', 'r') as text_file:
    eng_sent = text_file.readlines()
eng_sent = [sent.replace('\n','') for sent in eng_sent]

In [35]:
with open('rusdata.txt', 'r') as text_file:
  ru_sent = text_file.readlines()

In [36]:
ru_sent = [sent.replace('\n','') for sent in ru_sent]

In [37]:
import pandas as pd

df = pd.DataFrame({'en':eng_sent,'ru':ru_sent})

In [38]:
training_data = df.iloc[:2643]
training_list = []

validation_data = df.iloc[2643:]
validation_list = []

In [39]:
for i, row in training_data.iterrows():
  temp_dict = {
          'en': row.en,
          'ru': row.ru
      }
  training_list.append(temp_dict)

temp_dict_1 = {
    'translation': training_list
}

for i, row in validation_data.iterrows():
  temp_dict = {
          'en': row.en,
          'ru': row.ru
      }
  validation_list.append(temp_dict)

temp_dict_2 = {
    'translation': validation_list
}

In [40]:
training_dataset = {}
training_dataset['train'] = temp_dict_1
training_dataset['validation'] = temp_dict_2

In [41]:
from transformers import pipeline

model_checkpoint = "Helsinki-NLP/opus-mt-en-ru"
translator = pipeline("translation", model=model_checkpoint)
translator("this is a test")

[{'translation_text': 'Это тест.'}]

In [10]:
pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/897.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m890.9/897.5 kB[0m [31m14.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [42]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-ru"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")

In [43]:
max_length = 128


def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["ru"] for ex in examples["translation"]]
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, truncation=True
    )
    return model_inputs

In [44]:
tokenized_validation_list = []
tokenized_train_list = []
tokenized_datasets = {}

validation_function_output = preprocess_function(training_dataset['validation'])
training_function_output = preprocess_function(training_dataset['train'])

In [45]:
for i in range(len(validation_function_output['input_ids'])):
  temp_dict = {
      'input_ids' : validation_function_output['input_ids'][i],
      'attention_mask': validation_function_output['attention_mask'][i],
      'labels': validation_function_output['labels'][i]
  }
  tokenized_validation_list.append(temp_dict)

for i in range(len(training_function_output['input_ids'])):
  temp_dict = {
      'input_ids' : training_function_output['input_ids'][i],
      'attention_mask': training_function_output['attention_mask'][i],
      'labels': training_function_output['labels'][i]
  }
  tokenized_train_list.append(temp_dict)

In [46]:
tokenized_datasets['validation'] = tokenized_validation_list
tokenized_datasets['train'] = tokenized_train_list

In [47]:
tokenized_datasets['train'][0]

{'input_ids': [32, 4686, 55477, 480, 33121, 59, 4, 4382, 3, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [1227, 3464, 229, 2190, 13320, 15384, 575, 95, 12059, 3, 0]}

Fine tuning of the model

In [48]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [49]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [50]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [51]:
batch["labels"]

tensor([[ 1227,  3464,   229,  2190,  3759,    53,  7392,    92,  4495,   809,
         38765,   776,    95,   172,  7043,     3,     0],
        [23531,    11,    44,  3464,   229,  2190, 10783, 24432,   193,   732,
           457,     3,     0,  -100,  -100,  -100,  -100]])

In [52]:
batch["decoder_input_ids"]

tensor([[62517,  1227,  3464,   229,  2190,  3759,    53,  7392,    92,  4495,
           809, 38765,   776,    95,   172,  7043,     3],
        [62517, 23531,    11,    44,  3464,   229,  2190, 10783, 24432,   193,
           732,   457,     3,     0, 62517, 62517, 62517]])

In [53]:
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

[1227, 3464, 229, 2190, 3759, 53, 7392, 92, 4495, 809, 38765, 776, 95, 172, 7043, 3, 0]
[23531, 11, 44, 3464, 229, 2190, 10783, 24432, 193, 732, 457, 3, 0, -100, -100, -100, -100]


In [54]:
!pip install sacrebleu



In [12]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from evaluate)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [

In [55]:
import evaluate

metric = evaluate.load("sacrebleu")

In [56]:
import numpy as np


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [57]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     

In [14]:
pip install accelerate -U



In [58]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    f"Gopal-finetuned-custom-en-to-ru",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=200,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

In [59]:
 from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [60]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [31]:
trainer.evaluate(max_length=max_length)

{'eval_loss': 1.2806780338287354,
 'eval_bleu': 31.288952328211977,
 'eval_runtime': 2667.5783,
 'eval_samples_per_second': 0.387,
 'eval_steps_per_second': 0.006}

In [61]:
trainer.train()

Step,Training Loss
500,0.8681
1000,0.4533
1500,0.2497
2000,0.1399
2500,0.0849
3000,0.0581
3500,0.0434
4000,0.0343
4500,0.0282
5000,0.0239


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Checkpoint destination directory Gopal-finetuned-custom-en-to-ru/checkpoint-336 already exists and is non-empty. Saving will 

Step,Training Loss
500,0.8681
1000,0.4533
1500,0.2497
2000,0.1399
2500,0.0849
3000,0.0581
3500,0.0434
4000,0.0343
4500,0.0282
5000,0.0239


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


TrainOutput(global_step=8400, training_loss=0.12469503470829554, metrics={'train_runtime': 3742.4084, 'train_samples_per_second': 141.246, 'train_steps_per_second': 2.245, 'total_flos': 1.02743295787008e+16, 'train_loss': 0.12469503470829554, 'epoch': 200.0})

In [62]:
trainer.push_to_hub(tags="translation", commit_message="Training complete")

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


model.safetensors:   0%|          | 0.00/305M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1711693763.9a0f808d947e.1707.2:   0%|          | 0.00/9.22k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Bradpitt1234/Gopal-finetuned-custom-en-to-ru/commit/4585f6939a6ed039651dd20ad3a120a9708b8eac', commit_message='Training complete', commit_description='', oid='4585f6939a6ed039651dd20ad3a120a9708b8eac', pr_url=None, pr_revision=None, pr_num=None)

In [63]:
trainer.evaluate(max_length=max_length)

{'eval_loss': 1.8298757076263428,
 'eval_bleu': 30.233214532312058,
 'eval_runtime': 72.935,
 'eval_samples_per_second': 15.521,
 'eval_steps_per_second': 0.247,
 'epoch': 200.0}

In [64]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("translation", model="Bradpitt1234/Gopal-finetuned-custom-en-to-ru")

pipe('The digital signal processor (DSP) can be used for both noise cancellation and equalization based on the requirement.')

model.safetensors:   0%|          | 0.00/305M [00:00<?, ?B/s]

[{'translation_text': 'Цифровой сигнальный процессор (DSP) может быть использовался для шумоподавления и эквализации в зависимости от требования.'}]

In [67]:
pipe("I am going out with friends today, do not disturb me. I think weather is nice today")

[{'translation_text': 'Я встречаюсь с друзьями сегодня, не беспокойте меня. Я думаю, погода сегодня хороша.'}]