Data preprocessing

In [None]:
# Load the English side of the parallel corpus, one sentence per line.
# The encoding is pinned so decoding does not depend on the platform's default
# locale (assumes the file is UTF-8 — TODO confirm).
with open('endata.txt', 'r', encoding='utf-8') as text_file:
    eng_sent = text_file.readlines()
# readlines() keeps the line terminator; drop it from every sentence.
eng_sent = [sent.replace('\n', '') for sent in eng_sent]

In [None]:
# Load the Russian side of the parallel corpus, one sentence per line.
# An explicit encoding matters here: Cyrillic text read with a non-UTF-8
# platform default would be mis-decoded or raise UnicodeDecodeError
# (assumes the file is UTF-8 — TODO confirm).
with open('rusdata.txt', 'r', encoding='utf-8') as text_file:
    ru_sent = text_file.readlines()

In [None]:
# Drop the newline characters that readlines() leaves in each Russian sentence.
ru_sent = [line.replace('\n', '') for line in ru_sent]

In [None]:
import pandas as pd

# Pair the sentences row by row: column 'en' holds the English text and
# column 'ru' the Russian text at the same position.
df = pd.DataFrame(dict(en=eng_sent, ru=ru_sent))

In [None]:
# Positional split: the first 2643 rows are used for training and every
# remaining row for validation.
training_data, validation_data = df.iloc[:2643], df.iloc[2643:]

# Containers for the per-sentence {'en', 'ru'} records of each split.
training_list, validation_list = [], []

In [None]:
# Convert each split into a list of {'en': ..., 'ru': ...} records and wrap it
# under a 'translation' key, which is the layout preprocess_function reads.
#
# The lists are rebuilt from scratch instead of being appended to, so
# re-running this cell cannot duplicate examples. to_dict('records') replaces
# the row-by-row iterrows() loop and yields the same dictionaries.
training_list = training_data[['en', 'ru']].to_dict('records')
temp_dict_1 = {
    'translation': training_list
}

validation_list = validation_data[['en', 'ru']].to_dict('records')
temp_dict_2 = {
    'translation': validation_list
}

In [None]:
# Group both splits under the split names used by the tokenization cells.
training_dataset = {
    'train': temp_dict_1,
    'validation': temp_dict_2,
}

In [None]:
from transformers import pipeline

# Smoke-test the pretrained checkpoint by translating one sample sentence.
model_checkpoint = "Helsinki-NLP/opus-mt-en-ru"
translator = pipeline(task="translation", model=model_checkpoint)
translator("this is a test")

[{'translation_text': 'Это тест.'}]

In [None]:
pip install sacremoses

In [None]:
from transformers import AutoTokenizer

# Same checkpoint name as the one assigned in the translation pipeline cell.
model_checkpoint = "Helsinki-NLP/opus-mt-en-ru"
# NOTE(review): return_tensors is normally an argument of the tokenizer call,
# not of from_pretrained; it presumably has no effect on tokenization here —
# confirm and consider dropping it.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")

In [None]:
# Upper bound on tokens per sequence; longer inputs/targets are truncated.
max_length = 128


def preprocess_function(examples):
    """Tokenize English sources together with their Russian targets.

    Args:
        examples: Mapping whose "translation" entry is an iterable of dicts
            with "en" (source) and "ru" (target) strings.

    Returns:
        The tokenizer output for the sources, with the targets supplied via
        ``text_target``, truncated to ``max_length``.
    """
    pairs = examples["translation"]
    sources = [pair["en"] for pair in pairs]
    references = [pair["ru"] for pair in pairs]
    return tokenizer(
        sources,
        text_target=references,
        max_length=max_length,
        truncation=True,
    )

In [None]:
# Containers for the per-example tokenized features of each split.
tokenized_validation_list, tokenized_train_list = [], []
tokenized_datasets = {}

# Tokenize both splits in one call each (validation first, then train).
validation_function_output = preprocess_function(training_dataset['validation'])
training_function_output = preprocess_function(training_dataset['train'])

In [None]:
def _encoding_to_records(encoding):
    """Split a batched tokenizer output into one feature dict per example.

    Args:
        encoding: Mapping with parallel 'input_ids', 'attention_mask' and
            'labels' sequences, indexed by example position.

    Returns:
        list[dict]: One dict per example holding those three keys.
    """
    return [
        {
            'input_ids': encoding['input_ids'][i],
            'attention_mask': encoding['attention_mask'][i],
            'labels': encoding['labels'][i],
        }
        for i in range(len(encoding['input_ids']))
    ]


# Rebuild both lists from scratch (rather than appending to lists created in
# an earlier cell) so re-running this cell cannot duplicate examples.
tokenized_validation_list = _encoding_to_records(validation_function_output)
tokenized_train_list = _encoding_to_records(training_function_output)

In [None]:
# Register the per-example feature lists under their split names.
tokenized_datasets.update(
    validation=tokenized_validation_list,
    train=tokenized_train_list,
)

In [None]:
# Display the first tokenized training example as a sanity check.
tokenized_datasets['train'][0]

Fine-tuning the model

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained sequence-to-sequence model from the same checkpoint the
# tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles lists of tokenized examples into batches; it is
# given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Collate training examples 1 and 2 into one batch and list the fields the
# collator produced.
batch = data_collator(tokenized_datasets["train"][1:3])
batch.keys()

In [None]:
# Inspect the collated labels of the sample batch.
batch["labels"]

In [None]:
# Inspect the decoder inputs of the sample batch.
# NOTE(review): presumably added by the collator because it was given the
# model — confirm.
batch["decoder_input_ids"]

In [None]:
# Print the un-collated label ids of training examples 1 and 2.
for example in tokenized_datasets["train"][1:3]:
    print(example["labels"])

In [None]:
!pip install sacrebleu

In [None]:
pip install evaluate

In [None]:
import evaluate

# Load the sacreBLEU metric; compute_metrics uses it to score translations.
metric = evaluate.load("sacrebleu")

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """Decode predictions and reference labels and score them with sacreBLEU.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case only its
            first element is used.

    Returns:
        dict: ``{"bleu": score}`` where ``score`` is the ``"score"`` entry of
        the metric result.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # -100 is an ignore-index placeholder, not a vocabulary id, so it cannot
    # be decoded. Predictions may also contain it when batches are padded
    # while being gathered (the Transformers translation example applies the
    # same replacement), so map it to the pad token before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: trim whitespace; each prediction gets a
    # list holding its single reference, as passed to metric.compute below.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook; the training arguments
# below set push_to_hub=True.
notebook_login()

In [None]:
pip install transformers[torch]

In [None]:
pip install accelerate -U

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration for the fine-tuning run.
args = Seq2SeqTrainingArguments(
    # First positional argument (the output directory). Plain string: the
    # original f-prefix had no placeholders to interpolate.
    "Gopal-finetuned-custom-en-to-ru",
    evaluation_strategy="no",  # evaluation is only run by the explicit trainer.evaluate() cells
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=200,
    predict_with_generate=True,  # compute_metrics decodes token ids, so evaluation must generate them
    fp16=True,  # NOTE(review): presumably requires a GPU runtime — confirm
    push_to_hub=True,
)

In [None]:
 from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# NOTE(review): this cell repeats the Seq2SeqTrainer construction from the
# preceding cell with identical arguments; running both rebinds `trainer` to
# a second, equivalent instance. One of the two cells can likely be removed.
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Baseline: evaluate on the validation split before any fine-tuning, so the
# score can be compared with the evaluation run after training.
trainer.evaluate(max_length=max_length)

In [None]:
# Run fine-tuning with the configuration defined in `args`.
trainer.train()

In [None]:
# Upload the training result to the Hub with the given tag and commit message.
trainer.push_to_hub(tags="translation", commit_message="Training complete")

In [None]:
# Evaluate again after fine-tuning; compare with the pre-training baseline.
trainer.evaluate(max_length=max_length)