In [1]:
!pip install evaluate
!pip install sacrebleu
!pip install transformers[torch]
!pip install datasets==2.13.2

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.1
Collecting sacrebleu
  Downloading sacrebleu-2.4.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.9/57.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.1-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.6/106.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-2.8.2

In [2]:
#from kaggle_secrets import UserSecretsClient
import wandb
#user_secrets = UserSecretsClient()

#my_secret = user_secrets.get_secret("wandb_api_key") 
#wandb.login(key=my_secret)
#wandb.init(project = 'huggingface', resume=False)
wandb.init(mode='disabled')



In [3]:
# data loading
import pandas as pd
import numpy as np
import random

df_eng_pl = pd.read_csv('../input/pl-eng-sentences/eng-pl.tsv', sep='\t', header=None, usecols=[1,3], names=['sentence', 'translation'])
df_pl_eng = pd.read_csv('../input/pl-eng-sentences/pl-eng.tsv', sep='\t', header=None, usecols=[1,3])

eng_pl = df_eng_pl.values.tolist()
pl_eng = df_pl_eng.values.tolist()

eng_pl = [['translate English to Polish: ' + sentence[0], sentence[1]] for sentence in eng_pl]
pl_eng = [['translate Polish to English: ' + sentence[0], sentence[1]] for sentence in pl_eng]

data = eng_pl + pl_eng
data = data[:1000] # part of dataset you can use in testing
data_sentences = [x[0] for x in data]
data_translation = [x[1] for x in data]

len(data)

1000

In [4]:
# removing the longest samples from data,
sentence_length = np.zeros((len(data),2))

i = 0
for sample in data:
    sentence_length[i,0] = len(sample[0])
    sentence_length[i,1] = len(sample[1])
    i += 1
    
lengths = sentence_length.flatten()
max_length = np.percentile(lengths, 97)
print(f'before: {len(data)}')

data = [x for x in data if len(x[0]) < round(max_length*0.9,0) or len(x[1]) < max_length]

print(f'after: {len(data)}, max_length set to: {max_length}')

before: 1000
after: 984, max_length set to: 100.0


In [5]:
from transformers import AutoTokenizer, T5ForConditionalGeneration, GenerationConfig
from transformers import PreTrainedTokenizerFast


def get_training_corpus():
    for sample in data:
        yield sample

training_tokenizer = get_training_corpus()

# if loading first time from huggingface
old_tokenizer = AutoTokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)
# 32128 is default vocab size of t5-small model, if different size is passed to tokenizer it has to be changed
tokenizer = old_tokenizer.train_new_from_iterator(training_tokenizer, 50000)                                         
model.resize_token_embeddings(len(tokenizer))

# if loading previously trained model
#old_tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/t5-small-tokenizer')                                           
#model = T5ForConditionalGeneration.from_pretrained('t5-small-e20', return_dict=True)

# specifying max length of encoded and decoded sentences
max_source_length = int(round(max_length*0.9,0))
max_target_length = int(max_length)


encoding = tokenizer(data_sentences, truncation=True, max_length=max_source_length)
input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

target_encoding = tokenizer(data_translation, truncation=True, max_length=max_target_length)
labels = target_encoding.input_ids

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]





In [6]:
# In order to send data to training and evaluation using trainer function from huggingface, 
# it has to be in appropriate format. It is the simplest way to use their dataset class
from datasets import Dataset, DatasetDict, Features, Array2D

ds = Dataset.from_dict({"labels": labels,
                        'input_ids': input_ids,
                        'attention_mask': attention_mask})
ds = ds.with_format("torch")
train_test = ds.train_test_split(test_size=0.2, seed=42)
test_val = train_test['test'].train_test_split(test_size=0.8, seed=42)
ds_splits= DatasetDict({
    'train': train_test['train'],
    'valid': test_val['train'],
    'test': test_val['test'],
})
ds_splits.with_format('torch')

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 800
    })
    valid: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 40
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 160
    })
})

In [7]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate

# bleu metrics to evaluation
bleu = evaluate.load('sacrebleu')

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    print(f'Examples: {decoded_preds[:10]}')
    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    return result

# training parameters
training_args = Seq2SeqTrainingArguments(
    output_dir = '../temp/test_trainer',
    report_to = None, # 'wandb'
    evaluation_strategy = 'epoch',
    learning_rate=0.0004,
    num_train_epochs = 1,
    remove_unused_columns=False,
    eval_accumulation_steps = 256,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 64,
    predict_with_generate=True,
    generation_max_length = max_target_length,
    generation_config = model.generation_config,
    )

# data collator are responsible for splitting and adjusting batch sizes
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding='longest',
    return_tensors='pt'
    )

# trainer parameters
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_splits['train'],
    eval_dataset=ds_splits['valid'],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

2024-03-13 18:55:48.438409: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-13 18:55:48.438513: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-13 18:55:48.562389: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [8]:
trainer.train()

Epoch,Training Loss,Validation Loss,Score,Counts,Totals,Precisions,Bp,Sys Len,Ref Len
1,No log,7.040511,0.0,"[5, 0, 0, 0]","[5, 1, 0, 0]","[100.0, 50.0, 0.0, 0.0]",0.0,5,277


Trainer is attempting to log a value of "[5, 0, 0, 0]" of type <class 'list'> for key "eval/counts" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[5, 1, 0, 0]" of type <class 'list'> for key "eval/totals" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[100.0, 50.0, 0.0, 0.0]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


Examples: ['', '', '', '', '', '', '', '.', '', '']


TrainOutput(global_step=13, training_loss=7.553134624774639, metrics={'train_runtime': 5.4109, 'train_samples_per_second': 147.849, 'train_steps_per_second': 2.403, 'total_flos': 9685397667840.0, 'train_loss': 7.553134624774639, 'epoch': 1.0})

In [9]:
#trainer.save_model('/kaggle/working/30etrain')

In [10]:
#tokenizer.save_pretrained('/kaggle/working/tokenizer')

In [11]:
#wandb.finish()