In [1]:
from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset, Dataset
from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import pandas as pd

In [3]:
# Load your dataset
df = pd.read_csv('./data/eng-kir.csv', delimiter='\t', header=None)  # Adjust this for your file format
# dataset = Dataset.from_pandas(data)


In [4]:
source_lang = "eng"  # Your source language code
target_lang = "kir"  # Your target language code

In [5]:
model_name = "facebook/m2m100_418M"  # You can choose a different size if needed
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

  return self.fget.__get__(instance, owner)()


In [6]:
available_languages = tokenizer.lang_code_to_id.keys()
print("Available languages:", list(available_languages))

Available languages: ['af', 'am', 'ar', 'ast', 'az', 'ba', 'be', 'bg', 'bn', 'br', 'bs', 'ca', 'ceb', 'cs', 'cy', 'da', 'de', 'el', 'en', 'es', 'et', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'ha', 'he', 'hi', 'hr', 'ht', 'hu', 'hy', 'id', 'ig', 'ilo', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km', 'kn', 'ko', 'lb', 'lg', 'ln', 'lo', 'lt', 'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'ne', 'nl', 'no', 'ns', 'oc', 'or', 'pa', 'pl', 'ps', 'pt', 'ro', 'ru', 'sd', 'si', 'sk', 'sl', 'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'th', 'tl', 'tn', 'tr', 'uk', 'ur', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo', 'zh', 'zu']


In [7]:
import tqdm

In [10]:
import pandas as pd
from transformers import AutoTokenizer
from tqdm import tqdm

english_sentences = df[0].tolist()  # Assuming English sentences are in the first column
kirundi_sentences = df[1].tolist()  # Assuming Kirundi sentences are in the second column

# Tokenize sentences and identify new tokens
all_sentences = english_sentences + kirundi_sentences
new_tokens = set()

filtered_sentences = []
for eng, kir in zip(english_sentences, kirundi_sentences):
    if len(tokenizer.tokenize(eng)) <= 10:
        filtered_sentences.append(eng)
        filtered_sentences.append(kir)

In [15]:
len(tokenizer.get_vocab())

128104

In [13]:
new_tokens = set()
for sentence in tqdm(filtered_sentences, desc="Tokenizing sentences"):
    tokens = tokenizer.tokenize(sentence)
    new_tokens.update(tokens)


# Convert set to list
new_tokens = list(new_tokens)

# Add new tokens to the tokenizer
if new_tokens:
    tokenizer.add_tokens(new_tokens)
    print(f"Added {len(new_tokens)} new tokens.")

# Save the updated tokenizer
tokenizer.save_pretrained('./m2m100_updated_tokenizer')

print("Tokenizer updated and saved successfully.")

Tokenizing sentences: 100%|██████████| 954/954 [00:00<00:00, 24227.36it/s]


Added 696 new tokens.
Tokenizer updated and saved successfully.


In [14]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("facebook/m2m100_418M")
model.resize_token_embeddings(len(tokenizer))

M2M100ScaledWordEmbedding(128104, 1024, padding_idx=1)

In [16]:
def check_token_length(sentence):
    tokens = tokenizer.tokenize(sentence)
    return len(tokens) <= 10

# Apply the token length check and filter the DataFrame
filtered_df = df[df.apply(lambda x: check_token_length(x[0]), axis=1)]


In [17]:
data = Dataset.from_pandas(filtered_df)
data

Dataset({
    features: ['0', '1', '__index_level_0__'],
    num_rows: 96
})

In [20]:
print(data.column_names)  # Check the actual column names
print(data[0])  

['0', '1', '__index_level_0__']
{'0': 'Adam, Seth, Enosh,', '1': 'Adamu na Seti na Enoshi,', '__index_level_0__': 0}


In [18]:
def preprocess_function(examples):
    model_inputs = tokenizer(examples["0"], max_length=128, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["1"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_data = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/96 [00:00<?, ? examples/s]


KeyError: None

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer
)

In [None]:
from datasets import load_metric
import numpy as np

bleu = load_metric("sacrebleu")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return bleu.compute(predictions=decoded_preds, references=decoded_labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [39]:
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq

training_args = TrainingArguments(
    output_dir='./huggingface_results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # your train dataset
    eval_dataset=eval_dataset,    # your eval dataset
    data_collator=data_collator,
    tokenizer=tokenizer
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

In [None]:
trainer.train()
model.save_pretrained('./trained_m2m100')

In [26]:
from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer

model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="fr")

src_text = "Life is like a box of chocolates."
tgt_text = "La vie est comme une boîte de chocolat."

model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")

loss = model(**model_inputs).loss  # forward pass

KeyError: None

In [25]:
df[0].tolist()[0]

'Adam, Seth, Enosh,'