In [1]:
!nvidia-smi
!pip install -Uqq datasets transformers[torch] conllu tabulate

Sat Jul 13 13:52:25 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.52.01              Driver Version: 555.99         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3070 ...    On  |   00000000:01:00.0  On |                  N/A |
| N/A   45C    P8             18W /  130W |     888MiB /   8192MiB |     23%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ar-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ar-en")



In [9]:
from datasets import Dataset
import conllu

source_lang = "ar"
target_lang = "en"

def preprocess_function(examples):
    inputs = [example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def generate_dataset():
    with open('dev.conllu', 'r') as file:
        corpus = conllu.parse(file.read())
    
    def token_depth(sent, t):
        if t['head'] == 0:
            return 0
        return 1 + token_depth(sent, sent.filter(id=t['head'])[0])

    def sentence_syntax(sent):
        res = []
        for t in sent:
            if t['upos'] == '_': continue
            depth = token_depth(sent, t)
            # if new_depth > depth:
            #     for _ in range((new_depth - depth)): res.append('[')
            # if new_depth < depth:
            #     for _ in range((depth - new_depth)): res.append(']')
            res.append(f"{t['upos']}{depth}")
            # depth = new_depth
        return ' '.join(res)

    for sent in corpus:
        yield {
            'translation': {
                "ar": sent.metadata['text'],
                "en": sentence_syntax(sent)
            }
        }

ds = Dataset.from_generator(generate_dataset).train_test_split()
tokenized_ds = ds.map(preprocess_function, batched=True)
tokenized_ds['train'][0]['translation']

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1489 [00:00<?, ? examples/s]



Map:   0%|          | 0/497 [00:00<?, ? examples/s]

{'ar': 'ولليوم الثالث استمرت موجة الصقيع ، وبلغت درجات الحرارة في عاليه والمتن اعلى مستوى تعهده المنطقتان منذ وقت طويل .',
 'en': 'CCONJ1 ADP2 NOUN1 ADJ2 VERB0 NOUN1 NOUN2 PUNCT1 CCONJ2 VERB1 NOUN2 NOUN3 ADP4 PROPN3 CCONJ5 PROPN4 ADJ2 NOUN3 VERB3 PRON4 NOUN4 ADP5 NOUN4 ADJ5 PUNCT1'}

In [10]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch", # epoch
    learning_rate=2e-4, # 2e-5
    per_device_train_batch_size=32, # 16
    per_device_eval_batch_size=32, # 16
    # weight_decay=0.01, # 0.01
    save_total_limit=3, # 3
    num_train_epochs=5, # 3
    fp16=True, # True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model),
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,1.001119
2,No log,0.516195
3,No log,0.318864
4,No log,0.243931
5,No log,0.223806


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}


TrainOutput(global_step=235, training_loss=0.9207190330992354, metrics={'train_runtime': 51.7079, 'train_samples_per_second': 143.982, 'train_steps_per_second': 4.545, 'total_flos': 220882540363776.0, 'train_loss': 0.9207190330992354, 'epoch': 5.0})

In [11]:
for txt in ['و نادى الرئيس بضرورة الاتحاد', 'الحمد لله رب العالمين']:
    pos = tokenizer.decode(
        model.generate(
            tokenizer(txt, return_tensors="pt").input_ids.cuda(),
            max_new_tokens=128, do_sample=True, top_k=30, top_p=0.95).flatten(),
        skip_special_tokens=True)
    print(pos)

CCONJ1 VERB0 NOUN1 ADP2 NOUN1 NOUN2
NOUN0 ADP2 PROPN1 NOUN1 NOUN2
