In [1]:
!nvidia-smi
!pip install -Uqq datasets transformers[torch] conllu tabulate

Wed Jul 17 20:57:46 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.52.01              Driver Version: 555.99         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3070 ...    On  |   00000000:01:00.0  On |                  N/A |
| N/A   56C    P8             21W /  112W |    1068MiB /   8192MiB |     55%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ar-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ar-en")



In [3]:
from datasets import Dataset
import conllu

source_lang = "ar"
target_lang = "en"

def preprocess_function(examples):
    inputs = [example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def generate_dataset():
    with open('dev.conllu', 'r') as file:
        corpus = conllu.parse(file.read())
    
    def token_depth(sent, t):
        if t['head'] == 0:
            return 0
        return 1 + token_depth(sent, sent.filter(id=t['head'])[0])

    def sentence_syntax(sent):
        res = []
        for t in sent:
            if t['upos'] == '_': continue
            parent_dist = t['head'] - t['id']
            # new_depth = token_depth(sent, t)
            # if new_depth > depth:
            #     for _ in range((new_depth - depth)): res.append('[')
            # if new_depth < depth:
            #     for _ in range((depth - new_depth)): res.append(']')
            res.append(f"{t['upos']}{t['deprel']}{token_depth(sent, t)}{'!' if parent_dist > 0 else ''}")
            # depth = new_depth
        return ' '.join(res)

    for sent in corpus:
        yield {
            'translation': {
                "ar": sent.metadata['text'],
                "en": sentence_syntax(sent)
            }
        }

ds = Dataset.from_generator(generate_dataset).train_test_split()
tokenized_ds = ds.map(preprocess_function, batched=True)
tokenized_ds['train'][0]['translation']

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1489 [00:00<?, ? examples/s]



Map:   0%|          | 0/497 [00:00<?, ? examples/s]

{'ar': 'وخلاصة الخبر ان القرار اللبناني استعاد أنفاسه على ما يبدو ، واثر الدوّيخة التي ادخلته في متاهتها فتّيشة رئيس المجلس .',
 'en': 'CCONJcc1! NOUNroot0 NOUNnmod:poss1 SCONJmark2! NOUNnmod1 ADJamod2 VERBccomp2 NOUNobj3 PRONnmod:poss4 ADPcase4! PRONiobj3 VERBxcomp4 PUNCTpunct3 CCONJcc4! ADVadvmod3 NOUNnmod:poss4 PRONnmod5 VERBccomp6 PRONobj7 ADPcase8! NOUNiobj7 PRONnmod:poss8 NOUNnsubj7 NOUNnmod:poss8 NOUNnmod:poss9 PUNCTpunct1'}

In [4]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch", # epoch
    learning_rate=2e-4, # 2e-5
    per_device_train_batch_size=32, # 16
    per_device_eval_batch_size=32, # 16
    weight_decay=0.01, # 0.01
    save_total_limit=3, # 3
    num_train_epochs=9, # 3
    fp16=True, # True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model),
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.608217
2,No log,0.385457
3,No log,0.25429
4,No log,0.196992
5,No log,0.167899
6,No log,0.165825
7,No log,0.149886
8,No log,0.145166
9,No log,0.144722


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}


TrainOutput(global_step=423, training_loss=0.3534859370959848, metrics={'train_runtime': 88.3794, 'train_samples_per_second': 151.63, 'train_steps_per_second': 4.786, 'total_flos': 395285390622720.0, 'train_loss': 0.3534859370959848, 'epoch': 9.0})

In [5]:
for txt in ['و نادى الرئيس بضرورة الاتحاد', 'الحمد لله رب العالمين. الرحمن الرحيم. مالك يوم الدين']:
    pos = tokenizer.decode(
        model.generate(
            tokenizer(txt, return_tensors="pt").input_ids.cuda(),
            max_new_tokens=128, do_sample=True, top_k=30, top_p=0.95).flatten(),
        skip_special_tokens=True)
    print(pos)

CCONJcc1! VERBroot0 NOUNnsubj1 ADPcase2! NOUNobj1 NOUNnmod:poss2
NOUNnsubj1! ADPcase3! PROPNobj2 NOUNnmod2 ADJamod3 PUNCTpunct1! PROPNroot0 PROPNflat1 PUNCTpunct1 NOUNnmod1 NOUNnmod:poss2 NOUNnmod:poss3
