In [1]:
import torch
# import torch_directml

In [2]:
import pandas as pd
from datasets import load_dataset
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq

  from .autonotebook import tqdm as notebook_tqdm
2024-08-02 01:47:31.949026: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-08-02 01:47:33.828087: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libdirectml.d6f03b303ac3c4f2eeb8ca631688c9757b361310.so
2024-08-02 01:47:33.828163: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libdxcore.so
2024-08-02 01:47:33.843459: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libd3d12.so
2024-08-02 01:47:35.257348: I tensorflow/c/logging.cc:34] DirectML device enumeration: found 1 compatible adapters.


In [3]:
# Activate DirectML for Windows Subsystem Linux
dml = torch_directml.device()
tensor1 = torch.tensor([1]).to(dml) # Note that dml is a variable, not a string!
tensor2 = torch.tensor([2]).to(dml)
dml_algebra = tensor1 + tensor2
dml_algebra.item()

NameError: name 'torch_directml' is not defined

In [4]:
# Load dataset
dataset = load_dataset('csv', data_files='./domain_vocab/large_vocab_refined.txt')

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['domain', 'general'],
        num_rows: 8563
    })
})

In [6]:
# Load tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
# Load model
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')



In [7]:
def preprocess_data(data):
    inputs = data['general']
    targets = data['domain']
    model_inputs = tokenizer(inputs, max_length=10, truncation=True, padding='max_length')
    
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=4, truncation=True,  padding='max_length')
    
    labels["input_ids"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels_example] 
        for labels_example in labels["input_ids"]
    ]

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [8]:
tokenized_dataset = dataset.map(preprocess_data, batched=True,  remove_columns=["general", "domain"])

In [9]:
train_test_split = tokenized_dataset['train'].train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
validation_dataset = train_test_split['test']

In [12]:
validation_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 857
})

In [10]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    warmup_steps=500,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator,
)

In [11]:
# Fine-tune the model with large dataset
trainer.train()

***** Running training *****
  Num examples = 7706
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 8
  Total optimization steps = 1443
  Number of trainable parameters = 406291456


: 

In [15]:
# Fine-tune the model with small dataset
trainer.train()

***** Running training *****
  Num examples = 324
  Num Epochs = 3
  Instantaneous batch size per device = 2


  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 486
  Number of trainable parameters = 406291456


Epoch,Training Loss,Validation Loss
1,No log,1.104275
2,No log,1.016484
3,No log,0.940805


***** Running Evaluation *****
  Num examples = 324
  Batch size = 2
***** Running Evaluation *****
  Num examples = 324
  Batch size = 2
***** Running Evaluation *****
  Num examples = 324
  Batch size = 2


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=486, training_loss=0.8041648707762667, metrics={'train_runtime': 1165.2983, 'train_samples_per_second': 0.834, 'train_steps_per_second': 0.417, 'total_flos': 24168295464960.0, 'train_loss': 0.8041648707762667, 'epoch': 3.0})

In [16]:
# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)

***** Running Evaluation *****
  Num examples = 324
  Batch size = 2


{'eval_loss': 0.9408054947853088, 'eval_runtime': 33.8424, 'eval_samples_per_second': 9.574, 'eval_steps_per_second': 4.787, 'epoch': 3.0}


In [17]:
# Save the fine-tuned model
model.save_pretrained('./fine-tuned-bart')
tokenizer.save_pretrained('./fine-tuned-bart')

Configuration saved in ./fine-tuned-bart/config.json


Model weights saved in ./fine-tuned-bart/pytorch_model.bin
tokenizer config file saved in ./fine-tuned-bart/tokenizer_config.json
Special tokens file saved in ./fine-tuned-bart/special_tokens_map.json


('./fine-tuned-bart/tokenizer_config.json',
 './fine-tuned-bart/special_tokens_map.json',
 './fine-tuned-bart/vocab.json',
 './fine-tuned-bart/merges.txt',
 './fine-tuned-bart/added_tokens.json')

In [4]:
# Load the fine-tuned model and tokenizer
ft_model = BartForConditionalGeneration.from_pretrained('../../../../../../pMage_AI_model/fine-tuned-bart')
tokenizer = BartTokenizer.from_pretrained('../../../../../../pMage_AI_model/fine-tuned-bart')

### Test generating summary of the task name

In [7]:
def summarize_phrase(ft_model, phrase, min_len=4, max_len=10):
    # Tokenize the input phrase
    inputs = tokenizer(phrase, max_length=150, truncation=True, return_tensors="pt")

    summary_ids = ft_model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_len,
        min_length=min_len,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True
    )

    # Decode the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [8]:
input_phrase = "Assemble the system into a prototype vehicle that we can verify if the brake system can work properly via some specific assessments carried out by another team"

In [None]:
summary = summarize_phrase(ft_model, input_phrase, 10, 10)
print(summary)

In [8]:
tasks = ["Design brake system blueprint", "Implement Remote Sensing Devices braking activation", "Connect Remote Sensing Devices with brake control system", "Assemble the system into a prototype vehicle", "Assemble the system into the final version"]

In [16]:
for task in tasks:
    summary = summarize_phrase(ft_model, task, 10, 10)
    print(summary)

brakeke system design
brakebrake activation
brakebrake Vehicle
autom/brake
technician.Assemble
