In [1]:
import pandas as pd
from datasets import load_dataset
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
data = {
    "source": [
        "Machine learning enables computers to learn from data.",
        "Natural language processing involves the interaction between computers and human language."
    ],
    "summary": [
        "Computers learn from data using machine learning.",
        "NLP is the interaction between computers and human language."
    ]
}

df = pd.DataFrame(data)
df.to_csv("domain_data.csv", index=False)

In [22]:
# Load dataset
dataset2 = load_dataset('csv', data_files='domain_data.csv')

Using custom data configuration default-e2e765e962961544


Downloading and preparing dataset csv/default to C:\Users\iohkg\.cache\huggingface\datasets\csv\default-e2e765e962961544\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


100%|██████████| 1/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 74.82it/s]
  csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.read_csv_kwargs)


Dataset csv downloaded and prepared to C:\Users\iohkg\.cache\huggingface\datasets\csv\default-e2e765e962961544\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 250.63it/s]


In [23]:
dataset2

DatasetDict({
    train: Dataset({
        features: ['source', 'summary'],
        num_rows: 2
    })
})

In [2]:
# Load dataset
dataset = load_dataset('csv', data_files='small_vocab.txt')

Using custom data configuration default-e529f6e7d863e59f
Reusing dataset csv (C:\Users\iohkg\.cache\huggingface\datasets\csv\default-e529f6e7d863e59f\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)
100%|██████████| 1/1 [00:00<00:00, 59.12it/s]


In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['domain', 'general'],
        num_rows: 75
    })
})

In [4]:
# Load tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
# Load model
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

In [17]:
def preprocess_data(data):
    inputs = data['general']
    targets = data['domain']
    model_inputs = tokenizer(inputs, max_length=10, truncation=True, padding='max_length')
    
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=4, truncation=True,  padding='max_length')
    
    labels["input_ids"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels_example] 
        for labels_example in labels["input_ids"]
    ]

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [24]:
tokenized_dataset = dataset.map(preprocess_data, batched=True,  remove_columns=["general", "domain"])

100%|██████████| 1/1 [00:00<00:00, 14.18ba/s]


In [25]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 75
    })
})

In [26]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['train'],
)



In [27]:
# Fine-tune the model
trainer.train()

 33%|███▎      | 38/114 [08:38<14:43, 11.63s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                
 33%|███▎      | 38/114 [09:07<14:43, 11.63s/it]
[A

{'eval_loss': 1.5409327745437622, 'eval_runtime': 29.0345, 'eval_samples_per_second': 2.583, 'eval_steps_per_second': 1.309, 'epoch': 1.0}


 67%|██████▋   | 76/114 [16:14<06:55, 10.94s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                
 67%|██████▋   | 76/114 [16:46<06:55, 10.94s/it]
[A

{'eval_loss': 0.7686346173286438, 'eval_runtime': 31.8061, 'eval_samples_per_second': 2.358, 'eval_steps_per_second': 1.195, 'epoch': 2.0}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                 
100%|██████████| 114/114 [25:01<00:00, 10.98s/it]
[A
100%|██████████| 114/114 [25:01<00:00, 13.17s/it]

{'eval_loss': 0.7108722925186157, 'eval_runtime': 34.2373, 'eval_samples_per_second': 2.191, 'eval_steps_per_second': 1.11, 'epoch': 3.0}
{'train_runtime': 1501.1902, 'train_samples_per_second': 0.15, 'train_steps_per_second': 0.076, 'train_loss': 0.7234770122327303, 'epoch': 3.0}





TrainOutput(global_step=114, training_loss=0.7234770122327303, metrics={'train_runtime': 1501.1902, 'train_samples_per_second': 0.15, 'train_steps_per_second': 0.076, 'total_flos': 4761704448000.0, 'train_loss': 0.7234770122327303, 'epoch': 3.0})

In [28]:
# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)

100%|██████████| 38/38 [00:46<00:00,  1.21s/it]

{'eval_loss': 0.7108722925186157, 'eval_runtime': 47.1141, 'eval_samples_per_second': 1.592, 'eval_steps_per_second': 0.807, 'epoch': 3.0}





In [29]:
# Save the fine-tuned model
model.save_pretrained('./fine-tuned-bart')
tokenizer.save_pretrained('./fine-tuned-bart')

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('./fine-tuned-bart\\tokenizer_config.json',
 './fine-tuned-bart\\special_tokens_map.json',
 './fine-tuned-bart\\vocab.json',
 './fine-tuned-bart\\merges.txt',
 './fine-tuned-bart\\added_tokens.json')