In [25]:
from datasets import load_dataset

# Hub repository and configuration shared by the train / validation / test slices.
REVIEWS_REPO = "McAuley-Lab/Amazon-Reviews-2023"
REVIEWS_CONFIG = "raw_review_Software"


def load_review_slice(row_range):
    """Load one contiguous row range of the ``full`` split.

    Args:
        row_range: Slice expression placed inside ``full[...]``,
            e.g. ``"699900:700000"``.

    Returns:
        The ``Dataset`` returned by ``load_dataset`` for that slice.
    """
    return load_dataset(
        REVIEWS_REPO,
        REVIEWS_CONFIG,
        split=f"full[{row_range}]",
        # The library warns that this flag will become mandatory for this
        # dataset. NOTE(review): it allows the dataset repository's loading
        # script to run locally, so only keep it for repositories you trust.
        trust_remote_code=True,
    )


# The three row ranges do not overlap, so the splits stay disjoint.
dataset_train = load_review_slice("2195000:2200000")
dataset_val = load_review_slice("599900:600000")
dataset_test = load_review_slice("699900:700000")
dataset_test

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Dataset({
    features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
    num_rows: 100
})

In [26]:
# Display one raw record (index 50) from the test slice to see its fields.
dataset_test[50]

{'rating': 5.0,
 'title': 'Great',
 'text': 'Great',
 'images': [],
 'asin': 'B06Y66GB9T',
 'parent_asin': 'B06Y66GB9T',
 'user_id': 'AHAOZODJISG3VGEPREVRDVUAGMPA',
 'timestamp': 1559611858977,
 'helpful_vote': 12,
 'verified_purchase': True}

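This is a cleaned Python dataset covering about 25,000 instructional tasks.
Overview
The dataset has 4 key features (fields): instruction, input, output, and text.
It is a rich source of Python code and tasks, and it also extends into behavioral aspects.
Note: this description does not match the data loaded in the cells above (McAuley-Lab/Amazon-Reviews-2023, `raw_review_Software`), whose fields are rating, title, text, images, asin, parent_asin, user_id, timestamp, helpful_vote, and verified_purchase.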

1. Dataset Statistics
* Total Entries: 24,813
* Unique Instructions: 24,580
* Unique Inputs: 3,666
* Unique Outputs: 24,581
* Unique Texts: 24,813
* Average Tokens per example: 508
2. Features
* instruction: The instructional task to be performed / User input
* input: Very short, introductory part of the AI response, or empty
* output: Python code that accomplishes the task
* text: All fields combined together

In [27]:
import torch
# Use the GPU when one is visible to PyTorch; otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [28]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with max-length padding and truncation."""
    review_texts = examples['text']
    encoded_batch = tokenizer(review_texts, truncation=True, padding='max_length')
    return encoded_batch


# Apply the same batched tokenization to each split, in train / val / test order.
tokenized_datasets_train, tokenized_datasets_val, tokenized_datasets_test = (
    raw_split.map(tokenize_function, batched=True)
    for raw_split in (dataset_train, dataset_val, dataset_test)
)


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [29]:
# Display the tokenized test split to check the columns added by the tokenizer.
tokenized_datasets_test

Dataset({
    features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})

In [30]:
!pip install accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [31]:
from transformers import AutoModelForMaskedLM
from transformers import Trainer , TrainingArguments , AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from accelerate import Accelerator

acc = Accelerator()

# BERT is trained with a masked-language-modeling objective, so the collator
# must mask tokens (mlm=True). With mlm=False the labels are just a copy of
# the inputs, and the model is never asked to recover a hidden token.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,  # fraction of tokens selected for masking in each batch
)

model = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-uncased")
# NOTE(review): Trainer is expected to handle device placement itself, so this
# prepare() call may be redundant; it is kept to preserve existing behavior.
model = acc.prepare(model)

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
# Release cached GPU memory before the training run starts.
torch.cuda.empty_cache()

# Hyperparameters and checkpoint / evaluation schedule for the run.
training_args = TrainingArguments(
    output_dir="./results_bert",      # where checkpoints and logs are written
    learning_rate=2e-5,
    warmup_steps=500,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",      # evaluate on the validation split every epoch
    save_strategy="epoch",            # write a checkpoint every epoch
)

# Bind the model, the data splits, and the collator to the training loop.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_val,
)

# Run fine-tuning.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Write the model weights and the tokenizer files into the same directory
# so that both can be reloaded together with from_pretrained().
FINE_TUNED_DIR = './fine_tuned_bert'
model.save_pretrained(FINE_TUNED_DIR)
tokenizer.save_pretrained(FINE_TUNED_DIR)


In [None]:
import math

# Loss on the held-out test split.
test_results = trainer.evaluate(tokenized_datasets_test)
print(f"Test Results: {test_results}")

# The dataset has no 'label' column; the language-modeling labels are created
# by the data collator and come back from predict() as `label_ids`.
# NOTE(review): predict() returns logits for every token position, which can
# use a lot of memory on larger test sets — confirm this is acceptable.
prediction_output = trainer.predict(tokenized_datasets_test)
test_predictions = prediction_output.predictions.argmax(axis=-1)
test_labels = prediction_output.label_ids

# Positions labelled -100 are ignored by the loss, so exclude them here too.
scored_positions = test_labels != -100
if scored_positions.any():
    test_accuracy = float(
        (test_predictions[scored_positions] == test_labels[scored_positions]).mean()
    )
else:
    # No scored tokens at all: report NaN instead of dividing by zero.
    test_accuracy = float("nan")

# Perplexity is the exponential of the mean cross-entropy loss.
test_perplexity = math.exp(test_results["eval_loss"])

print(f"Test Accuracy: {test_accuracy}")
print(f"Test Perplexity: {test_perplexity}")

In [None]:
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Reload the saved model and tokenizer and switch the model to inference mode.
fine_tuned_model = AutoModelForMaskedLM.from_pretrained('./fine_tuned_bert')
fine_tuned_tokenizer = AutoTokenizer.from_pretrained('./fine_tuned_bert')
fine_tuned_model.eval()

# A masked language model fills in hidden tokens rather than continuing a
# prompt, so the prompt must contain the tokenizer's mask token.
prompt = f"This software is really {fine_tuned_tokenizer.mask_token}."

encoded_prompt = fine_tuned_tokenizer(prompt, return_tensors='pt')
input_ids = encoded_prompt['input_ids']

with torch.no_grad():
    output = fine_tuned_model(**encoded_prompt)

# Find the masked position and take the five highest-scoring tokens for it.
mask_positions = (input_ids[0] == fine_tuned_tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
top_token_ids = output.logits[0, mask_positions[0]].topk(5).indices.tolist()

candidate_texts = [
    prompt.replace(fine_tuned_tokenizer.mask_token, fine_tuned_tokenizer.decode([token_id]))
    for token_id in top_token_ids
]

# Best-scoring completion first, followed by the remaining candidates.
generated_text = candidate_texts[0]

print(generated_text)
for candidate_text in candidate_texts[1:]:
    print(candidate_text)
