@misc{alpaca,
  author = {Rohan Taori and Ishaan Gulrajani and Tianyi Zhang and Yann Dubois and Xuechen Li and Carlos Guestrin and Percy Liang and Tatsunori B. Hashimoto },
  title = {Stanford Alpaca: An Instruction-following LLaMA model},
  year = {2023},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/tatsu-lab/stanford_alpaca}},
}


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset, DatasetDict
from transformers import pipeline

# Model configuration
model_name = "NousResearch/Llama-2-7b-chat-hf"
token = "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"  

# Load model and tokenizer without quantization
model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=token)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, use_auth_token=token)


# Load and prepare the dataset
full_dataset = load_dataset("yahma/alpaca-cleaned")
split_datasets = full_dataset['train'].train_test_split(test_size=0.2)  # Splitting 20% for testing
train_val_split = split_datasets['train'].train_test_split(test_size=0.1)  # Further splitting the training set for validation

# Wrap splits in a DatasetDict for convenience
dataset_splits = DatasetDict({
    'train': train_val_split['train'],
    'validation': train_val_split['test'],
    'test': split_datasets['test']
})

# Print the column names of the dataset
print(full_dataset.column_names)

# Tokenize the datasets
def tokenize_function(examples):
    concatenated_texts = [instr + " [SEP] " + inp for instr, inp in zip(examples["instruction"], examples["input"])]
    return tokenizer(concatenated_texts, padding="max_length", truncation=True)

tokenized_datasets = dataset_splits.map(tokenize_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",          
    num_train_epochs=3,              
    per_device_train_batch_size=4,   
    per_device_eval_batch_size=8,    
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir="./logs",            
    logging_steps=10,
    evaluation_strategy="epoch",     
    save_strategy="epoch",
    load_best_model_at_end=True,     
)

# Initialize the Trainer with training and validation datasets
trainer = Trainer(
    model=model,                               
    args=training_args,                        
    train_dataset=tokenized_datasets['train'],               
    eval_dataset=tokenized_datasets['validation'],                 
)

# Training and validation
trainer.train()

# Evaluate the model using the Trainer on the test set
test_results = trainer.predict(tokenized_datasets['test'])
print("Test Results:", test_results.metrics)

# Save the model and tokenizer
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

from transformers import pipeline

generation_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Now generate predictions as before
test_texts = [tokenizer.decode(g, skip_special_tokens=True) for g in tokenized_datasets["test"]["input_ids"][:20]]  # Assuming you want to generate for the first 20 examples for demonstration

predictions = [
    generation_pipeline(
        text,
        max_length=50,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.9
    )[0]['generated_text'] for text in test_texts
]

# Printing a few predictions for demonstration
for i, prediction in enumerate(predictions[:5]):
    print(f"Prediction {i+1}: {prediction}\n")





Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



{'train': ['input', 'output', 'instruction']}


Map:   0%|          | 0/37267 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/4141 [00:00<?, ? examples/s]

Map:   0%|          | 0/10352 [00:00<?, ? examples/s]

AttributeError: module 'wandb.proto.wandb_internal_pb2' has no attribute 'Result'