In [2]:
!pip install datasets torch transformers



In [3]:
from datasets import load_dataset

dataset = load_dataset("jamescalam/llama-2-arxiv-papers-chunked" , split='train')
dataset

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 4838
})

In [4]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")

In [5]:
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    return tokenizer(examples['chunk'],padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

tokenized_datasets

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references', 'input_ids', 'attention_mask'],
    num_rows: 4838
})

In [6]:
# Convert tokenized datasets to PyTorch/TensorFlow dataset
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask'])
tokenized_datasets

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references', 'input_ids', 'attention_mask'],
    num_rows: 4838
})

In [7]:
# batch_size = 32

# for i in range(0, len(data), batch_size):
#     i_end = min(len(data), i+batch_size)
#     batch = data.iloc[i:i_end]
#     ids = [f"{x['doi']}-{x['chunk-id']}" for i, x in batch.iterrows()]
#     texts = [x['chunk'] for i, x in batch.iterrows()]
#     tokens = tokenizer(texts , truncation=True)
#     #print(tokens)
#     metadata = [
#         {'text': x['chunk'],
#          'source': x['source'],
#          'title': x['title']} for i, x in batch.iterrows()
#     ]
# metadata

In [8]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

2024-05-25 11:26:40.645020: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-25 11:26:40.645083: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-25 11:26:40.646636: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [9]:
from transformers import GPT2LMHeadModel , Trainer , TrainingArguments
import torch
from accelerate import Accelerator, DataLoaderConfiguration

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
# model = model.to(device)
model.gradient_checkpointing_enable()

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=1,  # Reduce batch size
    gradient_accumulation_steps=8,  # Gradient accumulation
    save_steps=10_000,
    save_total_limit=2,
    fp16=True,  # Mixed precision training
)

# Setup accelerator with new dataloader configuration
accelerator = Accelerator(
    dataloader_config=DataLoaderConfiguration(
        dispatch_batches=None,
        split_batches=False,
        even_batches=True,
        use_seedable_sampler=True
    )
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['attention_mask'],
#     accelerator=accelerator  # Ensure the trainer uses the accelerator
)

# Fine-tune the model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Currently logged in as: [33midrissih394[0m ([33mdr2[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


In [None]:
model.save_pretrained('fine_tuned_gpt2')
tokenizer.save_pretrained('fine_tuned_gpt2')


In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the fine-tuned model and tokenizer
fine_tuned_model = GPT2LMHeadModel.from_pretrained('fine_tuned_gpt2')
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained('fine_tuned_gpt2')

# Define your prompt
prompt = "Once upon a time in a land far, far away,"

# Encode the prompt
input_ids = fine_tuned_tokenizer.encode(prompt, return_tensors='pt')

# Generate text
output = fine_tuned_model.generate(
    input_ids, 
    max_length=100,
    num_return_sequences=1,
    temperature=1.0,
    top_k=50,
    top_p=0.95,
    do_sample=True
)

# Decode the generated text
generated_text = fine_tuned_tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)


OSError: fine_tuned_gpt2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`