<a href="https://colab.research.google.com/github/Latesh-31/PROGIDY_WD_01/blob/main/Untitled8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers torch datasets accelerate wandb


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [4]:
import torch
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    pipeline
)
from datasets import Dataset
import pandas as pd
import json
from torch.utils.data import DataLoader
import numpy as np

class GPT2FineTuner:
    def __init__(self, model_name="gpt2", max_length=512):
        self.model_name = model_name
        self.max_length = max_length
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def prepare_dataset(self, texts):
        """
        Prepare dataset for training
        texts: list of strings containing your training data
        """
        dataset = Dataset.from_dict({"text": texts})

        def tokenize_function(examples):
            tokenized = self.tokenizer(
                examples["text"],
                truncation=True,
                padding="max_length",
                max_length=self.max_length,
                return_tensors="pt"
            )
            tokenized["labels"] = tokenized["input_ids"].clone()
            return tokenized

        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=dataset.column_names
        )

        return tokenized_dataset

    def fine_tune(self, train_dataset, eval_dataset=None, output_dir="./fine-tuned-gpt2"):
        """
        Fine-tune the model
        """
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )

        training_args = TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=True,
            num_train_epochs=3,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            warmup_steps=100,
            logging_steps=50,
            save_steps=500,
            eval_steps=500,
            eval_strategy="steps" if eval_dataset else "no",
            save_total_limit=2,
            prediction_loss_only=True,
            learning_rate=5e-5,
            weight_decay=0.01,
            logging_dir='./logs',
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
        )

        trainer.train()

        trainer.save_model()
        self.tokenizer.save_pretrained(output_dir)

        return trainer

    def generate_text(self, prompt, max_length=100, num_return_sequences=1, temperature=0.8):
        """
        Generate text using the fine-tuned model
        """
        generator = pipeline(
            'text-generation',
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if torch.cuda.is_available() else -1
        )

        generated = generator(
            prompt,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            temperature=temperature,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id
        )

        return generated

def main():
    training_texts = [
        "The future of artificial intelligence is bright and full of possibilities.",
        "Machine learning algorithms continue to evolve and improve daily.",
        "Natural language processing has revolutionized how we interact with computers.",
        "Deep learning models can now understand context and generate human-like text.",
        "The transformer architecture has been a breakthrough in AI research.",
    ]

    fine_tuner = GPT2FineTuner(model_name="gpt2")

    train_dataset = fine_tuner.prepare_dataset(training_texts)

    train_size = int(0.9 * len(train_dataset))
    eval_size = len(train_dataset) - train_size
    train_dataset, eval_dataset = torch.utils.data.random_split(
        train_dataset, [train_size, eval_size]
    )

    trainer = fine_tuner.fine_tune(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        output_dir="./my-fine-tuned-gpt2"
    )

    prompt = "The future of AI"
    generated_texts = fine_tuner.generate_text(
        prompt=prompt,
        max_length=150,
        num_return_sequences=3,
        temperature=0.8
    )

    print(f"Prompt: {prompt}")
    print("Generated texts:")
    for i, text in enumerate(generated_texts):
        print(f"\n{i+1}. {text['generated_text']}")

if __name__ == "__main__":
    main()

Map:   0%|          | 0/5 [00:00<?, ? examples/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlatesh312006[0m ([33mlatesh312006-sri-shakthi-institute-of-engineering-and-te[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Prompt: The future of AI
Generated texts:

1. The future of AI is an open question, but I don't think there's much question about what it would be like if not, when it doesn't just has to meet the needs of the human being. It is a lot of a deal, and it is a huge investment to help, and on top of, it would be amazing and I think it is a real thing and I think people and governments and people in their way of the way in their way of doing things. I think this is a big deal for the world and I think it would be really interesting and I think if there is a chance to get a solution and get it to work and then it would be a huge deal for people to go with it and that's great.

Q: What does it mean to the world to get such a big deal?

A: It means this is a huge deal for the world. I think it is absolutely the great thing to be able to get this big world together and to have a big deal to have a deal of this great deal. I think I've never been involved in this and I think that's really exciti

In [6]:
class AdvancedGPT2FineTuner(GPT2FineTuner):
    def __init__(self, model_name="gpt2", max_length=512, use_gradient_checkpointing=True):
        super().__init__(model_name, max_length)

        # Enable gradient checkpointing to save memory
        if use_gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

    def create_training_args(self, output_dir, epochs=3, batch_size=4, learning_rate=5e-5):
        """
        Create customized training arguments
        """
        return TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=True,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            gradient_accumulation_steps=2,
            warmup_steps=100,
            logging_steps=10,
            save_steps=500,
            eval_steps=500,
            evaluation_strategy="steps",
            save_total_limit=2,
            learning_rate=learning_rate,
            weight_decay=0.01,
            fp16=torch.cuda.is_available(),  # Use mixed precision if CUDA available
            dataloader_pin_memory=True,
            remove_unused_columns=False,
            report_to="wandb",  # For experiment tracking
        )
