<a href="https://colab.research.google.com/github/NAGADEEP92/AI-ML-DS/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip3 install transformers==4.25.1 datasets==2.4.0 nvidia-ml-py3==7.352.0 torchvision

In [None]:
from datasets import load_dataset
from transformers import (
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    GPT2ForSequenceClassification,
    GPT2Tokenizer,
)
import torch.nn.functional as F
import torch
from pynvml import (
    nvmlInit,
    nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetMemoryInfo,
    nvmlDeviceGetCount,
    nvmlDeviceGetName,
)
import random
import numpy as np
from PIL import Image
import os
from tqdm import tqdm

# Managing Large Datasets

## Generate Sample Image Data

In [None]:
# Function to generate and save synthetic images
def create_synthetic_images(num_images, image_size, save_dir):
    """Generate random RGB images and save them as PNG files.

    Args:
        num_images: Number of images to write.
        image_size: Side length in pixels; every image has shape
            (image_size, image_size, 3) with uint8 values in [0, 255].
        save_dir: Output directory. It is created when missing; files are
            written as ``image_<i>.png`` and existing files with the same
            name are overwritten.
    """
    # exist_ok=True replaces the check-then-create pattern, which can fail if
    # the directory appears between the check and the makedirs call.
    os.makedirs(save_dir, exist_ok=True)

    for i in tqdm(range(num_images), total=num_images):
        # Uniform random pixel values; the upper bound 256 is exclusive.
        pixels = np.random.randint(0, 256, (image_size, image_size, 3), dtype=np.uint8)

        # Convert to a PIL image and write it to disk.
        Image.fromarray(pixels).save(os.path.join(save_dir, f'image_{i}.png'))

# Settings for the synthetic data set (later cells read save_dir as well).
num_images = 100               # how many images to generate
image_size = 224               # images are square: 224x224 pixels
save_dir = 'synthetic_images'  # output folder for the PNG files

# Write the images to disk.
create_synthetic_images(
    num_images=num_images,
    image_size=image_size,
    save_dir=save_dir,
)

100%|██████████| 100/100 [00:01<00:00, 79.04it/s]


## Naive Approach

In [None]:
# List of image paths. os.listdir returns entries in arbitrary order, so sort
# them to make the path list reproducible between runs.
image_paths = [os.path.join(save_dir, f) for f in sorted(os.listdir(save_dir))]

# Naive approach: decode every image up front and keep all of them in memory.
all_images = []
for f in tqdm(image_paths):
    # Image.open keeps the file handle open until the image is closed; the
    # context manager releases it once the pixel data has been copied out.
    with Image.open(f) as image:
        all_images.append(np.array(image.getdata()))

100%|██████████| 100/100 [00:03<00:00, 25.71it/s]


## Using a DataLoader

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import os

class CustomImageDataset(Dataset):
    """Dataset that decodes images lazily, one file per requested index.

    Only the list of paths is stored; pixel data is read from disk inside
    ``__getitem__`` each time a sample is requested.
    """

    def __init__(self, image_paths):
        """Store the sequence of image file paths to serve."""
        self.image_paths = image_paths

    def __len__(self):
        """Return the number of image paths."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Open the image at position ``idx`` and return its pixel data as a NumPy array."""
        # Close the file as soon as the pixels are copied; without the context
        # manager every fetched sample leaves an open file handle behind.
        with Image.open(self.image_paths[idx]) as image:
            return np.array(image.getdata())


# Wrap the path list in the lazy dataset, then serve it in shuffled
# batches of 32 samples.
dataset = CustomImageDataset(image_paths=image_paths)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=32)

In [None]:
# Show what each batch looks like without dumping every pixel value:
# printing whole tensors floods the notebook output and hides the batch shape.
for data in dataloader:
    print(tuple(data.shape), data.dtype)

tensor([[[  4, 210, 216],
         [189, 134, 123],
         [185,  20, 201],
         ...,
         [161, 110,  88],
         [178, 184, 187],
         [ 49, 137, 242]],

        [[104, 195,  78],
         [229, 186, 159],
         [119,  96, 130],
         ...,
         [204, 100,  80],
         [ 10, 242, 128],
         [ 10, 106,  94]],

        [[ 48,  15, 126],
         [176, 153, 183],
         [170, 227, 157],
         ...,
         [ 54,  81, 209],
         [ 36, 170, 168],
         [ 45, 203, 219]],

        ...,

        [[107,   1,  20],
         [  2, 166,  84],
         [103,  71, 159],
         ...,
         [249, 247, 163],
         [141, 136,  21],
         [ 62,  64,  34]],

        [[130,  56, 209],
         [139, 202,   4],
         [230, 255, 134],
         ...,
         [ 92, 220,  43],
         [134, 178,  71],
         [170, 183, 185]],

        [[234,   1, 165],
         [208, 161,  73],
         [246, 188, 226],
         ...,
         [ 75,  43, 232],
        

## Compare sizes

In [None]:
import sys

def _deep_getsizeof(root):
    """Approximate the bytes held by ``root`` and everything reachable from it.

    Follows dict keys/values, list/tuple/set/frozenset items and instance
    ``__dict__`` attributes. Each object is counted once (by identity), so
    shared references and cycles are safe. Classes and modules are counted
    shallowly and not traversed.
    """
    seen = {}  # id -> object; holding the object keeps its id from being reused
    pending = [root]
    total = 0
    while pending:
        obj = pending.pop()
        if id(obj) in seen:
            continue
        seen[id(obj)] = obj

        size = sys.getsizeof(obj, 0)
        # Buffer-like objects (e.g. array views) may not include their data
        # in getsizeof; use the reported payload size when it is larger.
        nbytes = getattr(obj, "nbytes", None)
        if isinstance(nbytes, int):
            size = max(size, nbytes)
        total += size

        if isinstance(obj, dict):
            pending.extend(obj.keys())
            pending.extend(obj.values())
        elif isinstance(obj, (list, tuple, set, frozenset)):
            pending.extend(obj)
        elif not isinstance(obj, (type, type(sys))):
            attrs = getattr(obj, "__dict__", None)
            if isinstance(attrs, dict):
                pending.append(attrs)
    return total


def sizeof_fmt(variable, suffix="B"):
    """Return the in-memory size of ``variable`` as a human-readable string.

    Formatting by Fred Cirera, https://stackoverflow.com/a/1094933/1870254, modified.

    ``sys.getsizeof`` alone is shallow: for a list it reports only the list
    object, not the elements it refers to. The size is therefore summed over
    the container's contents so that a list of arrays reports the memory the
    arrays actually occupy.

    Args:
        variable: Object to measure.
        suffix: Unit suffix appended after the binary prefix (default "B").

    Returns:
        A string such as "4.0 MiB", using 1024-based prefixes.
    """
    num = _deep_getsizeof(variable)
    for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, "Yi", suffix)


# Report the formatted size of the eagerly loaded list and of the DataLoader.
for label, measured in (
    ('Size of all images:', all_images),
    ('Size of DataLoader:', dataloader),
):
    print(label, sizeof_fmt(measured))

Size of all images: 920.0 B
Size of DataLoader: 48.0 B


# GPU Benchmarking Utilities

In [None]:
def print_gpu_utilization():
    """Print the name and used memory of every GPU reported by NVML.

    After reporting, PyTorch's cached CUDA memory is released via
    ``torch.cuda.empty_cache()``, so the cache is cleared after (not before)
    the numbers are printed.
    """
    nvmlInit()
    for i in range(nvmlDeviceGetCount()):
        handle = nvmlDeviceGetHandleByIndex(i)
        info = nvmlDeviceGetMemoryInfo(handle)
        name = nvmlDeviceGetName(handle)
        # Some pynvml releases return the name as bytes; decode it so the
        # output reads "Tesla T4" instead of "b'Tesla T4'".
        if isinstance(name, bytes):
            name = name.decode("utf-8", errors="replace")
        print("Device", i, ":", name)
        # info.used is divided by 1024**2 and labelled "MB" in the output.
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    torch.cuda.empty_cache()


def print_summary(result):
    """Print runtime and throughput of a training run, then GPU memory use.

    Args:
        result: Object with a ``metrics`` mapping containing the keys
            ``train_runtime`` and ``train_samples_per_second`` (as returned
            by ``trainer.train()`` in this notebook).
    """
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    # Guard the GPU report like the other call sites in this notebook so a
    # CPU-only run still gets its timing summary.
    if torch.cuda.is_available():
        print_gpu_utilization()

In [None]:
# Choose the compute device: CUDA when PyTorch can see a GPU, otherwise CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Using device", DEVICE)

# Report GPU memory before and after moving a tiny tensor to the device, so
# the difference between the two readings is visible.
if DEVICE.type == "cuda":
    print_gpu_utilization()
    torch.ones((1, 1)).to(DEVICE)
    print_gpu_utilization()

Using device cuda
Device 0 : b'Tesla T4'
GPU memory occupied: 261 MB.
Device 0 : b'Tesla T4'
GPU memory occupied: 893 MB.


# Load dataset and Model

In [None]:
# Checkpoint identifier and local download cache shared by tokenizer and model.
MODEL_NAME = "microsoft/DialogRPT-updown"
CACHE_DIR = "./cache/"

tokenizer = GPT2Tokenizer.from_pretrained(
    MODEL_NAME,
    model_max_length=1024,
    cache_dir=CACHE_DIR,
)

# The checkpoint's scoring head has a different shape than a 2-label head
# (see the warning in the recorded output), hence ignore_mismatched_sizes.
model = GPT2ForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    ignore_mismatched_sizes=True,
    cache_dir=CACHE_DIR,
)
model = model.to(DEVICE)
model.config.use_cache = False

# Show how much GPU memory is in use once the model has been moved to the device.
if torch.cuda.is_available():
    print_gpu_utilization()

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at microsoft/DialogRPT-updown and are newly initialized because the shapes did not match:
- score.weight: found shape torch.Size([1, 1024]) in the checkpoint and torch.Size([2, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Device 0 : b'Tesla T4'
GPU memory occupied: 2271 MB.


In [None]:
def _tokenize_batch(batch):
    """Tokenize the "text" column of a batch, truncating and padding to max length."""
    return tokenizer(batch["text"], truncation=True, padding="max_length")


# Use only the first 25 examples of each split.
split = ["train[:25]", "test[:25]"]
raw_train, raw_test = load_dataset("imdb", split=split, cache_dir="./cache/")

# Inspect one example: dataset size, raw text, label and token ids.
sample = raw_train[2]
print(len(raw_train))
print(sample["text"])
print(sample["label"])
print(tokenizer.encode(sample["text"]))

# Apply the same batched tokenization to both splits.
train = raw_train.map(_tokenize_batch, batched=True)
test = raw_test.map(_tokenize_batch, batched=True)

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to ./cache/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to ./cache/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

25
If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.<br /><br />One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).<br /><br />One might better spend one's time staring out a window at a tree growing.<br /><br />
0
[1532, 691, 284, 3368, 1642, 428, 2099, 286, 2646, 287, 262, 2003, 13, 770, 2646, 318, 3499, 355, 281, 6306, 475, 4952, 645, 43072, 298, 1621, 29847, 1671, 1220, 6927, 1671, 11037, 3198, 1244, 1254, 41276, 329, 5586, 33834, 340, 780, 340, 18105, 319, 523, 867, 30023, 9863, 8643, 2428, 475, 340, 857, 523, 1231, 597, 22024, 540, 20289, 13, 383, 19091, 2058, 1497, 351, 645, 649, 22582, 357, 25252, 530, 2058, 510, 351, 530, 981, 530, 338, 2000, 11569, 364, 11, 355, 

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

# Normal Training

In [None]:
# Baseline configuration; later cells add options to this same dict.
# NOTE(review): the recorded logs of later runs show eval_steps defaulting to
# 500 while this run has 25 optimization steps, so no evaluation step appears
# to be reached - confirm whether that is intended.
default_args = dict(
    output_dir="tmp",
    evaluation_strategy="steps",
    num_train_epochs=1,
    per_device_train_batch_size=1,
)

training_args = TrainingArguments(**default_args)
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train,
    eval_dataset=test,
)

# Train, then print runtime, throughput and GPU memory.
result = trainer.train()
print_summary(result)

The following columns in the training set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 25
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 25
  Number of trainable parameters = 354825216


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




Time: 23.99
Samples/second: 1.04
Device 0 : b'Tesla T4'
GPU memory occupied: 11821 MB.


# Gradient Accumulation

In [None]:
# Accumulate gradients over 4 steps on top of the baseline settings.
# NOTE(review): this reuses the `model` object already trained by the
# previous cell rather than a freshly loaded one.
default_args.update(gradient_accumulation_steps=4)

training_args = TrainingArguments(**default_args)
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train,
    eval_dataset=test,
)

# Train, then print runtime, throughput and GPU memory.
result = trainer.train()
print_summary(result)

using `logging_steps` to initialize `eval_steps` to 500
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 25
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 6
  Number of trainable parameters = 354825216


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




Time: 18.92
Samples/second: 1.32
Device 0 : b'Tesla T4'
GPU memory occupied: 12963 MB.


# Gradient Checkpointing

In [None]:
# Enable gradient checkpointing in addition to the options set so far.
# NOTE(review): this reuses the `model` object already trained by the
# previous cells rather than a freshly loaded one.
default_args.update(gradient_checkpointing=True)

training_args = TrainingArguments(**default_args)
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train,
    eval_dataset=test,
)

# Train, then print runtime, throughput and GPU memory.
result = trainer.train()
print_summary(result)

using `logging_steps` to initialize `eval_steps` to 500
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 25
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 6
  Number of trainable parameters = 354825216


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




Time: 24.90
Samples/second: 1.00
Device 0 : b'Tesla T4'
GPU memory occupied: 7311 MB.


# Mixed Precision

In [None]:
# Enable fp16 mixed precision in addition to the options set so far.
# NOTE(review): this reuses the `model` object already trained by the
# previous cells rather than a freshly loaded one.
default_args.update(fp16=True)

training_args = TrainingArguments(**default_args)
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train,
    eval_dataset=test,
)

# Train, then print runtime, throughput and GPU memory.
result = trainer.train()
print_summary(result)

using `logging_steps` to initialize `eval_steps` to 500
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 25
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 6
  Number of trainable parameters = 354825216


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




Time: 16.66
Samples/second: 1.50
Device 0 : b'Tesla T4'
GPU memory occupied: 7729 MB.
