<a href="https://colab.research.google.com/github/Mahmood-Anaam/Violet/blob/main/notebooks/features_extraction_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Features Extraction Demo



---



In [4]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The checkpoint path and the feature output directory configured in
# VioletConfig both point under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Violet Install

In [1]:
# Clone the Violet repository under /content and install it in editable mode.
%cd /content
!git clone https://github.com/Mahmood-Anaam/Violet.git
%cd /content/Violet
!pip install -e . --quiet
# Ask the running kernel to shut down with restart=True.
# NOTE(review): presumably needed so the freshly installed package can be
# imported by the following cells — confirm. Cells below must be run again
# after the restart.
import IPython
app = IPython.Application.instance()
_=app.kernel.do_shutdown(True)

/content
Cloning into 'Violet'...
remote: Enumerating objects: 227, done.[K
remote: Counting objects: 100% (227/227), done.[K
remote: Compressing objects: 100% (153/153), done.[K
remote: Total 227 (delta 87), reused 178 (delta 49), pack-reused 0 (from 0)[K
Receiving objects: 100% (227/227), 9.15 MiB | 17.98 MiB/s, done.
Resolving deltas: 100% (87/87), done.
/content/Violet
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.9/44.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m473.6/473.6 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Import all libraries

In [2]:
import warnings
warnings.filterwarnings("ignore")

from PIL import Image
import requests
import torch
import numpy as np

from transformers import AutoTokenizer
from violet.modeling import Violet,VisualEncoder, ScaledDotProductAttention
from violet.pipeline import VioletImageCaptioningPipeline
from violet.configuration import VioletConfig

## Configuration



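The ranges listed in `SIZE_SEGMENTS` below split the dataset into `num_segments` contiguous, nearly equal index ranges. They can be computed as follows:

```python
dataset_size , num_segments = 5046 , 2
segment_size, remainder = divmod(dataset_size, num_segments)
size_segments = [range(i * segment_size + min(i, remainder), (i + 1) * segment_size + min(i + 1, remainder)) for i in range(num_segments)]
size_segments
```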



In [3]:
import torch

class VioletConfig:
  """
  Notebook-wide settings for the Violet pipeline.

  Every option is a plain class attribute, so values can be read straight
  from the class without creating an instance.
  """

  # ---------------- General settings ----------------
  # Path to the pretrained model
  CHECKPOINT_DIR = "/content/drive/MyDrive/Violet_checkpoint_0.pth"
  # Device for computation: GPU when CUDA is available, CPU otherwise
  DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
  # Tokenizer model name
  TOKENIZER_NAME = "UBC-NLP/Jasmine-350M"
  # Processor model name
  PROCESSOR_NAME = "openai/clip-vit-large-patch14"

  # ---------------- Model settings ----------------
  # Number of layers in the visual encoder
  ENCODER_LAYERS = 3
  # Number of layers in the decoder
  DECODER_LAYERS = 12
  # Temperature parameter for the softmax function
  TAU = 0.3

  # ---------------- Generation settings ----------------
  # Maximum length of generated sequences
  MAX_LENGTH = 40
  # Beam size for beam search decoding
  BEAM_SIZE = 5
  # Number of output sequences to generate
  OUT_SIZE = 3

  # ---------------- Dataset settings ----------------
  # Path or name of the dataset
  DATASET_PATH = "MahmoodAnaam/ok-vqa-ar-en-2"
  # Language for questions/answers ("ar" for Arabic, "en" for English)
  LANGUAGE = "ar"
  # Dataset split to use ("train", "validation", "test")
  SPLIT = "validation"
  # Directory to save extracted features
  SAVE_SEGMENT_DIR = "/content/drive/MyDrive/OKVQA_Violet_Features_Checkpoint"

  # Index ranges of the dataset segments: segment 0 first, then segment 1
  SIZE_SEGMENTS = [range(0, 2523), range(2523, 5046)]

  # Index of the current segment being processed
  CURRENT_INDEX_SEGMENT = 0
  # Username for Hugging Face
  USERNAME = "MahmoodAnaam"


## Dataset

In [6]:
# Install the `datasets` package (no version pinned) that provides load_dataset.
!pip install datasets --quiet

In [7]:
from huggingface_hub import login
from google.colab import userdata
from datasets import load_dataset

In [8]:
# Login to Hugging Face
HF_TOKEN = userdata.get('HF_TOKEN')
login(token=HF_TOKEN)

In [9]:
# Load only the configured split of the configured dataset.
# NOTE(review): trust_remote_code=True permits dataset-provided code to run
# locally — confirm it is still required for this dataset.
dataset = load_dataset(VioletConfig.DATASET_PATH,split=VioletConfig.SPLIT,trust_remote_code=True)
dataset

README.md:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

train-00000-of-00004.parquet:   0%|          | 0.00/373M [00:00<?, ?B/s]

train-00001-of-00004.parquet:   0%|          | 0.00/367M [00:00<?, ?B/s]

train-00002-of-00004.parquet:   0%|          | 0.00/368M [00:00<?, ?B/s]

train-00003-of-00004.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

validation-00000-of-00002.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

validation-00001-of-00002.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9009 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5046 [00:00<?, ? examples/s]

Dataset({
    features: ['metadata', 'image', 'question', 'answers'],
    num_rows: 5046
})

In [10]:
# Print the first record to inspect its fields (metadata, image, question, answers).
print(dataset[0])

{'metadata': {'image_id': 297147, 'question_id': 2971475, 'question_type': 'one', 'answer_type': 'other', 'confidence': 3}, 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480 at 0x78E4E04F8D30>, 'question': {'en': 'What sport can you use this for?', 'ar': 'في أي رياضة يمكنك استخدام هذا؟'}, 'answers': {'en': ['race', 'race', 'race', 'race', 'race', 'race', 'motocross', 'motocross', 'ride', 'ride'], 'ar': ['سباق', 'سباق', 'سباق', 'سباق', 'سباق', 'سباق', 'موتوكروس', 'موتوكروس', 'يركب', 'يركب'], 'raw_en': ['racing', 'racing', 'racing', 'racing', 'racing', 'racing', 'motocross', 'motocross', 'riding', 'riding'], 'raw_ar': ['سباق', 'سباق', 'سباق', 'سباق', 'سباق', 'سباق', 'موتوكروس', 'موتوكروس', 'يركب', 'يركب'], 'confidence': ['yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes'], 'id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}}


## Extract Features

In [11]:
from violet.pipeline import VioletImageCaptioningPipeline

# Build the captioning pipeline from the VioletConfig class defined in this
# notebook; the class object itself is passed, not an instance.
pipeline = VioletImageCaptioningPipeline(VioletConfig)

tokenizer_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.51M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.56M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/593M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/593M [00:00<?, ?B/s]

In [13]:
def extract_features(examples):
  """
  Batched `Dataset.map` callback that adds a "features" column.

  Args:
    examples: Mapping of column name -> list of values for one batch. It
      must contain an "image" column, which is passed to
      `pipeline.generate_features`.

  Returns:
    The same `examples` mapping with a "features" entry added. If extraction
    raises, the error is printed and "features" holds one `None` per row of
    the batch, so the rest of the map can continue.
  """
  try:
    examples["features"] = pipeline.generate_features(examples["image"])
  except Exception as e:
    # `len(examples)` would count columns, not rows; size the placeholder by
    # the number of images so the new column matches the batch length.
    examples["features"] = [None] * len(examples["image"])
    print(f"Error while extracting features: {e}")
  return examples


# ..................................................

def features_captions(examples):
  """
  Batched `Dataset.map` callback that adds "features" and "captions" columns.

  Features are produced by `pipeline.generate_features` from the "image"
  column, and captions by `pipeline.generate_captions_from_features` from
  those features.

  Args:
    examples: Mapping of column name -> list of values for one batch. It
      must contain an "image" column.

  Returns:
    The same `examples` mapping with "features" and "captions" entries. If
    either step raises, the error is printed and both columns hold one
    `None` per row of the batch.
  """
  try:
    features = pipeline.generate_features(examples["image"])
    captions = pipeline.generate_captions_from_features(features)
    examples["features"] = features
    examples["captions"] = captions
  except Exception as e:
    # `len(examples)` would count columns (and changes as columns are added),
    # so size both placeholders by the number of images in the batch.
    batch_size = len(examples["image"])
    examples["captions"] = [None] * batch_size
    examples["features"]  = [None] * batch_size
    print(f"Error while extracting features: {e}")
  return examples

# ..................................................

# Index range of the segment selected for this run.
rng = VioletConfig.SIZE_SEGMENTS[VioletConfig.CURRENT_INDEX_SEGMENT]

# Restrict the dataset to that range and attach features and captions,
# processing 20 examples per call.
ds = dataset.select(rng).map(features_captions, batched=True, batch_size=20)

# Save the processed segment; the directory name carries the first and last
# index of the range.
checkpoint_dir = "{}_{}_{}".format(VioletConfig.SAVE_SEGMENT_DIR, min(rng), max(rng))
ds.save_to_disk(checkpoint_dir)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]

In [14]:
# List the fields of the first processed example, including the new 'features' and 'captions'.
ds[0].keys()

dict_keys(['metadata', 'image', 'question', 'answers', 'features', 'captions'])

In [15]:
# Sizes of the stored features for the first example: the outer length and
# the length of the innermost vector.
len(ds[0]['features']),len(ds[0]['features'][0][0])

(3, 768)

In [16]:
# Captions generated for the first example.
ds[0]['captions']

[{'caption': ' دراجة نارية سوداء وفضية متوقفة في موقف للسيارات'},
 {'caption': ' دراجة نارية سوداء وفضية متوقفة في الكثير'},
 {'caption': ' دراجة نارية بيضاء وسوداء متوقفة في موقف للسيارات'}]



---



## Hugging Face Hub

In [25]:
from datasets import load_from_disk

# Rebuild the segment range and checkpoint path from the config, then reload
# the saved segment from disk.
# NOTE(review): presumably repeated here so this section can run without
# re-running the extraction cell — confirm.
rng = VioletConfig.SIZE_SEGMENTS[VioletConfig.CURRENT_INDEX_SEGMENT]

checkpoint_dir = f"{VioletConfig.SAVE_SEGMENT_DIR}_{min(rng)}_{max(rng)}"
ds = load_from_disk(checkpoint_dir)
ds

Dataset({
    features: ['metadata', 'image', 'question', 'answers', 'features', 'captions'],
    num_rows: 4
})

In [26]:
from huggingface_hub import Repository, create_repo

# Create (or reuse, via exist_ok) a private dataset repo on Hugging Face Hub,
# named after the last path component of the checkpoint directory.
username = VioletConfig.USERNAME
repo_name = checkpoint_dir.rsplit("/", 1)[-1]

repo_id = "/".join([username, repo_name])
repo_url = create_repo(
    repo_id=repo_id,
    repo_type="dataset",
    private=True,
    exist_ok=True,
)
print(f"Dataset repository URL: {repo_url}")

Dataset repository URL: https://huggingface.co/datasets/MahmoodAnaam/OKVQA_Violet_Features_Checkpoint_0_3


In [27]:
# Push the dataset to the repo
ds.push_to_hub(repo_id=repo_id)
print(f"Dataset pushed to {repo_id}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset pushed to MahmoodAnaam/OKVQA_Violet_Features_Checkpoint_0_3


In [28]:
from datasets import load_dataset

# Load the configured split back from the Hub repo as a round-trip check.
# Note that this rebinds `ds` to the downloaded copy.
ds = load_dataset(repo_id,split=VioletConfig.SPLIT)
ds

README.md:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/261k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/4 [00:00<?, ? examples/s]

Dataset({
    features: ['metadata', 'image', 'question', 'answers', 'features', 'captions'],
    num_rows: 4
})

In [29]:
# Display the whole 'captions' column of the reloaded segment.
ds['captions']

[[{'caption': ' دراجة نارية سوداء وفضية متوقفة في موقف للسيارات'},
  {'caption': ' دراجة نارية سوداء وفضية متوقفة في الكثير'},
  {'caption': ' دراجة نارية بيضاء وسوداء متوقفة في موقف للسيارات'}],
 [{'caption': ' حمام مع حوض غسيل ومرحاض'},
  {'caption': ' حمام مع حوض غسيل ومرايا'},
  {'caption': ' حمام مع حوض غسيل ومرايا كبيرة'}],
 [{'caption': ' مجموعة من الناس يجلسون معاً'},
  {'caption': ' رجل يحمل دبّة دمية في حضنه'},
  {'caption': ' رجل يحمل دبّة دمية بينما يجلس على مقعد'}],
 [{'caption': ' رجل يحمل كلباً أبيض وأسود'},
  {'caption': ' كلب أبيض وأسود يقف على ظهر رجل'},
  {'caption': ' رجل يحمل فريزبي أبيض فوق كلب أبيض وأسود'}]]



---



## Merge Dataset Segments

In [None]:
from datasets import load_dataset

username = VioletConfig.USERNAME
# Last path component of the save directory; shared prefix of every segment repo name.
base_name = VioletConfig.SAVE_SEGMENT_DIR.split('/')[-1]

# One Hub repo id per configured segment:
# "<username>/<base_name>_<first index>_<last index>".
repos_names = [ f"{username}/{base_name}_{min(rng)}_{max(rng)}" for rng in VioletConfig.SIZE_SEGMENTS]

# Load the configured split from every segment repo, in segment order.
datasets = [load_dataset(repo_id,split=VioletConfig.SPLIT) for repo_id in repos_names]
datasets


[Dataset({
     features: ['metadata', 'image', 'question', 'answers', 'multiple_choice_answer', 'features', 'captions'],
     num_rows: 4
 })]

In [None]:
from datasets import concatenate_datasets

# Concatenate the per-segment datasets into one dataset, passing the
# configured split name along.
merged_dataset = concatenate_datasets(datasets,split=VioletConfig.SPLIT)
merged_dataset

Dataset({
    features: ['metadata', 'image', 'question', 'answers', 'multiple_choice_answer', 'features', 'captions'],
    num_rows: 4
})

In [None]:
from huggingface_hub import Repository, create_repo

# Create a repo on Hugging Face Hub
# The repo name is the text before the first underscore in the last path
# component of SAVE_SEGMENT_DIR, followed by "-Encoder-Violet-Captions".
# exist_ok=True lets this cell be re-run when the repo already exists.
username = VioletConfig.USERNAME
repo_id=f"{username}/{VioletConfig.SAVE_SEGMENT_DIR.split('/')[-1].split('_')[0]}-Encoder-Violet-Captions"
repo_url = create_repo(repo_id=repo_id, private=True, exist_ok=True,repo_type="dataset")
print(f"Dataset repository URL: {repo_url}")

Dataset repository URL: https://huggingface.co/datasets/MahmoodAnaam/VQAv2-VinVL-BiT-Captions


In [None]:
# Push the dataset to the repo
merged_dataset.push_to_hub(repo_id=repo_id)
print(f"Dataset pushed to {repo_id}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset pushed to MahmoodAnaam/VQAv2-VinVL-BiT-Captions


In [None]:
from datasets import load_dataset

# Load the merged repo back from the Hub. No split is requested here, so the
# result is a DatasetDict keyed by split name rather than a single Dataset.
ds = load_dataset(repo_id)
ds

README.md:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/4 [00:00<?, ? examples/s]

DatasetDict({
    validation: Dataset({
        features: ['metadata', 'image', 'question', 'answers', 'multiple_choice_answer', 'features', 'captions'],
        num_rows: 4
    })
})