<a href="https://colab.research.google.com/github/Mahmood-Anaam/Violet/blob/main/notebooks/features_extraction_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

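# Feature Extraction Demo

Extracts VinVL image features and BiT Arabic captions for the VQAv2 validation split, one segment at a time, and publishes the results to the Hugging Face Hub.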



---



In [1]:
# Mount Google Drive at /content/drive; BiTConfig.save_segment_dir points below this
# mount point, so the extracted segments are written to Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## BiT Install

In [2]:
# Clone the BiT-ImageCaptioning repository into /content and install it in editable mode.
%cd /content
!git clone https://github.com/Mahmood-Anaam/BiT-ImageCaptioning.git
%cd /content/BiT-ImageCaptioning
!pip install -e . --quiet
# Ask the running kernel to shut down, passing True (presumably the restart flag, so the
# freshly installed package is importable afterwards — TODO confirm).
# NOTE(review): every cell below has to be run again after this one.
import IPython
app = IPython.Application.instance()
_=app.kernel.do_shutdown(True)

/content
Cloning into 'BiT-ImageCaptioning'...
remote: Enumerating objects: 760, done.[K
remote: Counting objects: 100% (134/134), done.[K
remote: Compressing objects: 100% (111/111), done.[K
remote: Total 760 (delta 75), reused 60 (delta 23), pack-reused 626 (from 1)[K
Receiving objects: 100% (760/760), 609.09 KiB | 8.34 MiB/s, done.
Resolving deltas: 100% (321/321), done.
/content/BiT-ImageCaptioning
  Preparing metadata (setup.py) ... [?25l[?25hdone


## Import all libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import torch
from PIL import Image
from bit_image_captioning.feature_extractors.vinvl import VinVLFeatureExtractor
from bit_image_captioning.pipelines.bert_pipeline import BiTImageCaptioningPipeline
from bit_image_captioning.datasets.ok_vqa_dataset import OKVQADataset
from bit_image_captioning.datasets.ok_vqa_dataloader import OKVQADataLoader

## Configuration



```
# Split the dataset row indices into `num_segments` contiguous ranges.
dataset_size, num_segments = 214354, 10
segment_size, remainder = divmod(dataset_size, num_segments)
# The first `remainder` segments get one extra row, so sizes differ by at most one.
# Iterate over `num_segments` (not a literal 10) so the two values cannot drift apart.
size_segments = [
    range(i * segment_size + min(i, remainder), (i + 1) * segment_size + min(i + 1, remainder))
    for i in range(num_segments)
]
size_segments
```



In [1]:
import torch
def _split_into_segments(total, parts):
    """
    Split ``range(total)`` into ``parts`` contiguous ranges.

    The first ``total % parts`` ranges hold one extra element, so segment
    sizes differ by at most one and together cover every index exactly once.

    Args:
        total: number of rows to split (non-negative).
        parts: number of segments to produce (positive).

    Returns:
        A list of ``parts`` ``range`` objects in ascending order.

    Raises:
        ValueError: if ``parts`` is not positive or ``total`` is negative.
    """
    if parts <= 0:
        raise ValueError(f"parts must be positive, got {parts}")
    if total < 0:
        raise ValueError(f"total must be non-negative, got {total}")
    base, extra = divmod(total, parts)
    # bounds[i] is the first row index of segment i; bounds[parts] == total.
    bounds = [i * base + min(i, extra) for i in range(parts + 1)]
    return [range(lo, hi) for lo, hi in zip(bounds, bounds[1:])]


class BiTConfig:
    """
    Configuration class for BiTImageCaptioningPipeline.
    This class provides all the necessary settings for initializing and running the pipeline.

    All values are plain class attributes; the class itself (not an instance)
    is passed around and read by the cells of this notebook.
    """

    # General settings
    checkpoint = "/content/BiT-ImageCaptioning/src/bit_image_captioning/pretrained_model"  # Path to the pretrained model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Device for computation (GPU/CPU)

    # Dataset settings
    dataset_path = "MahmoodAnaam/vqav2-ar-en-validation-2"  # Path or name of the dataset
    language = "ar"  # Language for questions/answers ("ar" for Arabic, "en" for English)
    split = "validation"  # Dataset split to use ("train", "validation", "test")
    save_segment_dir = "/content/drive/MyDrive/VQAv2_Features_Checkpoint"  # Directory to save extracted features

    # Segmentation settings: the dataset is processed one contiguous slice at a time.
    dataset_size = 214354  # Number of rows in the configured split
    num_segments = 10  # Number of slices the split is divided into

    # Row ranges per segment, derived from the two values above instead of being typed by hand:
    # segment 0 is range(0, 21436), ..., segment 9 is range(192919, 214354).
    size_segments = _split_into_segments(dataset_size, num_segments)

    current_index_segment = 0  # Index of the current segment being processed
    username = "MahmoodAnaam"  # Username for Hugging Face

    # Image and object detection settings
    add_od_labels = True  # Whether to add object detection labels to input
    max_img_seq_length = 50  # Maximum sequence length for image features

    # Text input settings
    max_seq_length = 70  # Maximum sequence length for text input
    max_seq_a_length = 40  # Maximum sequence length for primary text (e.g., question)
    is_train = False  # Whether the configuration is for training or inference
    mask_prob = 0.15  # Probability of masking tokens during training
    max_masked_tokens = 3  # Maximum number of tokens to mask in a single sequence

    # DataLoader settings
    batch_size = 2  # Number of samples per batch
    num_workers = 1  # Number of workers for data loading
    shuffle = False  # Whether to shuffle the dataset
    pin_memory = False  # Whether to use pinned memory (for CUDA optimization)
    drop_last = False  # Whether to drop the last incomplete batch
    seed = 42  # Random seed for reproducibility

    # Generation settings
    is_decode = True  # Enable decoding (generation mode)
    do_sample = False  # Whether to use sampling for generation
    bos_token_id = None  # Beginning of sentence token ID (will be set by tokenizer)
    pad_token_id = None  # Padding token ID (will be set by tokenizer)
    eos_token_ids = None  # End of sentence token ID(s) (will be set by tokenizer)
    mask_token_id = None  # Masking token ID (will be set by tokenizer)
    max_gen_length = 50  # Maximum length for generated text
    num_beams = 5  # Number of beams for beam search
    temperature = 1.0  # Temperature for sampling (lower values make output more deterministic)
    top_k = 50  # Top-k sampling (0 disables it)
    top_p = 1.0  # Top-p (nucleus) sampling (0 disables it)
    repetition_penalty = 1.0  # Penalty for repeating words (1.0 disables it)
    length_penalty = 1.0  # Penalty for sequence length (used in beam search)
    num_return_sequences = 1  # Number of sequences to return
    num_keep_best = 3  # Number of best sequences to keep

    # Constrained Beam Search (CBS) settings
    use_cbs = False  # Whether to use constrained beam search
    min_constraints_to_satisfy = 0  # Minimum number of constraints to satisfy (if CBS is enabled)

# ........................................................................................
# Download the image-captioning model: enable Git LFS, then clone the
# jontooy/AraBERT32-Flickr8k repository into the folder given by BiTConfig.checkpoint
# (IPython substitutes $BiTConfig.checkpoint before the shell command runs).
!git lfs install
!git clone https://huggingface.co/jontooy/AraBERT32-Flickr8k $BiTConfig.checkpoint

Git LFS initialized.
Cloning into '/content/BiT-ImageCaptioning/src/bit_image_captioning/pretrained_model'...
remote: Enumerating objects: 19, done.[K
remote: Total 19 (delta 0), reused 0 (delta 0), pack-reused 19 (from 1)[K
Unpacking objects: 100% (19/19), 259.13 KiB | 1.53 MiB/s, done.


## Dataset

In [3]:
# Install the `datasets` package, which provides the load_dataset / load_from_disk calls used below.
!pip install datasets --quiet

In [4]:
from huggingface_hub import login
from google.colab import userdata
from datasets import load_dataset

In [5]:
# Log in to Hugging Face with the token stored under the key 'HF_TOKEN' in Colab's
# userdata, so the token itself never appears in the notebook.
HF_TOKEN = userdata.get('HF_TOKEN')
login(token=HF_TOKEN)

In [6]:
# Load the configured split of the source dataset from the Hub.
# trust_remote_code=True lets the dataset repository's own loading script run locally,
# so keep it only for repositories you trust.
dataset = load_dataset(BiTConfig.dataset_path,split=BiTConfig.split,trust_remote_code=True)
dataset

vqav2-ar-en-validation-2.py:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

(…)nEnded_mscoco_val2014_questions.json.zip:   0%|          | 0.00/6.47M [00:00<?, ?B/s]

v2_mscoco_val2014_annotations.json.zip:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.65G [00:00<?, ?B/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['metadata', 'image', 'question', 'answers', 'multiple_choice_answer'],
    num_rows: 214354
})

In [11]:
# Show one raw example to see the schema before features and captions are added.
print(dataset[0])

{'metadata': {'image_id': 262148, 'question_id': 262148000, 'question_type': 'none of the above', 'answer_type': 'other'}, 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x512 at 0x7BCB685CBDF0>, 'question': {'en': 'Where is he looking?', 'ar': 'أين ينظر؟'}, 'answers': {'en': ['down', 'down', 'at table', 'skateboard', 'down', 'table', 'down', 'down', 'down', 'down'], 'ar': ['تحت', 'تحت', 'على الطاولة', 'لوح التزلج', 'تحت', 'طاولة', 'تحت', 'تحت', 'تحت', 'تحت'], 'confidence': ['yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes'], 'id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, 'multiple_choice_answer': {'en': 'down', 'ar': 'تحت'}}


## Extract Features

In [12]:
from bit_image_captioning.feature_extractors.vinvl import VinVLFeatureExtractor
from bit_image_captioning.pipelines.bert_pipeline import BiTImageCaptioningPipeline

# Stand-alone VinVL feature extractor (used by extract_features) and the captioning
# pipeline built from the shared config class (used by features_captions).
feature_extractor = VinVLFeatureExtractor(add_od_labels=BiTConfig.add_od_labels)
pipeline = BiTImageCaptioningPipeline(BiTConfig)

downloading vinvl_vg_x152c4.pth: 100%|█████████▉| 579565/579565.927734375 [00:17<00:00, 33913.17it/s]
downloading VG-SGG-dicts-vgoi6-clipped.json: 100%|█████████▉| 107/107.1904296875 [00:00<00:00, 5684.35it/s]


In [14]:
def extract_features(examples):
  """
  Add a "features" column to a batch of examples.

  Intended for `Dataset.map(..., batched=True)`: `examples` maps column names
  to lists, and `examples["image"]` is handed to the module-level
  `feature_extractor`.

  If extraction raises, the error is printed and every row of the batch gets
  `None`, so the surrounding map call can keep going.

  Args:
    examples: dict-like batch containing an "image" column.

  Returns:
    The same batch object, with "features" set.

  Raises:
    KeyError: if the batch has no "image" column.
  """
  images = examples["image"]
  try:
    examples["features"] = feature_extractor(images)
  except Exception as e:
    # The fallback needs one entry per row. len(examples) would count the
    # columns of the batch instead, producing a list of the wrong length.
    examples["features"] = [None] * len(images)
    print(f"Error while extracting features: {e}")
  return examples


# ..................................................

def features_captions(examples):
  """
  Add "features" and "captions" columns to a batch of examples.

  Intended for `Dataset.map(..., batched=True)`: `examples` maps column names
  to lists, and `examples["image"]` is handed to the module-level `pipeline`,
  which is expected to return a `(features, captions)` pair.

  If the pipeline raises, the error is printed and both columns are filled
  with `None` for every row of the batch, so the surrounding map call can
  keep going.

  Args:
    examples: dict-like batch containing an "image" column.

  Returns:
    The same batch object, with "features" and "captions" set.

  Raises:
    KeyError: if the batch has no "image" column.
  """
  images = examples["image"]
  try:
    features, captions = pipeline(images)
    examples["features"] = features
    examples["captions"] = captions
  except Exception as e:
    # The fallback needs one entry per row. len(examples) would count the
    # columns of the batch instead, producing lists of the wrong length.
    placeholder_count = len(images)
    examples["captions"] = [None] * placeholder_count
    examples["features"] = [None] * placeholder_count
    print(f"Error while extracting features: {e}")
  return examples

# ..................................................

# Row range assigned to the segment currently being processed.
rng = BiTConfig.size_segments[BiTConfig.current_index_segment]


# Caption and extract features batch by batch. The batch size is read from the config
# rather than repeated here as a literal, so there is a single place to tune it.
ds  = dataset.select(rng).map(features_captions,
                              batched=True,
                              batch_size=BiTConfig.batch_size,
                              )

# Save the processed segment; the folder name carries the first and last row index of the range.
checkpoint_dir = f"{BiTConfig.save_segment_dir}_{min(rng)}_{max(rng)}"
ds.save_to_disk(checkpoint_dir)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]

In [15]:
# Columns of a processed example: the original ones plus 'features' and 'captions'.
ds[0].keys()

dict_keys(['metadata', 'image', 'question', 'answers', 'multiple_choice_answer', 'features', 'captions'])

In [16]:
# Keys stored for each example in the 'features' column.
ds[0]['features'].keys()

dict_keys(['boxes', 'classes', 'img_feats', 'od_labels', 'scores', 'spatial_features'])

In [17]:
# Keys of a single caption entry of the first example.
ds[0]['captions'][0].keys()

dict_keys(['caption', 'confidence'])

In [18]:
# All caption candidates stored for the first example.
ds[0]['captions']

[{'caption': 'رجل يقوم بخدعة على لوح التزلج الخاص به',
  'confidence': 0.5949601531028748},
 {'caption': 'رجل يقوم بخدعة لوح التزلج الخاص به',
  'confidence': 0.5577893257141113},
 {'caption': 'رجل يقوم بخدعة على لوح التزلج الخاص به امام حشد من الناس',
  'confidence': 0.5361749529838562}]



---



## Hugging Face Hub

In [19]:
from datasets import load_from_disk

# Rebuild the checkpoint path of the current segment (same naming as when it was saved)
# and reload the processed segment from disk.
rng = BiTConfig.size_segments[BiTConfig.current_index_segment]

checkpoint_dir = f"{BiTConfig.save_segment_dir}_{min(rng)}_{max(rng)}"
ds = load_from_disk(checkpoint_dir)
ds

Dataset({
    features: ['metadata', 'image', 'question', 'answers', 'multiple_choice_answer', 'features', 'captions'],
    num_rows: 4
})

In [20]:
from huggingface_hub import Repository, create_repo

# Create a private dataset repo on the Hugging Face Hub, named after the checkpoint
# folder; exist_ok=True lets the cell be re-run when the repo is already there.
username = BiTConfig.username
repo_name = checkpoint_dir.rsplit("/", 1)[-1]

repo_id = f"{username}/{repo_name}"
repo_url = create_repo(
    repo_id=repo_id,
    repo_type="dataset",
    private=True,
    exist_ok=True,
)
print(f"Dataset repository URL: {repo_url}")

Dataset repository URL: https://huggingface.co/datasets/MahmoodAnaam/VQAv2_Features_Checkpoint_0_3


In [21]:
# Upload the reloaded segment to the Hub repository identified by repo_id, then report it.
ds.push_to_hub(repo_id=repo_id)
print(f"Dataset pushed to {repo_id}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset pushed to MahmoodAnaam/VQAv2_Features_Checkpoint_0_3


In [22]:
from datasets import load_dataset

# Load the segment back from the Hub to check what was uploaded.
ds = load_dataset(repo_id,split=BiTConfig.split)
ds

README.md:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/4 [00:00<?, ? examples/s]

Dataset({
    features: ['metadata', 'image', 'question', 'answers', 'multiple_choice_answer', 'features', 'captions'],
    num_rows: 4
})

In [23]:
# Display the 'captions' column.
# NOTE(review): this returns the captions of every row; restrict it to a few rows
# when the segment is large, otherwise the output floods the notebook.
ds['captions']

[[{'caption': 'رجل يقوم بخدعة على لوح التزلج الخاص به',
   'confidence': 0.5949601531028748},
  {'caption': 'رجل يقوم بخدعة لوح التزلج الخاص به',
   'confidence': 0.5577893257141113},
  {'caption': 'رجل يقوم بخدعة على لوح التزلج الخاص به امام حشد من الناس',
   'confidence': 0.5361749529838562}],
 [{'caption': 'رجل يقوم بخدعة على لوح التزلج الخاص به',
   'confidence': 0.5949601531028748},
  {'caption': 'رجل يقوم بخدعة لوح التزلج الخاص به',
   'confidence': 0.5577893257141113},
  {'caption': 'رجل يقوم بخدعة على لوح التزلج الخاص به امام حشد من الناس',
   'confidence': 0.5361749529838562}],
 [{'caption': 'رجل يقوم بخدعة على لوح التزلج الخاص به',
   'confidence': 0.5949601531028748},
  {'caption': 'رجل يقوم بخدعة لوح التزلج الخاص به',
   'confidence': 0.5577893257141113},
  {'caption': 'رجل يقوم بخدعة على لوح التزلج الخاص به امام حشد من الناس',
   'confidence': 0.5361749529838562}],
 [{'caption': 'صبي صغير يشرب من نافورة', 'confidence': 0.4507277011871338},
  {'caption': 'طفل صغير يشرب من ك



---



## Merge Dataset Segments

In [24]:
from datasets import load_dataset

username = BiTConfig.username
# Last path component of the save directory; it is the common prefix of all segment repo names.
base_name = BiTConfig.save_segment_dir.split('/')[-1]

# One repo id per configured segment, built as "<username>/<prefix>_<first row>_<last row>".
repos_names = [ f"{username}/{base_name}_{min(rng)}_{max(rng)}" for rng in BiTConfig.size_segments]

# Load the configured split of every segment repo, in segment order.
# NOTE(review): each of these repos must already exist on the Hub for this to succeed.
datasets = [load_dataset(repo_id,split=BiTConfig.split) for repo_id in repos_names]
datasets


[Dataset({
     features: ['metadata', 'image', 'question', 'answers', 'multiple_choice_answer', 'features', 'captions'],
     num_rows: 4
 })]

In [25]:
from datasets import concatenate_datasets

# Concatenate the loaded segments, in list order, into a single dataset.
merged_dataset = concatenate_datasets(datasets,split=BiTConfig.split)
merged_dataset

Dataset({
    features: ['metadata', 'image', 'question', 'answers', 'multiple_choice_answer', 'features', 'captions'],
    num_rows: 4
})

In [26]:
from huggingface_hub import Repository, create_repo

# Create a private dataset repo on the Hugging Face Hub for the merged dataset.
# Its name starts with the part of the save folder's name before the first underscore.
username = BiTConfig.username
segment_folder = BiTConfig.save_segment_dir.split('/')[-1]
dataset_prefix = segment_folder.split('_')[0]
repo_id = f"{username}/{dataset_prefix}-VinVL-BiT-Captions"
repo_url = create_repo(
    repo_id=repo_id,
    repo_type="dataset",
    private=True,
    exist_ok=True,
)
print(f"Dataset repository URL: {repo_url}")

Dataset repository URL: https://huggingface.co/datasets/MahmoodAnaam/VQAv2-VinVL-BiT-Captions


In [27]:
# Upload the merged dataset to the Hub repository identified by repo_id, then report it.
merged_dataset.push_to_hub(repo_id=repo_id)
print(f"Dataset pushed to {repo_id}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset pushed to MahmoodAnaam/VQAv2-VinVL-BiT-Captions


In [28]:
from datasets import load_dataset

# Load the merged repo back from the Hub; without a `split` argument the result is a
# DatasetDict keyed by split name.
ds = load_dataset(repo_id)
ds

README.md:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/4 [00:00<?, ? examples/s]

DatasetDict({
    validation: Dataset({
        features: ['metadata', 'image', 'question', 'answers', 'multiple_choice_answer', 'features', 'captions'],
        num_rows: 4
    })
})