<a href="https://colab.research.google.com/github/Mahmood-Anaam/BiT-ImageCaptioning/blob/last/notebooks/features_extraction_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Extraction Demo



---



In [None]:
# Mount Google Drive at /content/drive; the feature checkpoints are later saved under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## BiT Install

In [None]:
%cd /content
!git clone https://github.com/Mahmood-Anaam/BiT-ImageCaptioning.git
%cd /content/BiT-ImageCaptioning
!pip install -e . --quiet
import IPython
app = IPython.Application.instance()
_=app.kernel.do_shutdown(True)

/content
Cloning into 'BiT-ImageCaptioning'...
remote: Enumerating objects: 744, done.[K
remote: Counting objects: 100% (118/118), done.[K
remote: Compressing objects: 100% (94/94), done.[K
remote: Total 744 (delta 63), reused 61 (delta 24), pack-reused 626 (from 1)[K
Receiving objects: 100% (744/744), 446.81 KiB | 13.54 MiB/s, done.
Resolving deltas: 100% (309/309), done.
/content/BiT-ImageCaptioning
  Preparing metadata (setup.py) ... [?25l[?25hdone


## Import all libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import torch
from PIL import Image
from bit_image_captioning.feature_extractors.vinvl import VinVLFeatureExtractor
from bit_image_captioning.pipelines.bert_pipeline import BiTImageCaptioningPipeline
from bit_image_captioning.datasets.ok_vqa_dataset import OKVQADataset
from bit_image_captioning.datasets.ok_vqa_dataloader import OKVQADataLoader

## Configuration

In [None]:
class BiTConfig:
    """
    Settings container handed to BiTImageCaptioningPipeline.

    All options are plain class attributes, so the class itself (not an
    instance) is passed around and read via ``BiTConfig.<name>``.
    """

    # ------------------------------------------------------------------
    # General
    # ------------------------------------------------------------------
    # Directory holding the pretrained captioning model.
    checkpoint = "/content/BiT-ImageCaptioning/src/bit_image_captioning/pretrained_model"
    # Use the GPU when one is visible to torch, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # ------------------------------------------------------------------
    # Dataset
    # ------------------------------------------------------------------
    # Dataset name (or path) to load.
    dataset_path = "MahmoodAnaam/ok-vqa-ar-en-2"
    # Question/answer language: "ar" (Arabic) or "en" (English).
    language = "ar"
    # Which split to load: "train", "validation" or "test".
    split = "validation"

    # ------------------------------------------------------------------
    # Image / object detection
    # ------------------------------------------------------------------
    # Feed object-detection labels to the model alongside the image features.
    add_od_labels = True
    # Upper bound on the image-feature sequence length.
    max_img_seq_length = 50

    # ------------------------------------------------------------------
    # Text input
    # ------------------------------------------------------------------
    # Upper bound on the text input sequence length.
    max_seq_length = 70
    # Upper bound on the primary text segment (e.g. the question).
    max_seq_a_length = 40
    # False selects inference rather than training.
    is_train = False
    # Token masking probability used during training.
    mask_prob = 0.15
    # Cap on how many tokens may be masked in one sequence.
    max_masked_tokens = 3

    # ------------------------------------------------------------------
    # DataLoader
    # ------------------------------------------------------------------
    # Samples per batch.
    batch_size = 2
    # Worker count for data loading.
    num_workers = 1
    # Shuffle the dataset or keep its order.
    shuffle = False
    # Pinned memory toggle (a CUDA optimisation).
    pin_memory = False
    # Drop the final batch when it is incomplete.
    drop_last = False
    # Random seed for reproducibility.
    seed = 42

    # ------------------------------------------------------------------
    # Generation
    # ------------------------------------------------------------------
    # Turn on decoding (generation mode).
    is_decode = True
    # Sample during generation instead of searching deterministically.
    do_sample = False
    # Special-token ids are left as None here; per the original notes the
    # tokenizer is expected to fill them in.
    bos_token_id = None  # beginning of sentence
    pad_token_id = None  # padding
    eos_token_ids = None  # end of sentence id(s)
    mask_token_id = None  # masking
    # Longest text that may be generated.
    max_gen_length = 50
    # Beam count for beam search.
    num_beams = 5
    # Sampling temperature; lower values make the output more deterministic.
    temperature = 1.0
    # Top-k sampling cutoff (0 disables it).
    top_k = 50
    # Top-p (nucleus) sampling threshold.
    top_p = 1.0
    # Penalty applied to repeated words (1.0 disables it).
    repetition_penalty = 1.0
    # Sequence-length penalty used by beam search.
    length_penalty = 1.0
    # How many sequences to return.
    num_return_sequences = 1
    # How many of the best sequences to keep.
    num_keep_best = 3

    # ------------------------------------------------------------------
    # Constrained Beam Search (CBS)
    # ------------------------------------------------------------------
    # Toggle constrained beam search.
    use_cbs = False
    # Minimum number of constraints to satisfy when CBS is enabled.
    min_constraints_to_satisfy = 0

# ........................................................................................
# download Image captioning model
!git lfs install
!git clone https://huggingface.co/jontooy/AraBERT32-Flickr8k $BiTConfig.checkpoint

Git LFS initialized.
Cloning into '/content/BiT-ImageCaptioning/src/bit_image_captioning/pretrained_model'...
remote: Enumerating objects: 19, done.[K
remote: Total 19 (delta 0), reused 0 (delta 0), pack-reused 19 (from 1)[K
Unpacking objects: 100% (19/19), 259.13 KiB | 6.03 MiB/s, done.


## OKVQA Dataset

In [None]:
!pip install datasets --quiet

In [None]:
from huggingface_hub import login
from google.colab import userdata
from datasets import load_dataset

In [None]:
# Login to Hugging Face
HF_TOKEN = userdata.get('HF_TOKEN')
login(token=HF_TOKEN)

In [None]:
# Load only the configured split of the configured dataset, then display its summary.
dataset = load_dataset(
    path=BiTConfig.dataset_path,
    split=BiTConfig.split,
)
dataset

README.md:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

train-00000-of-00004.parquet:   0%|          | 0.00/373M [00:00<?, ?B/s]

train-00001-of-00004.parquet:   0%|          | 0.00/367M [00:00<?, ?B/s]

train-00002-of-00004.parquet:   0%|          | 0.00/368M [00:00<?, ?B/s]

train-00003-of-00004.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

validation-00000-of-00002.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

validation-00001-of-00002.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9009 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5046 [00:00<?, ? examples/s]

Dataset({
    features: ['metadata', 'image', 'question', 'answers'],
    num_rows: 5046
})

In [None]:
# Inspect the first example: its metadata, image, question and answers.
print(dataset[0])

{'metadata': {'image_id': 297147, 'question_id': 2971475, 'question_type': 'one', 'answer_type': 'other', 'confidence': 3}, 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480 at 0x7EBAF03CDCF0>, 'question': {'en': 'What sport can you use this for?', 'ar': 'في أي رياضة يمكنك استخدام هذا؟'}, 'answers': {'en': ['race', 'race', 'race', 'race', 'race', 'race', 'motocross', 'motocross', 'ride', 'ride'], 'ar': ['سباق', 'سباق', 'سباق', 'سباق', 'سباق', 'سباق', 'موتوكروس', 'موتوكروس', 'يركب', 'يركب'], 'raw_en': ['racing', 'racing', 'racing', 'racing', 'racing', 'racing', 'motocross', 'motocross', 'riding', 'riding'], 'raw_ar': ['سباق', 'سباق', 'سباق', 'سباق', 'سباق', 'سباق', 'موتوكروس', 'موتوكروس', 'يركب', 'يركب'], 'confidence': ['yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes'], 'id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}}


## Extract Features

In [None]:
# These two imports repeat the ones in the "Import all libraries" cell.
from bit_image_captioning.feature_extractors.vinvl import VinVLFeatureExtractor
from bit_image_captioning.pipelines.bert_pipeline import BiTImageCaptioningPipeline

# Build the VinVL feature extractor and the captioning pipeline from the BiTConfig settings.
feature_extractor = VinVLFeatureExtractor(add_od_labels=BiTConfig.add_od_labels)
pipeline = BiTImageCaptioningPipeline(BiTConfig)

downloading vinvl_vg_x152c4.pth: 100%|█████████▉| 579565/579565.927734375 [00:12<00:00, 45546.11it/s]
downloading VG-SGG-dicts-vgoi6-clipped.json: 100%|█████████▉| 107/107.1904296875 [00:00<00:00, 3631.23it/s]


In [None]:
def extract_features(examples):
  """
  Batched ``map`` callback that adds a "features" column to a batch.

  Args:
    examples: Mapping of column name -> list of values for one batch.
      The "image" column is passed to the global ``feature_extractor``.

  Returns:
    The same mapping, with "features" set to the extractor output, or to
    one ``None`` per image when extraction raises.
  """
  try:
    examples["features"] = feature_extractor(examples["image"])
  except Exception as e:
    # len(examples) counts columns, not rows; size the placeholder by the
    # number of images so the new column matches the batch length.
    examples["features"] = [None] * len(examples["image"])
    print(f"Error while extracting features: {e}")
  return examples


# ..................................................

def features_captions(examples):
  """
  Batched ``map`` callback that adds "features" and "captions" columns.

  Args:
    examples: Mapping of column name -> list of values for one batch.
      The "image" column is passed to the global ``pipeline``, which is
      expected to return a ``(features, captions)`` pair.

  Returns:
    The same mapping, with both columns set to the pipeline output, or to
    one ``None`` per image when the pipeline raises.
  """
  try:
    features, captions = pipeline(examples["image"])
    examples["features"] = features
    examples["captions"] = captions
  except Exception as e:
    # len(examples) counts columns (and grows as keys are added), so take
    # the row count from the image column once and reuse it for both lists.
    n_rows = len(examples["image"])
    examples["captions"] = [None] * n_rows
    examples["features"] = [None] * n_rows
    print(f"Error while extracting features: {e}")
  return examples

# ..................................................

# Row indices to process in this run; the commented-out range covers the remaining rows
# of the 5046-row split.
rng = range(0, 2523)
# rng = range(2523, 5046)

# Run the captioning pipeline over the selected rows, 20 examples per call.
subset = dataset.select(rng)
ds = subset.map(features_captions, batched=True, batch_size=20)

# Name the checkpoint folder after the first and last processed row index.
first_idx, last_idx = rng[0], rng[-1]
checkpoint_dir = f"/content/drive/MyDrive/OKVQA_Features_Checkpoint_{first_idx}_{last_idx}"
ds.save_to_disk(checkpoint_dir)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]

In [None]:
# Columns available on the first processed example.
ds[0].keys()

dict_keys(['metadata', 'image', 'question', 'answers', 'features', 'captions'])

In [None]:
# Keys of the feature dictionary stored for the first example.
ds[0]['features'].keys()

dict_keys(['boxes', 'classes', 'img_feats', 'od_labels', 'scores', 'spatial_features'])

In [None]:
# Fields stored for a single generated caption.
ds[0]['captions'][0].keys()

dict_keys(['caption', 'confidence'])

In [None]:
# All captions kept for the first example, each with its confidence.
ds[0]['captions']

[{'caption': 'امراة ترتدي بدلة سوداء وبيضاء تركب دراجتها في الشارع',
  'confidence': 0.33103856444358826},
 {'caption': 'امراة ترتدي بدلة سوداء وبيضاء تركب دراجتها في موقف للسيارات',
  'confidence': 0.3305349051952362},
 {'caption': 'امراة ترتدي بدلة سوداء وبيضاء تركب دراجة ثلاثية العجلات',
  'confidence': 0.32602453231811523}]



---



## Hugging Face Hub

In [None]:
from datasets import load_from_disk

# Select which saved row range to reload; it must match the range used when the checkpoint was written.
# rng = range(0, 2523)
rng = range(2523, 5046)

# NOTE(review): the extraction cell earlier in this notebook has range(0, 2523) active, so the
# checkpoint loaded here must already exist on Drive from a separate run — confirm before "Run All".
first_idx, last_idx = rng[0], rng[-1]
checkpoint_dir = f"/content/drive/MyDrive/OKVQA_Features_Checkpoint_{first_idx}_{last_idx}"
ds = load_from_disk(checkpoint_dir)
ds

Dataset({
    features: ['metadata', 'image', 'question', 'answers', 'features', 'captions'],
    num_rows: 2523
})

In [None]:
from huggingface_hub import Repository, create_repo


# Create (or reuse, via exist_ok) a private dataset repo on the Hugging Face Hub,
# named after the checkpoint folder.
username = "MahmoodAnaam" # change username
repo_name = checkpoint_dir.rsplit("/", 1)[-1]
repo_id = "/".join((username, repo_name))

repo_url = create_repo(
    repo_id=repo_id,
    repo_type="dataset",
    private=True,
    exist_ok=True,
)
print(f"Dataset repository URL: {repo_url}")

Dataset repository URL: https://huggingface.co/datasets/MahmoodAnaam/OKVQA_Features_Checkpoint_2523_5045


In [None]:
# Push the dataset to the repo
ds.push_to_hub(repo_id=repo_id)
print(f"Dataset pushed to {repo_id}")

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/841 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Map:   0%|          | 0/841 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Map:   0%|          | 0/841 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Dataset pushed to MahmoodAnaam/OKVQA_Features_Checkpoint_2523_5045


In [None]:
from datasets import load_dataset

# Load the 'validation' split back from the Hub repo (presumably a round-trip check of the upload)
# and display its summary.
ds = load_dataset(repo_id,split='validation')
ds

Dataset({
    features: ['metadata', 'image', 'question', 'answers', 'features', 'captions'],
    num_rows: 2523
})

In [None]:
# Preview the captions of the first few rows instead of dumping the whole column into the output.
ds[:3]['captions']

[[{'caption': 'طفل صغير يجلس في صف من البقالة',
   'confidence': 0.40754425525665283},
  {'caption': 'طفل صغير يجلس على طاولة في مطعم',
   'confidence': 0.3528929650783539},
  {'caption': 'طفل صغير يجلس في صف من الكبايق',
   'confidence': 0.3246246874332428}],
 [{'caption': 'ثلاثة اشخاص يرتدون بدلات رعاة البقر يجلسون في سيارة لعبة',
   'confidence': 0.35519036650657654},
  {'caption': 'رجل وامراة يجلسان في سيارة لعبة',
   'confidence': 0.3486502170562744},
  {'caption': 'ثلاثة اشخاص يرتدون بدلات رعاة البقر يجلسون في صف من عربات الكرنفال',
   'confidence': 0.3453269898891449}],
 [{'caption': 'شخص ياكل رقائق البطاطس المقلية',
   'confidence': 0.47354820370674133},
  {'caption': 'شخص يرتدي سماعات الراس يجلس على طاولة',
   'confidence': 0.4224885106086731},
  {'caption': 'شخص ياكل رقائق البطاطس المقلية في مطعم',
   'confidence': 0.36861342191696167}],
 [{'caption': 'رجل في قميص ازرق يجلس في صف من البقالة',
   'confidence': 0.47869253158569336},
  {'caption': 'رجل في قميص ازرق يجلس في صف من