# Embedding vectors

In [10]:
import sys
sys.path.append('./airflow')

import os
from functions.images.detr.util.features import get_detr_model, extract_detr_features
from functions.images.yolo.util.features import extract_yolo_features, get_Net_yolov4
from functions.images.detr.util.transform import reshape

import pymongo
import json
import requests
import numpy as np
from tqdm import tqdm
import base64
import pickle
import time
from elasticsearch import Elasticsearch

In [2]:
# Parse the local JSON environment file; only the parsing needs the open handle.
with open("./airflow/config/env.json", "r") as file:
    config = json.load(file)

# Elasticsearch settings (host and index key) read from the "elastic" section.
hugg_host = config['elastic']['ELASTIC_HOST']
hugg_index_key = config['elastic']['HUGGINGFACE_INDEX_KEY']
# MongoDB connection string read from the "mongodb" section.
mongo_url = config['mongodb']['MONGO_ATLAS_PYTHON_GCP']

In [12]:
# Load the DETR model and the YOLOv4 network.
# NOTE(review): DETR feature extraction is disabled further down, so
# `detr_model` / `postprocessor` are loaded here but not used by this cell.
detr_model, postprocessor = get_detr_model(pretrained=True)
yolo_model = get_Net_yolov4("./airflow/functions/images/yolo/model/yolov4.weights", "./airflow/functions/images/yolo//model/yolov4.cfg")

# In-memory caches mapping image id -> extracted features.
yolo_caches = {}
detr_caches = {}

# Number of processed documents after which the cache is written to disk.
PARTITION_SIZE = 1001
# Per-request timeout (seconds), retries after the first failed attempt,
# and pause (seconds) after each failed retry.
DOWNLOAD_TIMEOUT = 1
MAX_RETRIES = 3
RETRY_DELAY = 2

# `requests` raises its own exception classes, which do NOT inherit from the
# builtin `ConnectionError`; catching only the builtin never triggers a retry.
# The builtin is kept in the tuple so anything caught before is still caught.
DOWNLOAD_ERRORS = (
    requests.exceptions.ConnectionError,
    requests.exceptions.Timeout,
    ConnectionError,
)


def _download_image(url):
    """Download `url` and return the response body as bytes.

    One initial attempt is made, followed by up to MAX_RETRIES retries when a
    connection error or timeout occurs. Returns None when every attempt fails,
    so the caller can skip the document instead of aborting the whole run.

    NOTE(review): the HTTP status code is not checked, so an error page body
    would be handed to the feature extractor as if it were an image.
    """
    try:
        return requests.get(url, timeout=DOWNLOAD_TIMEOUT).content
    except DOWNLOAD_ERRORS:
        print("Lỗi tải dữ liệu...")

    for attempt in range(MAX_RETRIES):
        try:
            return requests.get(url, timeout=DOWNLOAD_TIMEOUT).content
        except DOWNLOAD_ERRORS as e:
            print(f"Tải lại dữ liệu từ {url} (lần {attempt+1}/{MAX_RETRIES}): {e}")
            time.sleep(RETRY_DELAY)  # Wait before the next retry.
    return None


def _save_partition(cache, partition):
    """Pickle `cache` into the numbered YOLO embedding partition file."""
    with open(f"./airflow/data/HuggingFace/yolo_embedding_{partition}.pkl", 'wb') as f:
        pickle.dump(cache, f)


# Iterate over the image documents stored in MongoDB.
with pymongo.MongoClient(mongo_url) as client:
    db = client['imcp']
    pipeline = [{
            '$sort': {'url': 1}
        }, {
            '$project': {'created_time': 0, 'publisher': 0, '_id': 0 }
        }, {
            '$limit': 8000
        }
    ]
    documents = db['refined'].aggregate(pipeline)
    count = 0
    # Index of the next partition file. Naming files after `count` made every
    # flush overwrite the same file, because `count` is always PARTITION_SIZE
    # at flush time.
    partition = 0
    for doc in tqdm(documents):
        image_bytes = _download_image(doc['url'])
        if image_bytes is not None:
            # Extract YOLO features from the raw image bytes.
            yolo_features = extract_yolo_features(yolo_model, image_bytes)

            # DETR feature extraction (currently disabled):
            # image_tensor = reshape(image_bytes)
            # detr_features = extract_detr_features(image_tensor, detr_model)

            # Assumes the URL ends with a 12-character image id followed by a
            # 4-character extension such as ".jpg" — TODO confirm for all URLs.
            image_id = doc['url'][-16:-4]
            yolo_caches[image_id] = yolo_features
            # detr_caches[image_id] = detr_features

        # Failed downloads still count towards the partition size.
        count += 1
        if count == PARTITION_SIZE:
            _save_partition(yolo_caches, partition)
            partition += 1
            count = 0
            yolo_caches = {}
            # detr_caches = {}

# Persist the final, partially filled partition as well. The dictionary is left
# in memory so that it can still be inspected after this cell finishes.
if yolo_caches:
    _save_partition(yolo_caches, partition)

286it [12:43,  2.67s/it]


KeyboardInterrupt: 

In [15]:
# Spot-check: size of the YOLO feature vector cached for one known image id.
print(len(yolo_caches["000000000009"]))
# Other checks that are currently disabled:
# print(len(detr_caches))
# for cache in caches:
#     print(cache['yolov4_encode'])
#     break

904995


# Cloud Storage

In [4]:
from google.cloud.storage import Client, transfer_manager
from typing import List


def set_bucket_public_iam(
    bucket_name:str="embedding-vectors",
    roles:List[dict]=None
):
    """Append IAM bindings to a bucket's policy (by default: public read access).

    Args:
        bucket_name: Name of the Cloud Storage bucket whose policy is updated.
        roles: IAM bindings to add, each a dict with a "role" and a "members"
            entry. "members" may be a single member string or an iterable of
            member strings. When omitted, `roles/storage.objectViewer` is
            granted to `allUsers`.
    """
    if roles is None:
        # Built per call so no mutable default is shared between invocations.
        roles = [{"role": "roles/storage.objectViewer", "members": {"allUsers"}}]

    # A binding's members must be a collection of member strings. A bare string
    # such as "allUsers" would otherwise be treated as an iterable of single
    # characters. Fresh dicts are built so the caller's input is never mutated.
    bindings = []
    for binding in roles:
        members = binding.get("members") or ()
        if isinstance(members, str):
            members = (members,)
        bindings.append({**binding, "members": set(members)})

    storage_client = Client()
    bucket = storage_client.bucket(bucket_name)

    policy = bucket.get_iam_policy(requested_policy_version=3)
    policy.bindings.extend(bindings)

    bucket.set_iam_policy(policy)
    print(f"Bucket {bucket.name} is now publicly readable")


def upload_blob(
    bucket_name, source_file_name, destination_blob_name, content_type
):
    """Upload a local file to a Cloud Storage bucket.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name to create inside the bucket.
        content_type: MIME type stored with the uploaded object.

    Raises:
        Whatever exception the storage client raises is propagated unchanged,
        so callers can see its real type. This includes the precondition
        failure raised when the destination object already exists.
    """
    storage_client = Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    # A generation-match precondition of 0 makes the upload succeed only when
    # the destination object does not exist yet, so nothing is overwritten.
    generation_match_precondition = 0
    blob.upload_from_filename(
        source_file_name,
        content_type=content_type,
        if_generation_match=generation_match_precondition,
    )
    print(f"File {source_file_name} uploaded to {destination_blob_name}.")
    

def upload_many_blobs(
    bucket_name, filenames, source_directory="./data", workers=8, content_type=None
):
    """Upload every file in a list to a bucket concurrently.

    Each blob name is derived from the filename, not including the
    `source_directory` parameter. For complete control of the blob name for each
    file (and other aspects of individual blob metadata), use
    transfer_manager.upload_many() instead.

    Args:
        bucket_name: Name of the destination bucket.
        filenames: File names to upload, relative to `source_directory`.
        source_directory: Local directory that contains the files.
        workers: Maximum number of concurrent upload workers.
        content_type: Optional MIME type applied to every uploaded object.
            When None, no content type is passed to the upload call.
    """

    storage_client = Client()
    bucket = storage_client.bucket(bucket_name)

    # Per-upload options travel through `upload_kwargs`; the transfer manager
    # has no `content_type` parameter of its own.
    upload_kwargs = None
    if content_type is not None:
        upload_kwargs = {"content_type": content_type}

    results = transfer_manager.upload_many_from_filenames(
        bucket,
        filenames,
        source_directory=source_directory,
        max_workers=workers,
        upload_kwargs=upload_kwargs,
    )

    for name, result in zip(filenames, results):
        # The results list is either `None` or an exception for each filename in
        # the input list, in order.

        if isinstance(result, Exception):
            print("Failed to upload {} due to exception: {}".format(name, result))
        else:
            print("Uploaded {} to {}.".format(name, bucket.name))



In [5]:
# Push the pickled DETR embeddings to the "embedding-vectors" bucket.
# Arguments, in order: bucket name, local source file, destination blob name,
# content type.
upload_blob(
    "embedding-vectors",
    "./airflow/data/HuggingFace/lvis_detr_embedding.pkl",
    "images/lvis_detr_embedding.pkl",
    "application/octet-stream",
)

# Disabled: upload of the raw LVIS caption/url parquet file.
# upload_blob(
#     bucket_name="embedding-vectors",
#     source_file_name="./airflow/data/HuggingFace/lvis_caption_url.parquet",
#     destination_blob_name="images/lvis_raw.parquet",
#     content_type="application/octet-stream"
# )

# Disabled: download of the same parquet file.
# NOTE(review): `download_blob` is not defined in the visible cells.
# download_blob(
#     bucket_name="embedding-vectors",
#     source_blob_name="images/lvis_raw.parquet",
#     destination_file_name="./samples/lvis_raw.parquet"
# )

File ./airflow/data/HuggingFace/lvis_detr_embedding.pkl uploaded to images/lvis_detr_embedding.pkl.


In [18]:
import pandas as pd

# Read the raw LVIS parquet file straight from the bucket's HTTPS URL using
# the pyarrow engine, then preview the first five rows.
df = pd.read_parquet(
    path="https://storage.googleapis.com/embedding-vectors/images/lvis_raw.parquet",
    engine="pyarrow",
)
df.head(n=5)

Unnamed: 0,url,caption,short_caption
0,http://images.cocodataset.org/val2017/00000003...,"a kitchen with wooden cabinets on the walls, a...","Well-organized kitchen with wooden cabinets, a..."
1,http://images.cocodataset.org/val2017/00000025...,"a street scene with construction scaffolding, ...","Construction scaffolding, three individuals, s..."
2,http://images.cocodataset.org/val2017/00000008...,"multiple people wearing sweatshirts, a person ...","Multiple people wearing sweatshirts, a person ..."
3,http://images.cocodataset.org/val2017/00000017...,a blue bicycle parked alongside a city street ...,"A blue bicycle in an urban setting, parked alo..."
4,http://images.cocodataset.org/val2017/00000040...,"a bathroom with various objects, including a t...","A worn bathroom contains a toilet, sink, showe..."
