In [1]:
import sys
sys.path.append('./airflow')

import os
from functions.images.detr.util.features import get_detr_model, extract_detr_features
from functions.images.yolo.util.features import extract_yolo_features, get_Net_yolov4
from functions.images.detr.util.transform import reshape

import pymongo
import json
import requests
import numpy as np
from tqdm import tqdm
import base64
from elasticsearch import Elasticsearch

In [2]:
# Parse the environment file, then pull out the connection settings the
# later cells rely on (MongoDB URL, Elasticsearch host and API key).
with open("./airflow/config/env.json", "r") as file:
    config = json.load(file)

mongo_url = config['mongodb']['MONGO_ATLAS_PYTHON_GCP']
elastic_settings = config['elastic']
hugg_index_key = elastic_settings['HUGGINGFACE_INDEX_KEY']
hugg_host = elastic_settings['ELASTIC_HOST']

In [35]:
# Load the DETR model (together with its postprocessor) and the YOLOv4 network.
detr_model, postprocessor = get_detr_model(pretrained=True)
yolo_model = get_Net_yolov4("./airflow/functions/images/yolo/model/yolov4.weights", "./airflow/functions/images/yolo//model/yolov4.cfg")

# List of processed documents, each carrying its extracted image features.
caches = []

# Stop after this many successfully processed documents (small on purpose:
# this cell is a quick sanity check, not a full run).
MAX_PROCESSED = 2
# Seconds to wait for each image download before skipping that document.
DOWNLOAD_TIMEOUT_S = 0.5

# Iterate over the image documents stored in MongoDB.
with pymongo.MongoClient(mongo_url) as client:
    db = client['imcp']
    # Sort by URL for a stable order, drop fields that are not needed here,
    # and cap the number of documents fetched.
    pipeline = [{
            '$sort': {'url': 1}
        }, {
            '$project': {'created_time': 0, 'publisher': 0, '_id': 0 }
        }, {
            '$limit': 8000
        }
    ]
    documents = db['refined'].aggregate(pipeline)
    count = 0
    for doc in tqdm(documents):
        try:
            response = requests.get(doc['url'], timeout=DOWNLOAD_TIMEOUT_S)
            # Do not feed an HTTP error page to the feature extractors.
            response.raise_for_status()

            # Extract features with YOLO.
            yolo_features = extract_yolo_features(yolo_model, response.content)
            doc['yolov4_encode'] = yolo_features #base64.b64encode(yolo_features)

            # Extract features with DETR.
            image_tensor = reshape(response.content)
            detr_features = extract_detr_features(image_tensor, detr_model)
            doc['detr_encode'] = detr_features #base64.b64encode(detr_features)
        except Exception as exc:
            # Best effort: skip this document, but report why instead of
            # swallowing the failure silently.
            tqdm.write(f"Skipping {doc.get('url')!r}: {type(exc).__name__}: {exc}")
            continue

        caches.append(doc)
        print(yolo_features.shape)

        count += 1
        if count >= MAX_PROCESSED:
            break

        # Indexing into Elasticsearch is currently disabled:
        # with Elasticsearch(hosts=hugg_host, api_key=hugg_index_key) as es:
        #     es.update(index="huggingface-index", doc=doc, id=doc['url'][-16:-4], doc_as_upsert=True, upsert=doc)

2it [00:05,  3.00s/it]

(904995,)


2it [00:08,  4.20s/it]

(904995,)





In [34]:
# Peek at the YOLOv4 feature vector of the first cached document, if any.
if caches:
    cache = caches[0]
    print(cache['yolov4_encode'])

[0.00964143 0.01011501 0.02303592 ... 0.         0.         0.        ]


In [37]:
# Read documents back from the index and report the length of each stored
# DETR encoding after base64-decoding it into a float32 array.
# Other field names noted for this index (not requested here):
# "caption", "short_caption", "caption_tokens", "short_caption_tokens"
query = {
    "query": {"match_all": {}},
    "_source": ["url", "yolov4_encode", "detr_encode"],
}

with Elasticsearch(hosts=hugg_host, api_key=hugg_index_key) as es:
    res = es.search(index='huggingface-index', body=query)

for hit in res['hits']['hits']:
    # The value read here is the DETR encoding, so name it accordingly.
    detr_encode_b64 = hit['_source']['detr_encode']
    raw_bytes = base64.b64decode(detr_encode_b64)
    decoded_data = np.frombuffer(raw_bytes, dtype=np.float32)
    print(decoded_data.shape)

(400,)
