In [1]:
from ultralytics import YOLO
yolo_model = YOLO('yolov8x.pt')

import torchreid
reid_model = torchreid.models.build_model(
    name='mudeep',
    num_classes=1000,
    pretrained=True
)

from PIL import Image
import torch
from torchvision import transforms



In [30]:
def detect_objects(image_path, yolo_model):
    # Load ảnh
    img = Image.open(image_path).convert('RGB')

    # Sử dụng YOLO để xác định vùng chứa object
    results = yolo_model(img)

    for r in results:
        im_array = r.plot()  # plot a BGR numpy array of predictions
        im = Image.fromarray(im_array[..., ::-1])  # RGB PIL image
        im.show()  # show image
        # im.save('results.jpg')  # save image

    # Lấy thông tin về object và bounding box
    boxes = results[0].boxes.xywh   # tensor

    # Cắt và lưu các vùng chứa object
    object_images = []
    for box in boxes:
        xmin, ymin, width, height = map(int, box)
        xmax = xmin + width
        ymax = ymin + height
        object_img = img.crop((xmin, ymin, xmax, ymax))
        object_images.append(object_img)

    return object_images

# def extract_features(image, reid_model):
#     """image is <class 'PIL.Image.Image'>"""
#     # Load ảnh và tiền xử lý nó để phù hợp với mô hình
#     img = image.convert('RGB') # make 3 channels
#     preprocess = transforms.Compose([
#         transforms.Resize((256, 128)), # make size (256, 128)
#         transforms.ToTensor(),  # make tensor (3, 256, 128)
#     ])
#     img = preprocess(img).unsqueeze(0)

#     # Đưa ảnh qua mô hình để lấy đặc trưng
#     with torch.no_grad():
#         # features = reid_model.featuremaps(img)
#         features = reid_model.forward(img)

#     return features

def extract_features(image, reid_model):
    """
    Extract features from an image using a reid_model.

    Parameters:
        image (PIL.Image.Image): Input image.
        reid_model: TorchReID model.

    Returns:
        features (torch.Tensor): Extracted features.
    """
    # Preprocess the image
    preprocess = transforms.Compose([
        transforms.Resize((256, 128)),  # Resize to the model's input size
        transforms.ToTensor(),
        # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    img_tensor = preprocess(image.convert('RGB'))
    img_tensor = torch.unsqueeze(img_tensor, 0)  # Add batch dimension

    # Set the model to evaluation mode
    reid_model.eval()

    # Disable gradient computation to speed up the process
    with torch.no_grad():
        # Forward pass to extract features
        features = reid_model.forward(img_tensor)

    return features

In [26]:
# Đường dẫn đến ảnh 1 và ảnh 2
image_path_1 = '0.jpg'
image_path_2 = '1.jpg'
# Dùng YOLO để xác định vùng chứa object trên ảnh 1 và ảnh 2
object_images_1 = detect_objects(image_path_1, yolo_model)
object_images_2 = detect_objects(image_path_2, yolo_model)




0: 320x640 4 persons, 1 car, 2273.5ms
Speed: 4.3ms preprocess, 2273.5ms inference, 2.2ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 4 persons, 2625.8ms
Speed: 4.2ms preprocess, 2625.8ms inference, 2.1ms postprocess per image at shape (1, 3, 320, 640)




(loupe:31216): GLib-GObject-CRITICAL **: 20:46:30.696: g_object_weak_unref: couldn't find weak ref 0x7fa98ad16970((nil))

(loupe:31216): GLib-GObject-CRITICAL **: 20:47:14.331: g_object_weak_unref: couldn't find weak ref 0x7fa98ad16970((nil))


In [15]:
print(type(object_images_1[0]))
print(object_images_1[0].size)
print(object_images_1[0])

<class 'PIL.Image.Image'>
(845, 594)
<PIL.Image.Image image mode=RGB size=845x594 at 0x7F8CF0CA6C50>


In [16]:
# Trích xuất vector đặc trưng từ reid_model cho từng vùng chứa object
features_1 = [extract_features(img, reid_model) for img in object_images_1]
features_2 = [extract_features(img, reid_model) for img in object_images_2]

In [17]:
# Tìm hiểu về cấu trúc dữ liệu
print("Số các features 1 và số các features 2:", len(features_1), len(features_2))
print("Kích thước của 1 feature của 1 vùng ảnh - 1 object:", features_1[0].shape)
# print("Kích thước của 1 feature của 1 vùng ảnh - 1 object:", features_2[0].shape)
# for feature in features_1:
#     print("Thông tin 1 feature:", feature)
#     print("Kích thước 1 feature:", len(feature))
#     print("Kiểu dữ liệu của feature:", type(feature))

print("Thông tin 1 feature:", features_1[0])
print("Kích thước 1 feature - tensor có 1 ảnh xử lý khi gọi mô hình:", len(features_1[0]))
print("Kiểu dữ liệu của feature:", type(features_1[0]))


Số các features 1 và số các features 2: 5 4
Kích thước của 1 feature của 1 vùng ảnh - 1 object: torch.Size([1, 4096])
Thông tin 1 feature: tensor([[0.0191, 0.0000, 0.0000,  ..., 0.0251, 0.0059, 0.0000]])
Kích thước 1 feature - tensor có 1 ảnh xử lý khi gọi mô hình: 1
Kiểu dữ liệu của feature: <class 'torch.Tensor'>


In [18]:
print(type(features_1))
print(features_1[0].shape)
print(features_1[0])
# print(features_1[0][0]) # có 256 phần tử như này
# print(features_1[0][0][0])  # có 16 phần tử như này
# print(features_1[0][0][1])  # có 16 phần tử như này
# print(features_1[0][0][0][0])  # có 8 phần tử như này

<class 'list'>
torch.Size([1, 4096])
tensor([[0.0191, 0.0000, 0.0000,  ..., 0.0251, 0.0059, 0.0000]])


In [32]:
# Sử dụng mô hình đã có hoặc đào tạo mô hình phù hợp với đối tượng của bạn
# Ví dụ sử dụng torchvision.models
import torch
from torchvision import models, transforms

def extract_features(image, model):
    preprocess = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # image = Image.open(image_path).convert('RGB')
    input_tensor = preprocess(image)
    input_batch = torch.unsqueeze(input_tensor, 0)

    with torch.no_grad():
        output = model(input_batch)

    return output

# Sử dụng một pre-trained model như ResNet
resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()

img1 = Image.open("calcu_1.jpg").convert('RGB')
img2 = Image.open("calcu_2.jpg").convert('RGB')
img3 = Image.open("box.jpg").convert('RGB')
# Dùng YOLO để xác định vùng chứa object trên ảnh 1 và ảnh 2
object_images_1 = detect_objects(img1, yolo_model)
object_images_2 = detect_objects(img2, yolo_model)
object_images_3 = detect_objects(img3, yolo_model)
features1 = extract_features(img1, resnet_model)
features2 = extract_features(img2, resnet_model)
features3 = extract_features(img3, resnet_model)

# So sánh biểu diễn feature
# cosine_similarity = torch.nn.functional.cosine_similarity(features1, features2)

threshold = 0.8
similarity_score = cosine_similarity(features1, features2)
# In kết quả
print(f'features1 and features2: {similarity_score[0, 0]}')
if similarity_score >= threshold:
    print("features1 is similar to features2.")
similarity_score = cosine_similarity(features1, features3)
# In kết quả
print(f'features1 and features3: {similarity_score[0, 0]}')
if similarity_score >= threshold:
    print("features1 is similar to features3.")
similarity_score = cosine_similarity(features3, features2)
# In kết quả
print(f'features3 and features2: {similarity_score[0, 0]}')
if similarity_score >= threshold:
    print("features3 is similar to features2.")






0: 480x640 1 remote, 2604.5ms
Speed: 10.0ms preprocess, 2604.5ms inference, 1.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 remote, 3737.5ms
Speed: 5.5ms preprocess, 3737.5ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 3583.1ms
Speed: 6.4ms preprocess, 3583.1ms inference, 1.5ms postprocess per image at shape (1, 3, 480, 640)
features1 and features2: 0.8383724093437195
features1 is similar to features2.
features1 and features3: 0.6463674306869507
features3 and features2: 0.5626075267791748




















(loupe:34679): GLib-GObject-CRITICAL **: 21:27:35.649: g_object_weak_unref: couldn't find weak ref 0x7f8872f16970((nil))

(loupe:34679): GLib-GObject-CRITICAL **: 21:27:36.524: g_object_weak_unref: couldn't find weak ref 0x7f8872f16970((nil))

(loupe:34679): GLib-GObject-CRITICAL **: 21:27:37.682: g_object_weak_unref: couldn't find weak ref 0x7f8872f16970((nil))


In [36]:
import torch
from torchvision import models, transforms

# Sử dụng một pre-trained model như ResNet
resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()

def detect_objects(image, yolo_model):

    # Sử dụng YOLO để xác định vùng chứa object
    results = yolo_model(image)

    for r in results:
        im_array = r.plot()  # plot a BGR numpy array of predictions
        im = Image.fromarray(im_array[..., ::-1])  # RGB PIL image
        im.show()  # show image
        # im.save('results.jpg')  # save image

    # Lấy thông tin về object và bounding box
    boxes = results[0].boxes.xywh   # tensor

    # Cắt và lưu các vùng chứa object
    object_images = []
    for box in boxes:
        xmin, ymin, width, height = map(int, box)
        xmax = xmin + width
        ymax = ymin + height
        object_img = image.crop((xmin, ymin, xmax, ymax))
        object_images.append(object_img)

    return object_images

def extract_features(image, model):
    preprocess = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # image = Image.open(image_path).convert('RGB')
    input_tensor = preprocess(image)
    input_batch = torch.unsqueeze(input_tensor, 0)

    with torch.no_grad():
        output = model(input_batch)

    return output

# Load ảnh
img1 = Image.open('0.jpg').convert('RGB')
img2 = Image.open('1.jpg').convert('RGB')
# Dùng YOLO để xác định vùng chứa object trên ảnh 1 và ảnh 2
object_images_1 = detect_objects(img1, yolo_model)
object_images_2 = detect_objects(img2, yolo_model)

features_1 = [extract_features(img, resnet_model) for img in object_images_1]
features_2 = [extract_features(img, resnet_model) for img in object_images_2]




0: 320x640 4 persons, 1 car, 2169.3ms
Speed: 6.8ms preprocess, 2169.3ms inference, 1.2ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 4 persons, 2676.6ms
Speed: 12.8ms preprocess, 2676.6ms inference, 1.1ms postprocess per image at shape (1, 3, 320, 640)



(loupe:37928): GLib-GObject-CRITICAL **: 21:46:16.133: g_object_weak_unref: couldn't find weak ref 0x7f6602d16970((nil))

(loupe:38075): GLib-GObject-CRITICAL **: 21:46:19.958: g_object_weak_unref: couldn't find weak ref 0x7f5d0c916970((nil))


In [45]:
from math import sqrt
# threshold = 1 / sqrt(2)
threshold = 0.75
from sklearn.metrics.pairwise import cosine_similarity
for i in range(len(features_1)):
    for j in range(len(features_2)):
        # features_1 và features_2 là các đặc trưng được trích xuất từ hai ảnh
        similarity_score = cosine_similarity(features_1[i], features_2[j])

        # In kết quả
        print(f'features_1[{i}] and features_2[{j}]: {similarity_score[0, 0]}')

        if similarity_score >= threshold:
            print(f"features_1[{i}] is similar to features_2[{j}].")

features_1[0] and features_2[0]: 0.6221237182617188
features_1[0] and features_2[1]: 0.49044182896614075
features_1[0] and features_2[2]: 0.4481508731842041
features_1[0] and features_2[3]: 0.5429463386535645
features_1[1] and features_2[0]: 0.7480502724647522
features_1[1] and features_2[1]: 0.7209721207618713
features_1[1] and features_2[2]: 0.5981481075286865
features_1[1] and features_2[3]: 0.7117047905921936
features_1[2] and features_2[0]: 0.6698352098464966
features_1[2] and features_2[1]: 0.6257959008216858
features_1[2] and features_2[2]: 0.560143768787384
features_1[2] and features_2[3]: 0.6474456191062927
features_1[3] and features_2[0]: 0.6051077842712402
features_1[3] and features_2[1]: 0.5055397152900696
features_1[3] and features_2[2]: 0.538964033126831
features_1[3] and features_2[3]: 0.6177788972854614
features_1[4] and features_2[0]: 0.8012514710426331
features_1[4] is similar to features_2[0].
features_1[4] and features_2[1]: 0.732859194278717
features_1[4] and featu

In [42]:
threshold = 0.73

max_similarity_pairs = []

for i in range(len(features_1)):
    max_similarity = -1  # Điểm tương đồng lớn nhất
    max_index = -1  # Chỉ số của features_2 có điểm tương đồng lớn nhất

    for j in range(len(features_2)):
        similarity = cosine_similarity(features_1[i], features_2[j])

        if similarity > max_similarity:
            max_similarity = similarity
            max_index = j

    if max_similarity > threshold:
        max_similarity_pairs.append((i, max_index, max_similarity))

# In kết quả
for pair in max_similarity_pairs:
    print(f"features_1[{pair[0]}] is similar to features_2[{pair[1]}] with similarity: {pair[2]}")

features_1[1] is similar to features_2[0] with similarity: [[    0.74805]]
features_1[4] is similar to features_2[0] with similarity: [[    0.80125]]
