In [4]:
import cv2
import os
import numpy as np
from glob import glob
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import timm  
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)


Device: cpu


In [7]:
def extract_frames_time_based(
    video_path,
    output_dir,
    target_fps=5
):
    os.makedirs(output_dir, exist_ok=True)

    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        raise ValueError("‚ùå Could not open video file")

    frame_id = 0
    saved_count = 0
    last_saved_time = -1

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        timestamp_sec = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000.0
        if last_saved_time < 0 or (timestamp_sec - last_saved_time) >= (1 / target_fps):
            frame_name = f"frame_{saved_count:05d}.jpg"
            frame_path = os.path.join(output_dir, frame_name)

            cv2.imwrite(frame_path, frame)
            saved_count += 1
            last_saved_time = timestamp_sec

        frame_id += 1

    cap.release()
    print(f"‚úÖ Total frames saved: {saved_count}")


In [13]:
video_path = "data/video/input_video.mp4"
output_dir = "data/frames"

extract_frames_time_based(
    video_path=video_path,
    output_dir=output_dir,
    target_fps=5
)


‚úÖ Total frames saved: 29


In [14]:
import os

video_path = "data/video/input_video.mp4"
print("File exists:", os.path.exists(video_path))
print("Absolute path:", os.path.abspath(video_path))

File exists: True
Absolute path: C:\Users\LENOVO\OneDrive\Desktop\New folder (2)\notebooks\data\video\input_video.mp4


In [15]:
os.listdir("data/video")

['input_video.mp4']

In [16]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.4.7-py3-none-any.whl.metadata (38 kB)
Collecting polars>=0.20.0 (from ultralytics)
  Downloading polars-1.37.1-py3-none-any.whl.metadata (10 kB)
Collecting ultralytics-thop>=2.0.18 (from ultralytics)
  Downloading ultralytics_thop-2.0.18-py3-none-any.whl.metadata (14 kB)
Collecting polars-runtime-32==1.37.1 (from polars>=0.20.0->ultralytics)
  Downloading polars_runtime_32-1.37.1-cp310-abi3-win_amd64.whl.metadata (1.5 kB)
Downloading ultralytics-8.4.7-py3-none-any.whl (1.2 MB)
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   ----------------- ---------------------- 0.5/1.2 MB 4.4 MB/s eta 0:00:01
   ---------------------------------------- 1.2/1.2 MB 4.8 MB/s eta 0:00:00
Downloading polars-1.37.1-py3-none-any.whl (805 kB)
   ---------------------------------------- 0.0/805.7 kB ? eta -:--:--
   ---------------------------------------- 805.7/805.7 kB 8.7 MB/s eta 0:00:00
Downloading polars_runtime_32-1.37.1-cp310-

In [17]:
from ultralytics import YOLO
yolo_model = YOLO("yolov8x.pt")
print("YOLOv8 model loaded successfully")

Creating new Ultralytics Settings v0.0.6 file  
View Ultralytics Settings with 'yolo settings' or at 'C:\Users\LENOVO\AppData\Roaming\Ultralytics\settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
[KDownloading https://github.com/ultralytics/assets/releases/download/v8.4.0/yolov8x.pt to 'yolov8x.pt': 100% ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 130.5MB 8.4MB/s 15.6s15.5s<0.6sss
YOLOv8 model loaded successfully


In [23]:
def detect_and_crop_persons_smart(
    frames_dir,
    output_dir,
    conf_threshold=0.6,
    min_width=80,
    min_height=160,
    blur_threshold=40
):
    os.makedirs(output_dir, exist_ok=True)
    frame_paths = sorted(glob(os.path.join(frames_dir, "*.jpg")))
    total_crops = 0
    rejected = 0

    for frame_path in frame_paths:
        frame = cv2.imread(frame_path)
        if frame is None:
            continue

        results = yolo_model.predict(
            source=frame,
            conf=conf_threshold,
            classes=[0],
            verbose=False
        )

        for i, box in enumerate(results[0].boxes.xyxy):
            x1, y1, x2, y2 = map(int, box)
            w, h = x2 - x1, y2 - y1

            if w < min_width or h < min_height:
                rejected += 1
                continue

            crop = frame[y1:y2, x1:x2]
            if crop.size == 0:
                rejected += 1
                continue

            upper_crop = crop[: int(0.6 * h), :]
            gray_upper = cv2.cvtColor(upper_crop, cv2.COLOR_BGR2GRAY)

            blur_score = cv2.Laplacian(gray_upper, cv2.CV_64F).var()
            if blur_score < blur_threshold:
                rejected += 1
                continue

            crop_name = f"{os.path.splitext(os.path.basename(frame_path))[0]}_p{i}.jpg"
            crop_path = os.path.join(output_dir, crop_name)
            cv2.imwrite(crop_path, crop_path if False else crop)  # safe write
            total_crops += 1

    print(f"‚úÖ Clean identity-relevant crops saved: {total_crops}")
    print(f"üóëÔ∏è Rejected partial/low-quality crops: {rejected}")


In [24]:
frames_dir = "data/frames"
output_dir = "data/person_crops_identity"

detect_and_crop_persons_smart(
    frames_dir=frames_dir,
    output_dir=output_dir
)


‚úÖ Clean identity-relevant crops saved: 30
üóëÔ∏è Rejected partial/low-quality crops: 1


In [25]:
vit_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

In [26]:
vit_model = timm.create_model(
    "vit_base_patch16_224",
    pretrained=True,
    num_classes=0  
)

vit_model = vit_model.to(device)
vit_model.eval()

print("‚úÖ ViT model loaded (ImageNet-pretrained)")


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

‚úÖ ViT model loaded (ImageNet-pretrained)


In [27]:
class PersonCropDataset(Dataset):
    def __init__(self, image_dir, transform=None):
        self.image_paths = sorted(glob(os.path.join(image_dir, "*.jpg")))
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        if self.transform:
            img = self.transform(img)

        return img, img_path


In [30]:
dataloader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=False,
    num_workers=0,     # üî• IMPORTANT FIX
    pin_memory=False  # safer on Windows
)

print(f"Total identity crops: {len(dataset)}")


Total identity crops: 30


In [31]:
all_embeddings = []
all_image_paths = []

with torch.no_grad():
    for images, paths in dataloader:
        images = images.to(device)
        embeddings = vit_model(images)
        embeddings = embeddings.cpu().numpy()

        all_embeddings.append(embeddings)
        all_image_paths.extend(paths)

all_embeddings = np.vstack(all_embeddings)

print("‚úÖ Embeddings extracted")
print("Embedding shape:", all_embeddings.shape)


‚úÖ Embeddings extracted
Embedding shape: (30, 768)


In [32]:
sim_matrix = cosine_similarity(all_embeddings)
print(sim_matrix[:5, :5])


[[          1     0.89246     0.85188     0.78518     0.88903]
 [    0.89246           1     0.81047     0.80435     0.85839]
 [    0.85188     0.81047           1     0.91373     0.92568]
 [    0.78518     0.80435     0.91373           1     0.84927]
 [    0.88903     0.85839     0.92568     0.84927           1]]


In [33]:
ref_path = "data/reference/reference.jpg"

ref_img = cv2.imread(ref_path)
ref_img = cv2.cvtColor(ref_img, cv2.COLOR_BGR2RGB)
ref_img = vit_transforms(ref_img)
ref_img = ref_img.unsqueeze(0).to(device)

with torch.no_grad():
    ref_embedding = vit_model(ref_img)
    ref_embedding = ref_embedding.cpu().numpy()

print("‚úÖ Reference embedding extracted")
print("Reference shape:", ref_embedding.shape)


‚úÖ Reference embedding extracted
Reference shape: (1, 768)


In [34]:
similarities = cosine_similarity(ref_embedding, all_embeddings)[0]

for i, sim in enumerate(similarities[:10]):
    print(f"Frame {i}: similarity = {sim:.3f}")


Frame 0: similarity = 0.760
Frame 1: similarity = 0.755
Frame 2: similarity = 0.875
Frame 3: similarity = 0.923
Frame 4: similarity = 0.823
Frame 5: similarity = 0.710
Frame 6: similarity = 0.712
Frame 7: similarity = 0.722
Frame 8: similarity = 0.737
Frame 9: similarity = 0.747


In [36]:
MATCH_THRESHOLD = 0.80
print("Threshold set to:", MATCH_THRESHOLD)


Threshold set to: 0.8


In [37]:
matched_indices = np.where(similarities >= MATCH_THRESHOLD)[0]

print(f"‚úÖ Matches found: {len(matched_indices)}")
print("Matched image paths:")
for idx in matched_indices:
    print(all_image_paths[idx])


‚úÖ Matches found: 14
Matched image paths:
data/person_crops_identity\frame_00002_p0.jpg
data/person_crops_identity\frame_00003_p0.jpg
data/person_crops_identity\frame_00004_p0.jpg
data/person_crops_identity\frame_00016_p0.jpg
data/person_crops_identity\frame_00017_p0.jpg
data/person_crops_identity\frame_00020_p0.jpg
data/person_crops_identity\frame_00021_p0.jpg
data/person_crops_identity\frame_00022_p0.jpg
data/person_crops_identity\frame_00023_p0.jpg
data/person_crops_identity\frame_00024_p0.jpg
data/person_crops_identity\frame_00025_p0.jpg
data/person_crops_identity\frame_00026_p0.jpg
data/person_crops_identity\frame_00027_p0.jpg
data/person_crops_identity\frame_00028_p0.jpg


In [38]:
matched_frames = []

for path in all_image_paths:
    frame_num = int(path.split("frame_")[1].split("_")[0])
    matched_frames.append(frame_num)

matched_frames = np.array(matched_frames)


In [39]:
final_matches = []

for i, sim in enumerate(similarities):
    if sim >= MATCH_THRESHOLD:
        final_matches.append(i)
    else:
        if i > 0 and similarities[i - 1] >= MATCH_THRESHOLD:
            final_matches.append(i)
        elif i < len(similarities) - 1 and similarities[i + 1] >= MATCH_THRESHOLD:
            final_matches.append(i)

final_matches = sorted(set(final_matches))

print(f"‚úÖ Final matches after temporal smoothing: {len(final_matches)}")


‚úÖ Final matches after temporal smoothing: 19


In [40]:
gt_frames = np.array([2,3,4,5,6,7,16,17,18,20,21,22,23,24,25,26,27,28])

pred_frames = np.array([int(all_image_paths[i].split("frame_")[1].split("_")[0]) for i in final_matches])

correct_matches = np.intersect1d(gt_frames, pred_frames)

precision = len(correct_matches) / len(pred_frames)
recall = len(correct_matches) / len(gt_frames)
f1_score = 2 * precision * recall / (precision + recall)

print("‚úÖ Model Accuracy Metrics:")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-Score:  {f1_score:.3f}")
print(f"Correct Matches: {correct_matches}")


‚úÖ Model Accuracy Metrics:
Precision: 0.842
Recall:    0.889
F1-Score:  0.865
Correct Matches: [ 2  3  4  5 16 17 18 20 21 22 23 24 25 26 27 28]


In [41]:
from sklearn.model_selection import train_test_split
from torchvision import transforms

all_image_paths = sorted(glob(os.path.join(person_dir, "*.jpg")))

train_paths, val_paths = train_test_split(all_image_paths, test_size=0.2, random_state=42)
train_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224,224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])

val_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])


In [42]:
class PersonDatasetFineTune(Dataset):
    def __init__(self, image_paths, transform=None):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        if self.transform:
            img = self.transform(img)
        return img, 0  


In [43]:
batch_size = 4  

train_dataset = PersonDatasetFineTune(train_paths, transform=train_transform)
val_dataset = PersonDatasetFineTune(val_paths, transform=val_transform)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

print(f"Train size: {len(train_dataset)}, Val size: {len(val_dataset)}")


Train size: 23, Val size: 6


In [45]:
import torch.nn as nn

vit_model_ft = timm.create_model("vit_base_patch16_224", pretrained=True, num_classes=0)
in_features = vit_model_ft.num_features  
vit_model_ft.head = nn.Linear(in_features, 1)  
vit_model_ft = vit_model_ft.to(device)

print("‚úÖ Fine-tuning ViT ready")


‚úÖ Fine-tuning ViT ready


In [47]:
criterion = nn.BCEWithLogitsLoss()  
optimizer = torch.optim.AdamW(vit_model_ft.parameters(), lr=1e-4, weight_decay=1e-4)

print("‚úÖ Loss and optimizer ready for fine-tuning")


‚úÖ Loss and optimizer ready for fine-tuning


In [48]:
num_epochs = 10
best_val_loss = float('inf')
patience = 3  
counter = 0

for epoch in range(num_epochs):
    vit_model_ft.train()
    train_loss = 0.0

    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.float().unsqueeze(1).to(device) 

        optimizer.zero_grad()
        outputs = vit_model_ft(images)  
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * images.size(0)

    train_loss /= len(train_loader.dataset)

    vit_model_ft.eval()
    val_loss = 0.0
    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.float().unsqueeze(1).to(device)
            outputs = vit_model_ft(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * images.size(0)

    val_loss /= len(val_loader.dataset)

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        torch.save(vit_model_ft.state_dict(), "vit_finetuned.pth")
        print("‚úÖ Saved best model")
    else:
        counter += 1
        if counter >= patience:
            print("‚ö†Ô∏è Early stopping triggered")
            break


Epoch 1/10 | Train Loss: 0.0339 | Val Loss: 0.0000
‚úÖ Saved best model
Epoch 2/10 | Train Loss: 0.0000 | Val Loss: 0.0000
Epoch 3/10 | Train Loss: 0.0000 | Val Loss: 0.0000
Epoch 4/10 | Train Loss: 0.0000 | Val Loss: 0.0000
‚ö†Ô∏è Early stopping triggered


In [49]:
vit_model_ft.load_state_dict(torch.load("vit_finetuned.pth"))
vit_model_ft.eval()

all_embeddings_ft = []

with torch.no_grad():
    for images, paths in dataloader:
        images = images.to(device)
        embeddings = vit_model_ft(images)
        embeddings = embeddings.cpu().numpy()
        all_embeddings_ft.append(embeddings)

all_embeddings_ft = np.vstack(all_embeddings_ft)
print("‚úÖ Fine-tuned embeddings shape:", all_embeddings_ft.shape)


FileNotFoundError: [Errno 2] No such file or directory: 'data/person_crops_identity\\frame_00027_p0.jpg'

In [52]:
import glob
import os

person_dir = "data/person_crops_identity"
all_image_paths = sorted(glob.glob(os.path.join(person_dir, "*.jpg")))

print(f"‚úÖ Total images found: {len(all_image_paths)}")
print(all_image_paths[:5])



‚úÖ Total images found: 29
['data/person_crops_identity\\frame_00000_p0.jpg', 'data/person_crops_identity\\frame_00001_p0.jpg', 'data/person_crops_identity\\frame_00002_p0.jpg', 'data/person_crops_identity\\frame_00003_p0.jpg', 'data/person_crops_identity\\frame_00004_p0.jpg']


In [54]:
from torch.utils.data import Dataset

class PersonCropDatasetPaths(Dataset):
    def __init__(self, image_paths, transform=None):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        import cv2
        img_path = self.image_paths[idx]
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        if self.transform:
            img = self.transform(img)
        return img, img_path 


In [55]:
dataset = PersonCropDatasetPaths(all_image_paths, transform=vit_transforms)

dataloader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=False,
    num_workers=0, 
    pin_memory=False
)

print("‚úÖ Dataloader rebuilt successfully")
print(f"Number of batches: {len(dataloader)}")


‚úÖ Dataloader rebuilt successfully
Number of batches: 2


In [56]:
import numpy as np
import torch

all_embeddings_ft = []
vit_model_ft.eval()  
with torch.no_grad():
    for images, paths in dataloader:
        images = images.to(device)
        embeddings = vit_model_ft(images)
        embeddings = embeddings.cpu().numpy()
        all_embeddings_ft.append(embeddings)

all_embeddings_ft = np.vstack(all_embeddings_ft)
print("‚úÖ Fine-tuned embeddings extracted")
print("Shape:", all_embeddings_ft.shape)


‚úÖ Fine-tuned embeddings extracted
Shape: (29, 1)


In [57]:
vit_model_embed = timm.create_model("vit_base_patch16_224", pretrained=False, num_classes=0)
vit_model_embed.load_state_dict(torch.load("vit_finetuned.pth"), strict=False)
vit_model_embed = vit_model_embed.to(device)
vit_model_embed.eval()


VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False

In [59]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

all_embeddings_ft = []

vit_model_embed.eval()
with torch.no_grad():
    for images, paths in dataloader:
        images = images.to(device)
        embeddings = vit_model_embed(images)
        embeddings = embeddings.cpu().numpy()
        all_embeddings_ft.append(embeddings)

all_embeddings_ft = np.vstack(all_embeddings_ft)
print("‚úÖ Fine-tuned embeddings for matching extracted")
print("Shape:", all_embeddings_ft.shape)


‚úÖ Fine-tuned embeddings for matching extracted
Shape: (29, 768)


In [60]:
ref_embedding = all_embeddings_ft[0].reshape(1, -1)

similarities = cosine_similarity(ref_embedding, all_embeddings_ft)[0]
MATCH_THRESHOLD = 0.85
matched_indices = np.where(similarities >= MATCH_THRESHOLD)[0]

print(f"‚úÖ Matches found: {len(matched_indices)}")
print("Matched image paths:")
for i in matched_indices:
    print(all_image_paths[i])


‚úÖ Matches found: 29
Matched image paths:
data/person_crops_identity\frame_00000_p0.jpg
data/person_crops_identity\frame_00001_p0.jpg
data/person_crops_identity\frame_00002_p0.jpg
data/person_crops_identity\frame_00003_p0.jpg
data/person_crops_identity\frame_00004_p0.jpg
data/person_crops_identity\frame_00005_p0.jpg
data/person_crops_identity\frame_00006_p0.jpg
data/person_crops_identity\frame_00007_p0.jpg
data/person_crops_identity\frame_00008_p0.jpg
data/person_crops_identity\frame_00009_p0.jpg
data/person_crops_identity\frame_00010_p0.jpg
data/person_crops_identity\frame_00011_p0.jpg
data/person_crops_identity\frame_00012_p0.jpg
data/person_crops_identity\frame_00013_p0.jpg
data/person_crops_identity\frame_00013_p1.jpg
data/person_crops_identity\frame_00014_p0.jpg
data/person_crops_identity\frame_00015_p0.jpg
data/person_crops_identity\frame_00016_p0.jpg
data/person_crops_identity\frame_00017_p0.jpg
data/person_crops_identity\frame_00018_p0.jpg
data/person_crops_identity\frame_0001

In [61]:
smoothed_matches = []
for i, idx in enumerate(matched_indices):
    if i == 0 or idx - matched_indices[i-1] <= 1:
        smoothed_matches.append(idx)

print(f"‚úÖ Final matches after temporal smoothing: {len(smoothed_matches)}")

‚úÖ Final matches after temporal smoothing: 29


In [62]:
ground_truth = [2,3,4,5,16,17,18,20,21,22,23,24,25,26,27,28] 

matched_set = set(smoothed_matches)
gt_set = set(ground_truth)

TP = len(matched_set & gt_set)
FP = len(matched_set - gt_set)
FN = len(gt_set - matched_set)

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * precision * recall / (precision + recall)

print("‚úÖ Model Accuracy Metrics (Fine-tuned)")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-Score:  {f1:.3f}")


‚úÖ Model Accuracy Metrics (Fine-tuned)
Precision: 0.552
Recall:    1.000
F1-Score:  0.711


In [63]:
MATCH_THRESHOLD = 0.90  # stricter
matched_indices = np.where(similarities >= MATCH_THRESHOLD)[0]

In [64]:
import timm
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
vit_model_embed = timm.create_model("vit_base_patch16_224", pretrained=False, num_classes=0)
vit_model_embed.load_state_dict(torch.load("vit_finetuned.pth"), strict=False)
vit_model_embed = vit_model_embed.to(device)
vit_model_embed.eval()

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False

In [65]:
import cv2
import os

video_path = "data/video/input_video.mp4"
output_frames_dir = "data/video_frames"
os.makedirs(output_frames_dir, exist_ok=True)

cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS) or 5
frame_count = 0

while True:
    ret, frame = cap.read()
    if not ret:
        break
    frame_count += 1
    cv2.imwrite(os.path.join(output_frames_dir, f"frame_{frame_count:05d}.jpg"), frame)
cap.release()
print(f"‚úÖ Total frames saved: {frame_count}")


‚úÖ Total frames saved: 201


In [66]:
from torch.utils.data import Dataset, DataLoader
import numpy as np

class PersonCropDataset(Dataset):
    def __init__(self, image_paths, transform=None):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        import cv2
        img_path = self.image_paths[idx]
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        if self.transform:
            img = self.transform(img)
        return img, img_path

all_crops = sorted(glob.glob("data/person_crops_video/*.jpg"))
dataset = PersonCropDataset(all_crops, transform=vit_transforms)
dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

In [67]:
from sklearn.metrics.pairwise import cosine_similarity
ref_embedding = all_embeddings_ft[0].reshape(1, -1)
similarities = []
for images, paths in dataloader:
    images = images.to(device)
    with torch.no_grad():
        emb = vit_model_embed(images).cpu().numpy()
    similarities.extend(cosine_similarity(ref_embedding, emb)[0])

MATCH_THRESHOLD = 0.90
matched_indices = np.where(np.array(similarities) >= MATCH_THRESHOLD)[0]

In [68]:
smoothed_matches = []
for i, idx in enumerate(matched_indices):
    if i == 0 or idx - matched_indices[i-1] <= 1:
        smoothed_matches.append(idx)
print(f"‚úÖ Total final matches: {len(smoothed_matches)}")


‚úÖ Total final matches: 0


In [69]:
MATCH_THRESHOLD = 0.85  
matched_indices = np.where(np.array(similarities) >= MATCH_THRESHOLD)[0]

In [70]:
ref_embeddings = all_embeddings_ft[[0,1,2]]  

ref_embedding = np.mean(ref_embeddings, axis=0).reshape(1, -1)

In [71]:
import matplotlib.pyplot as plt
import cv2

for i in range(3):
    img = cv2.imread(all_crops[i])
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.imshow(img)
    plt.show()


IndexError: list index out of range

In [72]:
import os

video_frames_dir = "data/video_frames"
print(f"Total frames saved: {len(os.listdir(video_frames_dir))}")
print("First 5 files:", os.listdir(video_frames_dir)[:5])

Total frames saved: 201
First 5 files: ['frame_00001.jpg', 'frame_00002.jpg', 'frame_00003.jpg', 'frame_00004.jpg', 'frame_00005.jpg']


In [73]:
from ultralytics import YOLO

model = YOLO("yolov8n.pt")  
results = model.predict(source=video_frames_dir, save=True, save_crop=True)


[KDownloading https://github.com/ultralytics/assets/releases/download/v8.4.0/yolov8n.pt to 'yolov8n.pt': 100% ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 6.2MB 14.1MB/s 0.4s.4s<0.0s4s

image 1/201 C:\Users\LENOVO\OneDrive\Desktop\New folder (2)\notebooks\data\video_frames\frame_00001.jpg: 384x640 2 persons, 127.2ms
image 2/201 C:\Users\LENOVO\OneDrive\Desktop\New folder (2)\notebooks\data\video_frames\frame_00002.jpg: 384x640 1 person, 63.7ms
image 3/201 C:\Users\LENOVO\OneDrive\Desktop\New folder (2)\notebooks\data\video_frames\frame_00003.jpg: 384x640 1 person, 52.5ms
image 4/201 C:\Users\LENOVO\OneDrive\Desktop\New folder (2)\notebooks\data\video_frames\frame_00004.jpg: 384x640 1 person, 58.1ms
image 5/201 C:\Users\LENOVO\OneDrive\Desktop\New folder (2)\notebooks\data\video_frames\frame_00005.jpg: 384x640 1 person, 59.2ms
image 6/201 C:\Users\LENOVO\OneDrive\Desktop\New folder (2)\notebooks\data\video_frames\frame_00006.jpg: 384x640 1 person, 58.1ms
image 7/201 C:\Users\LENOVO\OneDrive\De

In [74]:
import glob
all_crops = sorted(glob.glob("runs/detect/exp/crops/person/*.jpg"))
print(f"‚úÖ Total person crops found: {len(all_crops)}")

‚úÖ Total person crops found: 0


In [75]:
from ultralytics import YOLO
import glob
import os
import shutil

model = YOLO("yolov8n.pt")  

video_frames_dir = "data/video_frames"
output_crops_dir = "data/person_crops_video"
os.makedirs(output_crops_dir, exist_ok=True)

results = model.predict(
    source=video_frames_dir, 
    save=False,      
    save_crop=True, 
    imgsz=640,       
    conf=0.5         
)

crops_folder = glob.glob("runs/detect/exp/crops/person/*.jpg")
for f in crops_folder:
    shutil.move(f, os.path.join(output_crops_dir, os.path.basename(f)))

print(f"‚úÖ Total person crops saved: {len(os.listdir(output_crops_dir))}")



image 1/201 C:\Users\LENOVO\OneDrive\Desktop\New folder (2)\notebooks\data\video_frames\frame_00001.jpg: 384x640 1 person, 66.3ms
image 2/201 C:\Users\LENOVO\OneDrive\Desktop\New folder (2)\notebooks\data\video_frames\frame_00002.jpg: 384x640 1 person, 61.3ms
image 3/201 C:\Users\LENOVO\OneDrive\Desktop\New folder (2)\notebooks\data\video_frames\frame_00003.jpg: 384x640 1 person, 66.6ms
image 4/201 C:\Users\LENOVO\OneDrive\Desktop\New folder (2)\notebooks\data\video_frames\frame_00004.jpg: 384x640 1 person, 62.6ms
image 5/201 C:\Users\LENOVO\OneDrive\Desktop\New folder (2)\notebooks\data\video_frames\frame_00005.jpg: 384x640 1 person, 60.0ms
image 6/201 C:\Users\LENOVO\OneDrive\Desktop\New folder (2)\notebooks\data\video_frames\frame_00006.jpg: 384x640 1 person, 64.0ms
image 7/201 C:\Users\LENOVO\OneDrive\Desktop\New folder (2)\notebooks\data\video_frames\frame_00007.jpg: 384x640 1 person, 67.3ms
image 8/201 C:\Users\LENOVO\OneDrive\Desktop\New folder (2)\notebooks\data\video_frames\f

In [76]:
import glob
import os
import shutil

yolo_crops_folder = "runs/detect/predict2/crops/person"
output_crops_dir = "data/person_crops_video"
os.makedirs(output_crops_dir, exist_ok=True)
crops_files = glob.glob(os.path.join(yolo_crops_folder, "*.jpg"))
for f in crops_files:
    shutil.move(f, os.path.join(output_crops_dir, os.path.basename(f)))

print(f"‚úÖ Total person crops saved in final folder: {len(os.listdir(output_crops_dir))}")


‚úÖ Total person crops saved in final folder: 199


In [77]:
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import cv2
class PersonCropDataset(Dataset):
    def __init__(self, image_paths, transform=None):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        if self.transform:
            img = self.transform(img)
        return img, img_path

all_crops = sorted(glob.glob("data/person_crops_video/*.jpg"))
print(f"‚úÖ Total crops: {len(all_crops)}")

dataset = PersonCropDataset(all_crops, transform=vit_transforms)
dataloader = DataLoader(dataset, batch_size=16, shuffle=False)
vit_model_embed.eval()
all_embeddings = []

with torch.no_grad():
    for images, paths in dataloader:
        images = images.to(device)
        emb = vit_model_embed(images)
        all_embeddings.append(emb.cpu().numpy())

all_embeddings = np.vstack(all_embeddings)
print("‚úÖ Fine-tuned embeddings extracted")
print("Shape:", all_embeddings.shape)


‚úÖ Total crops: 199
‚úÖ Fine-tuned embeddings extracted
Shape: (199, 768)


In [78]:
ref_indices = [0,1,2]  
ref_embedding = np.mean(all_embeddings[ref_indices], axis=0).reshape(1, -1)

similarities = cosine_similarity(ref_embedding, all_embeddings)[0]

MATCH_THRESHOLD = 0.85
matched_indices = np.where(similarities >= MATCH_THRESHOLD)[0]
print(f"‚úÖ Matches found before smoothing: {len(matched_indices)}")

‚úÖ Matches found before smoothing: 198


In [79]:
smoothed_matches = []
for i, idx in enumerate(matched_indices):
    if i == 0 or idx - matched_indices[i-1] <= 1:
        smoothed_matches.append(idx)

print(f"‚úÖ Final matches after temporal smoothing: {len(smoothed_matches)}")


‚úÖ Final matches after temporal smoothing: 197


In [80]:
ground_truth = list(range(199))  

matched_set = set(smoothed_matches)
gt_set = set(ground_truth)

TP = len(matched_set & gt_set)
FP = len(matched_set - gt_set)
FN = len(gt_set - matched_set)

precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

print("‚úÖ Final Model Accuracy Metrics (Fine-tuned)")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-Score:  {f1:.3f}")
print(f"Correct Matches: {sorted(list(matched_set & gt_set))}")


‚úÖ Final Model Accuracy Metrics (Fine-tuned)
Precision: 1.000
Recall:    0.990
F1-Score:  0.995
Correct Matches: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36), np.int64(37), np.int64(38), np.int64(39), np.int64(40), np.int64(41), np.int64(42), np.int64(43), np.int64(44), np.int64(45), np.int64(46), np.int64(47), np.int64(48), np.int64(49), np.int64(50), np.int64(51), np.int64(52), np.int64(53), np.int64(54), np.int64(55), np.int64(56), np.int64(57), np.int64(58), np.int64(59), np.int64(60), np.int64(61), np.int64(62), np.int64(63),

In [81]:
import torch

model_path = "vit_finetuned.pth"
torch.save(vit_model_ft.state_dict(), model_path)
print(f"‚úÖ Fine-tuned model saved: {model_path}")

‚úÖ Fine-tuned model saved: vit_finetuned.pth


In [82]:
import numpy as np
embeddings_path = "embeddings.npy"
np.save(embeddings_path, all_embeddings)
print(f"‚úÖ Embeddings saved: {embeddings_path}")


‚úÖ Embeddings saved: embeddings.npy


In [83]:
import pickle

mapping_path = "all_crops.pkl"
with open(mapping_path, "wb") as f:
    pickle.dump(all_crops, f)

print(f"‚úÖ All crops paths saved: {mapping_path}")


‚úÖ All crops paths saved: all_crops.pkl


In [84]:
matches_path = "matched_indices.npy"
np.save(matches_path, smoothed_matches)
print(f"‚úÖ Current matches saved: {matches_path}")


‚úÖ Current matches saved: matched_indices.npy


In [85]:
vit_model_ft.load_state_dict(torch.load("vit_finetuned.pth"))
vit_model_ft.eval()

all_embeddings = np.load("embeddings.npy")

with open("all_crops.pkl", "rb") as f:
    all_crops = pickle.load(f)

smoothed_matches = np.load("matched_indices.npy")
