In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image
import time
import timm

In [54]:
class Net(nn.Module):
    """Small LeNet-style CNN classifier for 3-channel images.

    The first fully connected layer expects ``16 * 53 * 53`` input features,
    which is what two (5x5 conv -> 2x2 max-pool) stages produce from a
    224x224 input (224 -> 220 -> 110 -> 106 -> 53). Other input sizes will
    fail at ``fc1``.

    Args:
        num_classes: Number of output logits. Defaults to 12, the value that
            was previously hard-coded, so existing checkpoints still load.
    """

    def __init__(self, num_classes=12):
        super().__init__()
        # Attribute names and creation order are kept as-is so state_dict
        # keys (and seeded initialisation) stay compatible with old runs.
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 53 * 53, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)

    def forward(self, x):
        """Return raw class logits of shape ``(batch, num_classes)``.

        Args:
            x: Image batch of shape ``(batch, 3, 224, 224)``.
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Keep the batch dimension, flatten channels and spatial dims.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # No activation on the last layer: callers receive logits.
        return self.fc3(x)

In [55]:
class MyLitModel(nn.Module):
    """Wrapper module that stores a ``Net`` instance as ``self.model``.

    Because the network is registered under the attribute ``model``, its
    parameters show up in ``state_dict()`` with a ``model.`` prefix.
    """

    def __init__(self):
        super().__init__()
        self.model = Net()

    def forward(self, x):
        """Pass ``x`` through the wrapped network and return its output."""
        wrapped = self.model
        return wrapped(x)

In [62]:
class LitEva02(nn.Module):
    """Wrapper around timm's ``eva02_tiny_patch14_336`` classifier.

    The backbone is stored as ``self.model``, so its parameters appear in
    ``state_dict()`` with a ``model.`` prefix.

    Args:
        num_classes: Size of the classification head passed to timm.
        lr: Accepted for signature compatibility; not used by this class.
        pretrained: Forwarded to ``timm.create_model``. Defaults to ``True``
            (the previous hard-coded value). Pass ``False`` to skip fetching
            pretrained weights, e.g. when a checkpoint is loaded right after.
    """

    def __init__(self, num_classes=12, lr=0.001, pretrained=True):
        super().__init__()
        self.model = timm.create_model(
            'eva02_tiny_patch14_336.mim_in22k_ft_in1k',
            pretrained=pretrained,
            num_classes=num_classes,
        )

    def forward(self, x):
        """Return the backbone's output for the image batch ``x``."""
        return self.model(x)

In [63]:
class LitEffnet(nn.Module):
    """Wrapper around timm's ``efficientnet_b2.ra_in1k`` classifier.

    The backbone is stored as ``self.model``, so its parameters appear in
    ``state_dict()`` with a ``model.`` prefix.

    Args:
        num_classes: Size of the classification head passed to timm.
        lr: Accepted for signature compatibility; not used by this class.
        pretrained: Forwarded to ``timm.create_model``. Defaults to ``True``
            (the previous hard-coded value). Pass ``False`` to skip fetching
            pretrained weights, e.g. when a checkpoint is loaded right after.
    """

    def __init__(self, num_classes=12, lr=0.001, pretrained=True):
        super().__init__()
        self.model = timm.create_model(
            'efficientnet_b2.ra_in1k',
            pretrained=pretrained,
            num_classes=num_classes,
        )

    def forward(self, x):
        """Return the backbone's output for the image batch ``x``."""
        return self.model(x)

In [64]:
# Build the EVA02 wrapper and restore the fine-tuned weights on CPU.
model = LitEva02()
# Raw string so the Windows backslashes are taken literally; the previous
# non-raw literal relied on invalid escape sequences such as "\C".
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
checkpoint = torch.load(r'D:\Chord_Data\Chords\chord-eva02_no-sha-cnn-epoch=13-val_loss=0.00.ckpt', map_location=torch.device('cpu'))
# The checkpoint stores the weights under the 'state_dict' key.
model.load_state_dict(checkpoint['state_dict'])
# Switch to inference mode (kept as the last expression so the notebook
# still displays the module summary).
model.eval()

LitEva02(
  (model): Eva(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 192, kernel_size=(14, 14), stride=(14, 14))
      (norm): Identity()
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (rope): RotaryEmbeddingCat()
    (blocks): ModuleList(
      (0): EvaBlock(
        (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
        (attn): EvaAttention(
          (qkv): Linear(in_features=192, out_features=576, bias=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (norm): Identity()
          (proj): Linear(in_features=192, out_features=192, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path1): Identity()
        (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
        (mlp): GluMlp(
          (fc1): Linear(in_features=192, out_features=1024, bias=True)
          (act): SiLU()
          (drop1): Dropout(p=0.0, inplace=False)
          (norm): Identity()
          (fc2): Linear(in_fea

In [35]:
# Preprocessing for 224x224 inputs: resize, convert to a tensor, then
# normalize every channel with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# Preprocessing for 336x336 inputs (passed to the timing loop for the
# pretrained model): resize, convert to a tensor, then normalize with
# per-channel statistics.
# NOTE(review): these mean/std values look like rounded CLIP statistics;
# confirm they match the pretrained model's expected normalization.
transform_pretrained = transforms.Compose(
    [
        transforms.Resize((336, 336)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=torch.tensor([0.4815, 0.4578, 0.4082]),
            std=torch.tensor([0.2686, 0.2613, 0.2758]),
        ),
    ]
)
def preprocess_single_image(image_path, transform):
    """Load one image from disk and turn it into a single-item batch.

    Args:
        image_path: Path of the image file to open.
        transform: Callable applied to the RGB PIL image; it must return a
            tensor (``unsqueeze`` is called on its result).

    Returns:
        The transformed tensor with a leading batch dimension of size 1.
    """
    # The context manager closes the underlying file as soon as the pixel
    # data has been converted, instead of leaving that to garbage collection.
    with Image.open(image_path) as source_image:
        input_image = source_image.convert('RGB')
    input_tensor = transform(input_image)
    return input_tensor.unsqueeze(0)

In [65]:
import os

# Measure the average forward-pass latency of `model` on images from disk.
image_folder = r'D:\Chord_Data\Chords\Img_no_sha\1\\'
image_files = sorted(os.listdir(image_folder))
total_time = 0
num_images = 100

# Time at most `num_images` files; slicing avoids an IndexError when the
# folder holds fewer files than requested.
timed_files = image_files[:num_images]
for image_name in timed_files:
    image_path = os.path.join(image_folder, image_name)
    # Loading and preprocessing happen outside the timed region.
    input_batch = preprocess_single_image(image_path, transform_pretrained)
    with torch.no_grad():
        # perf_counter is the high-resolution clock intended for measuring
        # short durations.
        start_time = time.perf_counter()
        output = model(input_batch)
        end_time = time.perf_counter()
    elapsed_time = (end_time - start_time) * 1000
    total_time += elapsed_time

# Calculate average runtime over the images that were actually timed.
if timed_files:
    average_time = total_time / len(timed_files)
    print(f"Average Inference Time: {average_time:.6f} milliseconds")
else:
    average_time = float("nan")
    print(f"No images found in {image_folder}")

Average Inference Time: 354.181051 milliseconds


In [19]:
import mediapipe as mp
import numpy as np
import joblib
from google.protobuf.json_format import MessageToDict
import sklearn
import os
import pandas as pd

In [4]:
# Load the fitted model from disk. A raw string keeps the Windows
# backslashes literal; the previous non-raw literal relied on invalid escape
# sequences such as "\C" and "\A".
# NOTE: joblib.load unpickles the file, so only load model files you trust.
rf_model = joblib.load(r'D:\Chord_Data\Angle_Open\ang_open_random_forest_1.pkl')

In [5]:
# MediaPipe hand-landmark detector, limited to one hand per image.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=True,
    max_num_hands=1,
    min_detection_confidence=0.5,
)

In [27]:
import cv2
# Open video capture device 0 (typically the default webcam); frames are read
# from `cap` by the timing loop below, which also releases it.
cap = cv2.VideoCapture(0)

In [29]:
total_time = 0
num_frames = 100
frame_count = 0

# Landmarks fed to the classifier. In MediaPipe's hand model these indices
# are the wrist (0), thumb CMC (1), index-finger MCP (5) and pinky MCP (17).
feature_landmark_ids = [0, 1, 5, 17]

# Column names, built once: all x columns, then all y, then all z.
# NOTE(review): the feature VALUES below are ordered per landmark
# (x1, y1, z1, x2, ...), which does not line up with these column names.
# The original behaviour is preserved here; confirm against the training
# pipeline before changing either order.
feature_names = [f'Landmark_{i}_{axis}' for axis in 'xyz' for i in range(1, 5)]


def _landmark_features(hand_landmarks, frame_width, frame_height):
    """Turn one detected hand into the flat feature list for the classifier.

    Coordinates are scaled to the frame, translated so landmark 0 is the
    origin, and divided by the x-distance between landmarks 5 and 17.

    Returns:
        A list of 12 floats (x, y, z for each selected landmark), or ``None``
        when the scale factor is zero and normalisation is impossible.
    """
    # NOTE(review): the 3/4 factor on x presumably compensates for a 4:3
    # frame aspect ratio; confirm against the training preprocessing.
    points = [
        (lm['x'] * frame_width * 3/4, lm['y'] * frame_height, lm['z'])
        for lm in MessageToDict(hand_landmarks)["landmark"]
    ]
    x0, y0, z0 = points[0]
    centered = [(x - x0, y - y0, z - z0) for x, y, z in points]

    scale_factor = centered[17][0] - centered[5][0]
    if scale_factor == 0:
        # Degenerate pose: dividing would raise ZeroDivisionError.
        return None

    features = []
    for idx in feature_landmark_ids:
        x, y, z = centered[idx]
        features.extend([x / scale_factor, y / scale_factor, z / scale_factor])
    return features


try:
    while frame_count < num_frames:
        ret, frame = cap.read()
        if not ret:
            print("Error: Failed to capture frame")
            break
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)
        if not results.multi_hand_landmarks:
            # No hand detected: grab the next frame without counting this one.
            continue

        frame_height, frame_width = frame.shape[:2]
        for hand_landmarks in results.multi_hand_landmarks:
            features = _landmark_features(hand_landmarks, frame_width, frame_height)
            if features is None:
                continue
            feature_df = pd.DataFrame([features], columns=feature_names)

            # Perform inference; only the predict call is timed.
            # (`xgb_output` keeps its historical name; it holds the
            # prediction of `rf_model`.)
            start_time = time.perf_counter()
            xgb_output = rf_model.predict(feature_df)
            end_time = time.perf_counter()
            elapsed_time = (end_time - start_time) * 1000
            total_time += elapsed_time
            print(xgb_output)

            frame_count += 1
            if frame_count >= num_frames:
                break
finally:
    # Release the video capture device even if the loop raised.
    cap.release()

# Average over the predictions that were actually timed, which can be fewer
# than num_frames when frame capture fails early.
if frame_count:
    average_time = total_time / frame_count
    print(f"Average Inference Time: {average_time:.6f} milliseconds")
else:
    average_time = float("nan")
    print("Average Inference Time: no frames were timed")

[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
[1]
Average Inference Time: 37.953308 milliseconds
