In [1]:
import cv2 as cv
import sys
import os
import mediapipe as mp
from transformers import ViTForImageClassification, ViTFeatureExtractor
from PIL import Image
import torch
from torchvision import transforms
import numpy as np
from pathlib import Path
from torch.nn import functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_path = "../model/checkpoint-1291"
model = ViTForImageClassification.from_pretrained(model_path)
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')



In [3]:
model.eval()

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

In [4]:
classes = [label for label in os.listdir(Path("..\\datasets\ASL\ASL_train"))]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

In [5]:
def return_prob_clas(cv2_image):

    def get_image(cv2_img):
        preprocess = transforms.Compose([
        transforms.Resize((224,224)),
        transforms.ToTensor(),
        ])
        image = Image.fromarray(cv2_img)
        image = preprocess(image)
        image = image.unsqueeze(0)
        return image
    image = get_image(cv2_image)
    image =image.to(device)
    with torch.no_grad():
        outputs = model(image)
    probabilities= F.softmax(outputs.logits,dim=1)
    predicted_class = probabilities.argmax(-1).item()
    prediction = probabilities[0,predicted_class].item()
    return classes[predicted_class],prediction


In [6]:
def crop(frame,y_min,y_max,x_min,x_max):
    return frame[ y_min - 35:y_max + 35,x_min - 35:x_max + 35]

In [7]:
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_hands = mp.solutions.hands

#Capturar manos
hands = mp_hands.Hands()

In [8]:
s= 0
if len(sys.argv)> 1:
    s= sys.argv[1]
print(s)
source= cv.VideoCapture(0)

win_name= 'Camera Preview'
cv.namedWindow(win_name, cv.WINDOW_NORMAL)

is_capturing = False
#Enlazar manos al landmark
while cv.waitKey(1)!= 27:# Escape
    has_frame, frame= source.read()
    frame = cv.flip(frame,1)
    if not has_frame:
        break
    results = hands.process(frame)
    if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                x_min = y_min = float('inf')
                x_max = y_max = float('-inf')
            
                for lm in hand_landmarks.landmark:
                    x, y = int(lm.x * frame.shape[1]), int(lm.y * frame.shape[0])
                    x_min = min(x_min, x)
                    y_min = min(y_min, y)
                    x_max = max(x_max, x)
                    y_max = max(y_max, y)
                # Dibujar bbox de la mano
                cv.rectangle(frame, (x_min - 35, y_min - 35), (x_max + 35, y_max + 35), (255, 255, 0), 2)
                #Mostrar prediccion cuando se esta capturando
                if is_capturing:
                    try:
                        cropped_frame = crop(frame=frame,y_min=y_min,y_max=y_max,x_min=x_min,x_max=x_max)
                        label,prediction = return_prob_clas(cropped_frame)
                        #Aqui se capturara el frame que el transformer analizara
                        cv.putText(frame,f"Letra: {label[0]}  Lengua de Signos: {label[2:]} Probabilidad: {prediction}",(x_min - 35, y_min - 35),cv.FONT_HERSHEY_SIMPLEX,0.5,(255,0,0),2)
                    except Exception as e:
                        print(e)
                        cv.putText(frame,f"Error al procesar",(x_min - 35, y_min - 35),cv.FONT_HERSHEY_SIMPLEX,0.5,(255,0,0),2)
    if  not has_frame:
       break
    if is_capturing:
        cv.putText(frame,'Capturando...',(50,50),cv.FONT_HERSHEY_SIMPLEX, 0.9, (255, 0, 0), 2)
    else:
        cv.putText(frame,f'Modelo en uso: {model_path[8:]}',(50,50),cv.FONT_HERSHEY_SIMPLEX,0.6,(255, 0, 0), 2)
    key = cv.waitKey(1) & 0xFF
    if key == ord('c'):
        is_capturing = True
    elif key == ord('v'):
        is_capturing = False
    cv.imshow(win_name, frame)
source.release()
cv.destroyWindow(win_name)

--f=c:\Users\aaron\AppData\Roaming\jupyter\runtime\kernel-v2-28164E5cu2yXcoVzY.json


  context_layer = torch.nn.functional.scaled_dot_product_attention(
