In [1]:
# Instalación de dependencias necesarias
!pip install mediapipe opencv-python tensorflow pandas scikit-learn

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Step 2: import the required libraries
import cv2
import mediapipe as mp
import time
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Step 3: hand-detection module
class handDetector():
    """Wrapper around the MediaPipe ``Hands`` solution.

    ``findHands`` runs detection on a frame and stores the result on the
    instance; ``findPosition`` converts the stored landmarks of one hand to
    pixel coordinates.

    Args:
        mode: Forwarded to ``Hands`` as ``static_image_mode``.
        maxHands: Forwarded as ``max_num_hands``.
        detectionCon: Forwarded as ``min_detection_confidence``.
        trackCon: Forwarded as ``min_tracking_confidence``.
    """

    def __init__(self, mode=False, maxHands=2, detectionCon=0.5, trackCon=0.5):
        self.mode = mode
        self.maxHands = maxHands
        self.detectionCon = detectionCon
        self.trackCon = trackCon

        self.mpHands = mp.solutions.hands
        self.hands = self.mpHands.Hands(static_image_mode=self.mode, max_num_hands=self.maxHands,
                                        min_detection_confidence=self.detectionCon, min_tracking_confidence=self.trackCon)
        self.mpDraw = mp.solutions.drawing_utils
        # No frame has been processed yet; findPosition treats None as "no hands".
        self.results = None

    def findHands(self, img, draw=True):
        """Run hand detection on ``img`` and remember the result.

        The frame is converted from BGR to RGB before it is handed to
        MediaPipe. When ``draw`` is true, every detected hand is passed to
        ``draw_landmarks`` together with ``img``.

        Returns:
            The ``img`` object that was passed in.
        """
        imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        self.results = self.hands.process(imgRGB)

        detected = self.results.multi_hand_landmarks
        if draw and detected:
            for handLms in detected:
                self.mpDraw.draw_landmarks(img, handLms,
                                           self.mpHands.HAND_CONNECTIONS)
        return img

    def findPosition(self, img, handNo=0, draw=True):
        """Return the landmarks of one detected hand as pixel coordinates.

        Args:
            img: Frame whose height/width are used to scale the landmarks
                (2-D grayscale and 3-D colour frames are both accepted).
            handNo: Index into the hands found by the last ``findHands`` call.
            draw: When true, draw a filled circle on ``img`` at each landmark.

        Returns:
            A list of ``[landmark_index, cx, cy]`` entries. The list is empty
            when ``findHands`` has not run yet, when no hand was detected, or
            when ``handNo`` does not refer to a detected hand.
        """
        lmList = []
        hands = self.results.multi_hand_landmarks if self.results is not None else None
        if not hands:
            return lmList
        # Same valid range as list indexing, but report "no such hand" as an
        # empty result instead of raising IndexError.
        if not -len(hands) <= handNo < len(hands):
            return lmList

        # Only the first two dimensions are needed, so frames without a
        # channel axis work too; computed once rather than per landmark.
        h, w = img.shape[:2]
        for idx, lm in enumerate(hands[handNo].landmark):
            cx, cy = int(lm.x * w), int(lm.y * h)
            lmList.append([idx, cx, cy])
            if draw:
                cv2.circle(img, (cx, cy), 5, (255, 0, 255), cv2.FILLED)
        return lmList


In [4]:
# Step 4: collect hand-landmark samples from the camera
def collect_data(detector, cap, num_samples_per_label=1000, output_file='hand_landmarks_0_to_5.csv'):
    """Record labelled hand-landmark samples and write them to a CSV file.

    For each label 0..5 frames are read from ``cap`` until
    ``num_samples_per_label`` frames with exactly 21 landmarks have been
    stored. Pressing ``q`` in the preview window stops the whole capture;
    the samples gathered so far are still written.

    Args:
        detector: Object providing ``findHands(img)`` and
            ``findPosition(img, draw=False)``.
        cap: Capture object providing ``read()`` and ``release()``; it is
            released before this function returns, even on error.
        num_samples_per_label: Number of samples to store per label.
        output_file: Destination CSV path.

    Raises:
        RuntimeError: If ``cap.read()`` fails many times in a row.
    """
    # Each row is stored interleaved (x0, y0, x1, y1, ...), so the header
    # has to follow the same order for the column names to be truthful.
    columns = [f'{axis}{i}' for i in range(21) for axis in ('x', 'y')] + ['label']
    max_failed_reads = 100  # consecutive failures tolerated before giving up
    failed_reads = 0
    data = []
    stop_requested = False

    try:
        for label in range(6):  # labels 0 to 5
            print(f"Mostrando el número {label}")
            count = 0
            while count < num_samples_per_label:
                success, img = cap.read()
                if not success:
                    # Without a limit a dead camera would spin here forever,
                    # and the quit key below would never be polled.
                    failed_reads += 1
                    if failed_reads >= max_failed_reads:
                        raise RuntimeError(
                            f"Camera returned no frame {max_failed_reads} times in a row"
                        )
                    continue
                failed_reads = 0

                img = detector.findHands(img)
                lmList = detector.findPosition(img, draw=False)
                if len(lmList) == 21:
                    # Drop the landmark index, keep (x, y) of every landmark.
                    lm_flattened = [coord for lm in lmList for coord in lm[1:]]
                    data.append(lm_flattened + [label])
                    count += 1
                cv2.imshow("Image", img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    stop_requested = True
                    break
            if stop_requested:
                # 'q' ends the whole session, not just the current label.
                break

        df = pd.DataFrame(data, columns=columns)
        df.to_csv(output_file, index=False)
    finally:
        cap.release()
        cv2.destroyAllWindows()

In [5]:
# # Collect data (currently commented out; uncomment to record a new dataset from camera 0)
# if __name__ == "__main__":
#     detector = handDetector(detectionCon=0.75)
#     cap = cv2.VideoCapture(0)
#     collect_data(detector, cap, num_samples_per_label=1000, output_file='hand_landmarks_0_to_5.csv')

In [6]:
# Step 5: train the neural-network model
def train_model(data_file='hand_landmarks_0_to_5.csv', epochs=50,
                model_file='hand_gesture_model_0_to_5.h5'):
    """Train a dense classifier on the landmark CSV and save it.

    Every column except ``label`` is used as a feature. The data is split
    80/20 first and the ``StandardScaler`` is fitted on the training part
    only, so the validation metrics are not influenced by statistics of the
    held-out rows.

    Args:
        data_file: CSV file with feature columns and a ``label`` column
            holding class ids 0..5.
        epochs: Number of training epochs.
        model_file: Path the trained model is saved to.

    Returns:
        ``(model, scaler)``: the trained model and the scaler fitted on the
        training split. The scaler is not written to disk by this function.
    """
    data = pd.read_csv(data_file)
    X = data.drop(['label'], axis=1).values
    y = tf.keras.utils.to_categorical(data['label'].values, num_classes=6)  # classes 0 to 5

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler after splitting: fitting on all rows would leak the
    # validation set's mean/variance into the training features.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = Sequential([
        Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
        Dropout(0.5),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(6, activation='softmax')  # one output per class 0 to 5
    ])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    model.fit(X_train, y_train, epochs=epochs, validation_data=(X_test, y_test))

    model.save(model_file)
    return model, scaler


In [7]:
# Train the model.
# Binds the notebook-level names `model` and `scaler`; `scaler` exists only in
# kernel memory afterwards (train_model saves the model file, not the scaler).
if __name__ == "__main__":
    model, scaler = train_model(data_file='hand_landmarks_0_to_5.csv')

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4228 - loss: 1.4169 - val_accuracy: 0.7283 - val_loss: 0.6112
Epoch 2/50
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7170 - loss: 0.7142 - val_accuracy: 0.8042 - val_loss: 0.4547
Epoch 3/50
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7862 - loss: 0.5740 - val_accuracy: 0.8767 - val_loss: 0.3384
Epoch 4/50
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8496 - loss: 0.4446 - val_accuracy: 0.9050 - val_loss: 0.2730
Epoch 5/50
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8644 - loss: 0.3948 - val_accuracy: 0.9150 - val_loss: 0.2478
Epoch 6/50
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8844 - loss: 0.3452 - val_accuracy: 0.9367 - val_loss: 0.1969
Epoch 7/50
[1m150/150[0m [32m━━━━━━━



In [8]:
# Step 6: real-time inference with the trained model
def real_time_inference(detector, model, scaler):
    """Classify hand gestures from camera 0 until ``q`` is pressed.

    Each frame is passed through ``detector``; when exactly 21 landmarks are
    found their (x, y) values are scaled with ``scaler`` and classified by
    ``model``. The predicted class, its score and the frame rate are drawn
    on the preview window.

    Args:
        detector: Object providing ``findHands(img)`` and
            ``findPosition(img, draw=False)``.
        model: Object whose ``predict(array, verbose=0)`` returns class scores.
        scaler: Object whose ``transform(array)`` scales a ``(1, 42)`` array.

    Raises:
        RuntimeError: If the camera fails to deliver a frame many times in a row.
    """
    cap = cv2.VideoCapture(0)
    max_failed_reads = 100  # consecutive failures tolerated before giving up
    failed_reads = 0
    pTime = 0

    try:
        while True:
            success, img = cap.read()
            if not success:
                # Without a limit an unavailable camera would spin here
                # forever, and the quit key below would never be polled.
                failed_reads += 1
                if failed_reads >= max_failed_reads:
                    raise RuntimeError(
                        f"Camera returned no frame {max_failed_reads} times in a row"
                    )
                continue
            failed_reads = 0

            img = detector.findHands(img)
            lmList = detector.findPosition(img, draw=False)

            if len(lmList) == 21:
                # Drop the landmark index and flatten to a single sample row.
                lmArray = np.array([coord for lm in lmList for coord in lm[1:]]).reshape(1, -1)
                lmArray = scaler.transform(lmArray)

                # verbose=0: no progress bar printed for every single frame.
                prediction = model.predict(lmArray, verbose=0)
                class_id = np.argmax(prediction)
                confidence = np.max(prediction)

                cv2.putText(img, f'Gesture: {class_id} ({confidence:.2f})', (10, 70), cv2.FONT_HERSHEY_PLAIN, 2,
                            (255, 0, 0), 2)

            cTime = time.time()
            elapsed = cTime - pTime
            # Two clock readings can be identical on coarse timers; avoid
            # dividing by zero and report 0 FPS for that frame instead.
            fps = 1 / elapsed if elapsed > 0 else 0
            pTime = cTime

            cv2.putText(img, f'FPS: {int(fps)}', (10, 40), cv2.FONT_HERSHEY_PLAIN, 2, (255, 0, 0), 2)

            cv2.imshow("Image", img)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the camera and close the window, even on errors.
        cap.release()
        cv2.destroyAllWindows()

In [None]:
# Run real-time inference.
# NOTE: the model is reloaded from the saved .h5 file, but `scaler` is not
# loaded from disk here; it must already exist in the kernel (the training
# cell above assigns it from train_model's return value).
detector = handDetector(detectionCon=0.75)
model = tf.keras.models.load_model('hand_gesture_model_0_to_5.h5')
real_time_inference(detector, model, scaler)