# Criando um detector de linguagem de sinais com Python, usando as bibliotecas OpenCV, MediaPipe e scikit-learn

In [None]:
# Criar o banco de dados

In [None]:
# Importando
 
import os
import sys
import cv2
import pickle
import mediapipe as mp
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [None]:
# Location of the image data: the folder is created on first run and
# reused (not recreated) on later runs.

DATA_DIR = './data'
os.makedirs(name=DATA_DIR, exist_ok=True)

In [None]:
# Webcam capture setup (the frames captured later are saved under the data
# folder and become the training data).

number_of_classes = 3   # how many distinct signs are recorded
dataset_size = 100      # how many frames are saved per sign

cap = cv2.VideoCapture(0)

if not cap.isOpened():
    # Passing a string to sys.exit prints it to stderr and still exits with
    # status 1, so the failure is no longer silent.
    sys.exit("Não foi possível abrir a câmera. Encerrando.")


def sair_programa():
    """Release the webcam, close all OpenCV windows and exit with status 0.

    Operates on the module-level ``cap`` capture object.

    Raises:
        SystemExit: always, with exit code 0.
    """
    cap.release()
    cv2.destroyAllWindows()
    raise SystemExit(0)


# Capture loop: for each class, show a live preview until the user presses
# "q", then save `dataset_size` frames to DATA_DIR/<class index>/<n>.jpg.
# ESC aborts at any point through sair_programa().
try:
    for j in range(number_of_classes):
        class_dir = os.path.join(DATA_DIR, str(j))
        os.makedirs(class_dir, exist_ok=True)

        # Idle preview: nothing is saved until the user confirms with "q".
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Falha ao capturar frame. Encerrando.")
                sair_programa()

            cv2.putText(frame, 'Pressione "q" para iniciar, ESC para sair', (30, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
            cv2.imshow('frame', frame)

            key = cv2.waitKey(25) & 0xFF

            if key == ord('q'):
                break
            elif key == 27:
                sair_programa()

        for counter in range(dataset_size):
            ret, frame = cap.read()
            if not ret:
                print("Falha ao capturar frame durante a coleta. Encerrando.")
                sair_programa()

            # Draw the progress text on a copy so the image written to disk
            # stays clean; otherwise the overlay ends up in the training data.
            preview = frame.copy()
            cv2.putText(preview, f'Classe {j} - {counter+1}/{dataset_size} (ESC para sair)', (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)
            cv2.imshow('frame', preview)

            key = cv2.waitKey(25) & 0xFF
            if key == 27:
                sair_programa()

            cv2.imwrite(os.path.join(class_dir, f'{counter}.jpg'), frame)


finally:
    # Runs on normal completion and when sair_programa() raises SystemExit.
    if cap.isOpened():
        cap.release()
    cv2.destroyAllWindows()

In [None]:
# Build the dataset from the saved images: run hand-landmark detection on
# every image and store the landmark x/y coordinates as the feature vector,
# labelled with the name of the class folder the image came from.

mp_hands = mp.solutions.hands
# Drawing helpers; not needed to build the dataset, useful for visual debugging.
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

# max_num_hands=1: every sample must describe exactly one hand so that all
# feature vectors have the same length.
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1,
                       min_detection_confidence=0.3)

data = []
labels = []

for dir_ in os.listdir(DATA_DIR):
    class_dir = os.path.join(DATA_DIR, dir_)
    if not os.path.isdir(class_dir):
        # Stray files inside DATA_DIR are not class folders.
        continue

    for img_path in os.listdir(class_dir):
        img = cv2.imread(os.path.join(class_dir, img_path))
        if img is None:
            # cv2.imread returns None for unreadable / non-image files.
            continue

        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        results = hands.process(img_rgb)

        if not results.multi_hand_landmarks:
            # No hand found in this image: it contributes no sample.
            continue

        # Use a single hand per image. Accumulating several hands into one
        # list produced samples of different lengths, which breaks training.
        hand_landmarks = results.multi_hand_landmarks[0]

        # x = horizontal position (left to right)
        # y = vertical position (top to bottom)
        # z (relative depth) is available on each landmark but not used.
        data_aux = []
        for landmark in hand_landmarks.landmark:
            data_aux.append(landmark.x)
            data_aux.append(landmark.y)

        data.append(data_aux)
        labels.append(dir_)


# The context manager guarantees the file is flushed and closed even if
# pickling fails.
with open('data.pickle', 'wb') as f:
    pickle.dump({'data': data, 'labels': labels}, f)

In [None]:
# Training: fit a random forest classifier on the landmark features and
# report its accuracy on a held-out split.

# data.pickle is produced locally by the dataset step; never unpickle files
# from an untrusted source.
with open('./data.pickle', 'rb') as f:
    data_dict = pickle.load(f)

data = np.asarray(data_dict['data'])
labels = np.asarray(data_dict['labels'])

# 70/30 split, shuffled and stratified so both parts keep the class balance.
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.3, shuffle=True, stratify=labels)

model = RandomForestClassifier()

model.fit(x_train, y_train)

y_predict = model.predict(x_test)

# scikit-learn metrics take the ground truth first, then the predictions.
score = accuracy_score(y_test, y_predict)

print("Resultado do treinamento:", score * 100)

with open('model.p', 'wb') as f:
    pickle.dump({'model': model}, f)

In [None]:
# Testing the model: classify the hand sign seen by the webcam in real time
# and draw the prediction on the video feed. ESC ends the session.

# model.p is produced locally by the training step; never unpickle files
# from an untrusted source.
with open('./model.p', 'rb') as f:
    model_dict = pickle.load(f)
model = model_dict['model']

cap = cv2.VideoCapture(0)

mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

hands = mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.3)

# Maps the class index predicted by the model to the letter shown on screen.
labels_dict = {0: 'A', 1: 'B', 2: 'L'}

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # Retrying here would spin forever when the camera is unavailable
            # (no frame is shown and no key is polled), so stop instead.
            print("Falha ao capturar frame. Encerrando.")
            break

        H, W, _ = frame.shape

        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)

        if results.multi_hand_landmarks:
            # Draw the skeleton of every detected hand.
            for hand_landmarks in results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(
                    frame,
                    hand_landmarks,
                    mp_hands.HAND_CONNECTIONS,
                    mp_drawing_styles.get_default_hand_landmarks_style(),
                    mp_drawing_styles.get_default_hand_connections_style()
                )

            # Classify one hand only: concatenating the landmarks of several
            # hands changes the feature count and makes model.predict() fail.
            first_hand = results.multi_hand_landmarks[0]
            x_ = [lm.x for lm in first_hand.landmark]
            y_ = [lm.y for lm in first_hand.landmark]
            data_aux = []
            for lm in first_hand.landmark:
                data_aux.append(lm.x)
                data_aux.append(lm.y)

            # Landmark coordinates are scaled by the frame size to get a
            # pixel bounding box, padded by 10 px on each side.
            x1 = int(min(x_) * W) - 10
            y1 = int(min(y_) * H) - 10
            x2 = int(max(x_) * W) + 10
            y2 = int(max(y_) * H) + 10

            prediction = model.predict([np.asarray(data_aux)])
            predicted_character = labels_dict[int(prediction[0])]
            print(predicted_character)

            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 0), 4)
            cv2.putText(frame, predicted_character, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 0, 0), 3, cv2.LINE_AA)

        cv2.imshow('frame', frame)

        key = cv2.waitKey(1) & 0xFF
        if key == 27:
            print("ESC pressionado. Encerrando...")
            break
finally:
    # Always free the camera and close the window, even if prediction raises.
    cap.release()
    cv2.destroyAllWindows()