In [1]:
import mediapipe as mp
from mediapipe import tasks
from mediapipe.tasks.python import vision

import cv2

from pathlib import Path
import numpy as np


In [2]:
# Path of the MediaPipe pose landmarker model bundle (.task) handed to BaseOptions.
model_path = 'models/pose_landmarker_full.task'

In [None]:
# Options for a pose landmarker in VIDEO running mode. Only the options are
# built here; landmarker instances are created from them where they are used.
landmarkder_options = vision.PoseLandmarkerOptions(
    base_options=tasks.BaseOptions(model_asset_path=model_path),
    running_mode=vision.RunningMode.VIDEO)

In [21]:
def video2landmarks(path: str):
    """Yield a pose landmarker result for every frame of a video file.

    A landmarker is created from ``landmarkder_options`` for each call and is
    closed when the generator finishes. The video capture is released even if
    the consumer stops iterating early or detection raises.

    Args:
        path (str): Path of the video file to read.

    Yields:
        vision.PoseLandmarkerResult: Detection result of one frame, in frame order.

    Raises:
        Exception: If OpenCV cannot open ``path``.
    """

    with vision.PoseLandmarker.create_from_options(landmarkder_options) as landmarker:
        cap = cv2.VideoCapture(path)
        try:
            if not cap.isOpened():
                raise Exception(f'Could not open video: {path}')

            while True:
                ret, cv2_image = cap.read()
                if not ret:
                    # No more frames (or a read error): stop iterating.
                    break

                # NOTE(review): OpenCV decodes frames as BGR while the image is
                # tagged SRGB here; a cv2.cvtColor(..., cv2.COLOR_BGR2RGB) may be
                # needed - confirm, and change training and inference together
                # so the extracted features stay consistent.
                mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=cv2_image)
                # VIDEO mode takes a per-frame timestamp in milliseconds.
                timestamp = int(cap.get(cv2.CAP_PROP_POS_MSEC))
                yield landmarker.detect_for_video(mp_image, timestamp)
        finally:
            # Runs on normal exhaustion, on errors, and when the generator is
            # closed before the last frame.
            cap.release()

## 調査

In [23]:
# Collect the sample videos. Each file stem (e.g. 'a', 'ok') is used as the
# class label later on. glob() does not return the files in sorted order.
pose_path = Path('videos/pose')
videos = list(pose_path.glob('*.mp4'))
videos

[PosixPath('videos/pose/vertical2.mp4'),
 PosixPath('videos/pose/a.mp4'),
 PosixPath('videos/pose/vertical1.mp4'),
 PosixPath('videos/pose/c.mp4'),
 PosixPath('videos/pose/none.mp4'),
 PosixPath('videos/pose/right.mp4'),
 PosixPath('videos/pose/bad.mp4'),
 PosixPath('videos/pose/left.mp4'),
 PosixPath('videos/pose/horizontal.mp4'),
 PosixPath('videos/pose/y.mp4'),
 PosixPath('videos/pose/ok.mp4'),
 PosixPath('videos/pose/m.mp4')]

In [5]:
# The file stem (name without extension) is what serves as the label.
videos[0].stem

'vertical2'

In [6]:
# cv2.VideoCapture is given a plain string path, so check the str() form.
str(videos[0])

'videos/pose/vertical2.mp4'

In [7]:
# Run the landmarker over the first video and materialise every frame result
# so individual results can be inspected below.
p = str(videos[0])
resg = video2landmarks(p)
res = list(resg)
res[:4]

I0000 00:00:1718126250.365534 3625689 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88.1), renderer: Apple M1
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1718126250.558947 3626955 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1718126250.584720 3626959 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


[PoseLandmarkerResult(pose_landmarks=[[NormalizedLandmark(x=0.5407364368438721, y=0.30609118938446045, z=-0.34546229243278503, visibility=0.998515784740448, presence=0.996353268623352), NormalizedLandmark(x=0.5514749884605408, y=0.2973483204841614, z=-0.3067162036895752, visibility=0.9935886859893799, presence=0.9960289001464844), NormalizedLandmark(x=0.5576618909835815, y=0.29758909344673157, z=-0.30685240030288696, visibility=0.9948229789733887, presence=0.9960538148880005), NormalizedLandmark(x=0.5638865828514099, y=0.29779940843582153, z=-0.30689817667007446, visibility=0.993442177772522, presence=0.9956287145614624), NormalizedLandmark(x=0.5339497327804565, y=0.296424001455307, z=-0.3091334402561188, visibility=0.9948652386665344, presence=0.9964937567710876), NormalizedLandmark(x=0.5283708572387695, y=0.2959950268268585, z=-0.30910125374794006, visibility=0.9954311847686768, presence=0.9963639378547668), NormalizedLandmark(x=0.5229752063751221, y=0.2955537736415863, z=-0.30912867

In [8]:
# Result of the first frame.
res0 = res[0]
res0

PoseLandmarkerResult(pose_landmarks=[[NormalizedLandmark(x=0.5407364368438721, y=0.30609118938446045, z=-0.34546229243278503, visibility=0.998515784740448, presence=0.996353268623352), NormalizedLandmark(x=0.5514749884605408, y=0.2973483204841614, z=-0.3067162036895752, visibility=0.9935886859893799, presence=0.9960289001464844), NormalizedLandmark(x=0.5576618909835815, y=0.29758909344673157, z=-0.30685240030288696, visibility=0.9948229789733887, presence=0.9960538148880005), NormalizedLandmark(x=0.5638865828514099, y=0.29779940843582153, z=-0.30689817667007446, visibility=0.993442177772522, presence=0.9956287145614624), NormalizedLandmark(x=0.5339497327804565, y=0.296424001455307, z=-0.3091334402561188, visibility=0.9948652386665344, presence=0.9964937567710876), NormalizedLandmark(x=0.5283708572387695, y=0.2959950268268585, z=-0.30910125374794006, visibility=0.9954311847686768, presence=0.9963639378547668), NormalizedLandmark(x=0.5229752063751221, y=0.2955537736415863, z=-0.309128671

In [9]:
# pose_landmarks holds one list of landmarks per detected pose; take the first pose.
pls = res0.pose_landmarks[0]
pls[:4]

[NormalizedLandmark(x=0.5407364368438721, y=0.30609118938446045, z=-0.34546229243278503, visibility=0.998515784740448, presence=0.996353268623352),
 NormalizedLandmark(x=0.5514749884605408, y=0.2973483204841614, z=-0.3067162036895752, visibility=0.9935886859893799, presence=0.9960289001464844),
 NormalizedLandmark(x=0.5576618909835815, y=0.29758909344673157, z=-0.30685240030288696, visibility=0.9948229789733887, presence=0.9960538148880005),
 NormalizedLandmark(x=0.5638865828514099, y=0.29779940843582153, z=-0.30689817667007446, visibility=0.993442177772522, presence=0.9956287145614624)]

In [10]:
# A single normalized landmark of the first pose.
pl = pls[0]
pl

NormalizedLandmark(x=0.5407364368438721, y=0.30609118938446045, z=-0.34546229243278503, visibility=0.998515784740448, presence=0.996353268623352)

In [11]:
# Attributes available on a NormalizedLandmark.
(pl.x, pl.y, pl.z, pl.presence, pl.visibility)

(0.5407364368438721,
 0.30609118938446045,
 -0.34546229243278503,
 0.996353268623352,
 0.998515784740448)

In [12]:
# Same pose expressed in world coordinates instead of normalized image coordinates.
pwls = res0.pose_world_landmarks[0]
pwls[:4]

[Landmark(x=0.01057900208979845, y=-0.6552597284317017, z=-0.17566275596618652, visibility=0.998515784740448, presence=0.996353268623352),
 Landmark(x=0.012575844302773476, y=-0.6912980079650879, z=-0.15534329414367676, visibility=0.9935886859893799, presence=0.9960289001464844),
 Landmark(x=0.013005340471863747, y=-0.6916921734809875, z=-0.15483617782592773, visibility=0.9948229789733887, presence=0.9960538148880005),
 Landmark(x=0.012934229336678982, y=-0.6921127438545227, z=-0.15491509437561035, visibility=0.993442177772522, presence=0.9956287145614624)]

In [13]:
# A single world-coordinate landmark of the first pose.
pwl = pwls[0]
pwl

Landmark(x=0.01057900208979845, y=-0.6552597284317017, z=-0.17566275596618652, visibility=0.998515784740448, presence=0.996353268623352)

In [14]:
# Attributes available on a (world) Landmark.
(pwl.x, pwl.y, pwl.z, pwl.presence, pwl.visibility)

(0.01057900208979845,
 -0.6552597284317017,
 -0.17566275596618652,
 0.996353268623352,
 0.998515784740448)

In [15]:
# Concrete class of a normalized landmark.
type(pl)

mediapipe.tasks.python.components.containers.landmark.NormalizedLandmark

In [16]:
# IPython introspection (exploration only): show the NormalizedLandmark docstring.
pl?

[0;31mType:[0m        NormalizedLandmark
[0;31mString form:[0m NormalizedLandmark(x=0.5407364368438721, y=0.30609118938446045, z=-0.34546229243278503, visibility=0.998515784740448, presence=0.996353268623352)
[0;31mFile:[0m        ~/Desktop/chu-sama/PythonLab/.venv/lib/python3.12/site-packages/mediapipe/tasks/python/components/containers/landmark.py
[0;31mDocstring:[0m  
A normalized version of above Landmark proto.

All coordinates should be within [0, 1].

Attributes:
  x: The normalized x coordinate.
  y: The normalized y coordinate.
  z: The normalized z coordinate.
  visibility: Landmark visibility. Should stay unset if not supported. Float
    score of whether landmark is visible or occluded by other objects.
    Landmark considered as invisible also if it is not present on the screen
    (out of scene bounds). Depending on the model, visibility value is either
    a sigmoid or an argument of sigmoid.
  presence: Landmark presence. Should stay unset if not supported. Floa

In [17]:
# IPython introspection (exploration only): show the PoseLandmarkerResult docstring.
res0?

[0;31mType:[0m        PoseLandmarkerResult
[0;31mString form:[0m PoseLandmarkerResult(pose_landmarks=[[NormalizedLandmark(x=0.5407364368438721, y=0.30609118938446 <...> 30615234, visibility=0.9224606156349182, presence=0.9867019057273865)]], segmentation_masks=None)
[0;31mFile:[0m        ~/Desktop/chu-sama/PythonLab/.venv/lib/python3.12/site-packages/mediapipe/tasks/python/vision/pose_landmarker.py
[0;31mDocstring:[0m  
The pose landmarks detection result from PoseLandmarker, where each vector element represents a single pose detected in the image.

Attributes:
  pose_landmarks: Detected pose landmarks in normalized image coordinates.
  pose_world_landmarks:  Detected pose landmarks in world coordinates.
  segmentation_masks: Optional segmentation masks for pose.

In [18]:
def result2np(result: vision.PoseLandmarkerResult):
    """Convert the first detected pose of a result into a numpy array.

    Args:
        result (vision.PoseLandmarkerResult): Result whose ``pose_landmarks``
            contains at least one pose.

    Returns:
        NDArray: One row per landmark holding its ``x``, ``y`` and ``z`` values.
    """

    first_pose = result.pose_landmarks[0]
    rows = []
    for landmark in first_pose:
        rows.append((landmark.x, landmark.y, landmark.z))
    return np.array(rows)

def valid_result(result: vision.PoseLandmarkerResult):
    """Tell whether a result can be converted to numpy data.

    Args:
        result (vision.PoseLandmarkerResult): Result to check.

    Returns:
        boolean: True if at least one pose was detected, otherwise False.
    """
    detected_poses = result.pose_landmarks
    return len(detected_poses) > 0

In [19]:
# Sanity check of the conversion: the array shape for one frame.
npa0 = result2np(res0)
npa0.shape

(33, 3)

## 学習

In [24]:
# Run pose detection over every video; results are keyed by the file stem.
results_dict = {}
for video in videos:
    results_dict[video.stem] = list(video2landmarks(str(video)))

I0000 00:00:1718128443.845760 3625689 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88.1), renderer: Apple M1
W0000 00:00:1718128443.991036 3673623 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1718128444.016050 3673623 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1718128483.456868 3625689 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88.1), renderer: Apple M1
W0000 00:00:1718128483.540954 3674685 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1718128483.548174 3674688 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1718128522.128646 3625689 gl_context

In [25]:
# Shortest per-video result list, i.e. the fewest frames any class can supply.
min_results = min(results_dict.values(), key=len)
len(min_results)

2036

In [26]:
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras

In [27]:
# Keep at most `sample_size` results per label, dropping frames in which no
# pose was detected (those cannot be converted to numpy data).
sample_size = 2000
samples = [
    (label, [r for r in results if valid_result(r)][:sample_size])
    for label, results in results_dict.items()
]
# Sort by the label only. The former key `lambda label, : label` took the whole
# (label, results) tuple and returned it unchanged, so equal labels would have
# fallen through to comparing the result lists themselves.
samples.sort(key=lambda sample: sample[0])

In [28]:
# Label names in sorted order; a label's position in this list is its class index.
labels = [label for label, _ in samples]
labels

['a',
 'bad',
 'c',
 'horizontal',
 'left',
 'm',
 'none',
 'ok',
 'right',
 'vertical1',
 'vertical2',
 'y']

In [29]:
dataset = [(result2np(result), i) for i, (_, results)in enumerate(samples) for result in results]
dataset?

[0;31mType:[0m        list
[0;31mString form:[0m
[(array([[ 0.49075487,  0.33009747,  0.01464367],
           [ 0.49692717,  0.32248813,  0.05555171], <...>   [ 0.52886158,  0.75753295,  0.13633664],
           [ 0.35447949,  0.74966198,  0.2082331 ]]), 11)]
[0;31mLength:[0m      24000
[0;31mDocstring:[0m  
Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list.
The argument must be an iterable if specified.

In [30]:
# Split the pairs into a feature array and an integer label array.
x = np.array([pair[0] for pair in dataset])
y = np.array([pair[1] for pair in dataset])

In [31]:
# Per-sample input shape (axes 1 and 2 of x) and the number of output classes.
input_shape = (x.shape[1], x.shape[2])
label_len = len(labels)

In [32]:
# Seed Python, NumPy and the Keras backend before the model is built so weight
# initialisation, dropout and shuffling are repeatable on Restart & Run All.
# 42 matches the random_state used for the train/test split.
keras.utils.set_random_seed(42)

# Small classifier: flatten the landmark array, one hidden layer with dropout,
# and a softmax over the labels.
keras_model = keras.Sequential([
    keras.layers.InputLayer(input_shape, name='input'),
    keras.layers.Flatten(name='Flatten'),
    keras.layers.Dense(64, activation='elu', name='Dense'),
    keras.layers.Dropout(0.2, name='Dropout'),
    keras.layers.Dense(label_len, activation='softmax', name='Output'),
])

In [33]:
# Hold out 20% of the samples for the final evaluation; the split is seeded.
train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=0.8, random_state=42)

In [34]:
# Integer class labels, hence the sparse categorical loss.
keras_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# A further 20% of the training split is set aside for per-epoch validation.
history = keras_model.fit(train_x, train_y, epochs=20, validation_split=0.2)
history

Epoch 1/20
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 511us/step - accuracy: 0.5573 - loss: 1.6687 - val_accuracy: 0.9727 - val_loss: 0.3318
Epoch 2/20
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 403us/step - accuracy: 0.9700 - loss: 0.2733 - val_accuracy: 0.9893 - val_loss: 0.1057
Epoch 3/20
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 406us/step - accuracy: 0.9883 - loss: 0.1040 - val_accuracy: 0.9914 - val_loss: 0.0565
Epoch 4/20
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 405us/step - accuracy: 0.9943 - loss: 0.0570 - val_accuracy: 0.9930 - val_loss: 0.0379
Epoch 5/20
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 402us/step - accuracy: 0.9960 - loss: 0.0372 - val_accuracy: 0.9948 - val_loss: 0.0246
Epoch 6/20
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 403us/step - accuracy: 0.9956 - loss: 0.0282 - val_accuracy: 0.9966 - val_loss: 0.0192
Epoch 7/20
[1m4

<keras.src.callbacks.history.History at 0x377855e20>

In [35]:
# Score the held-out test split; with the compiled metrics this gives [loss, accuracy].
keras_model.evaluate(test_x, test_y)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 259us/step - accuracy: 1.0000 - loss: 3.5286e-04


[0.0005930567276664078, 0.99979168176651]

In [36]:
# Export the trained model to models/pose, the directory read by the TFLite converter.
keras_model.export('models/pose')

INFO:tensorflow:Assets written to: models/pose/assets


INFO:tensorflow:Assets written to: models/pose/assets


Saved artifact at 'models/pose'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 33, 3), dtype=tf.float32, name='input')
Output Type:
  TensorSpec(shape=(None, 12), dtype=tf.float32, name=None)
Captures:
  14918030096: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14918031248: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14918031056: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14918030672: TensorSpec(shape=(), dtype=tf.resource, name=None)


In [37]:
# Convert the exported model to a TFLite flatbuffer and store it on disk.
conv = tf.lite.TFLiteConverter.from_saved_model('models/pose')
tflite_model = conv.convert()
with open('models/pose.tflite', mode='wb') as tflite_file:
    tflite_file.write(tflite_model)

W0000 00:00:1718128922.114119 3625689 tf_tfl_flatbuffer_helpers.cc:390] Ignored output_format.
W0000 00:00:1718128922.114277 3625689 tf_tfl_flatbuffer_helpers.cc:393] Ignored drop_control_dependency.


## 推論

In [38]:
# Load the converted model; tensors must be allocated before any inference.
interpreter = tf.lite.Interpreter(model_path='models/pose.tflite')
interpreter.allocate_tensors()

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [39]:
# Description of the input tensor (index, shape, dtype) used to feed the interpreter.
input_details = interpreter.get_input_details()
input_details

[{'name': 'serving_default_input:0',
  'index': 0,
  'shape': array([ 1, 33,  3], dtype=int32),
  'shape_signature': array([-1, 33,  3], dtype=int32),
  'dtype': numpy.float32,
  'quantization': (0.0, 0),
  'quantization_parameters': {'scales': array([], dtype=float32),
   'zero_points': array([], dtype=int32),
   'quantized_dimension': 0},
  'sparsity_parameters': {}}]

In [40]:
# Description of the output tensor (index, shape, dtype) used to read predictions.
output_details = interpreter.get_output_details()
output_details

[{'name': 'StatefulPartitionedCall_1:0',
  'index': 10,
  'shape': array([ 1, 12], dtype=int32),
  'shape_signature': array([-1, 12], dtype=int32),
  'dtype': numpy.float32,
  'quantization': (0.0, 0),
  'quantization_parameters': {'scales': array([], dtype=float32),
   'zero_points': array([], dtype=int32),
   'quantized_dimension': 0},
  'sparsity_parameters': {}}]

In [41]:
import random

In [42]:
# Pick one random sample and convert it to a float32 batch of size one.
picked_input, ans = random.choice(dataset)
input_data = picked_input.astype(np.float32).reshape(1, 33, 3)
input_data.shape

(1, 33, 3)

In [43]:
# Feed the sample into the input tensor and run one inference.
interpreter.set_tensor(input_details[0]['index'], input_data)
interpreter.invoke()

In [44]:
# Read the softmax scores, one per label.
output_data = interpreter.get_tensor(output_details[0]['index'])
output_data

array([[3.1520654e-11, 4.2522483e-09, 6.9360995e-10, 6.0986358e-06,
        2.3076971e-06, 9.9998844e-01, 1.4696198e-08, 6.4669717e-07,
        3.0672009e-08, 2.3269968e-06, 1.9794879e-08, 1.0363617e-09]],
      dtype=float32)

In [45]:
# Predicted class index = position of the highest score.
np.argmax(output_data[0])

5

In [46]:
# Expected class index of the picked sample, for comparison with the prediction.
ans

5

In [47]:
import time


# Live demo: classify the pose seen by the default camera and overlay the
# predicted label on the preview window. Press 'q' in the window to stop.
with vision.PoseLandmarker.create_from_options(landmarkder_options) as landmarker:
    cap = cv2.VideoCapture(0)
    try:
        if not cap.isOpened():
            # A bare `raise` has no active exception to re-raise here, so raise
            # the RuntimeError explicitly with a useful message.
            raise RuntimeError('Could not open the default camera (device 0)')

        while True:
            ret, cv2_image = cap.read()
            if ret:
                # NOTE(review): OpenCV delivers BGR frames while the image is
                # tagged SRGB; a BGR->RGB conversion may be needed - confirm,
                # and keep it consistent with how the training data was made.
                mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=cv2_image)
                # VIDEO mode needs increasing millisecond timestamps; the
                # monotonic clock cannot jump backwards like the wall clock.
                timestamp = int(time.monotonic() * 1000)
                pose_landmarker_result = landmarker.detect_for_video(mp_image, timestamp)

                if valid_result(pose_landmarker_result):
                    input_data = result2np(pose_landmarker_result)
                    input_data = input_data.astype(np.float32)
                    input_data = input_data.reshape(1, 33, 3)

                    interpreter.set_tensor(input_details[0]['index'], input_data)
                    interpreter.invoke()

                    output_data = interpreter.get_tensor(output_details[0]['index'])
                    label_index = np.argmax(output_data[0])
                    label = labels[label_index]

                    cv2.putText(cv2_image, label, (20, 160), cv2.FONT_HERSHEY_SIMPLEX, 6, (255, 255, 255), 12)

                # Show the frame even when no pose was found so the preview
                # does not freeze on the last labelled frame.
                cv2.imshow('Sample Video', cv2_image)

            # Poll the keyboard on every iteration; skipping this on frames
            # without a pose made the window unresponsive to 'q'.
            if cv2.waitKey(5) & 0xFF == ord('q'):
                break
    finally:
        # Also runs on errors and on a kernel interrupt, so the camera and the
        # preview window are always released.
        cap.release()
        cv2.destroyAllWindows()

I0000 00:00:1718149970.443275 3625689 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88.1), renderer: Apple M1
W0000 00:00:1718149970.597646 3713659 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1718149970.622905 3713657 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
