# Motion-diffusion Text2Motion Demo

## Setup packages

In [None]:
import os
import gdown
# Fetch the motion-diffusion-model sources into the current working directory;
# the next cell changes into the resulting folder.
!git clone https://github.com/GuyTevet/motion-diffusion-model.git

In [None]:
os.chdir("motion-diffusion-model")
!sudo apt install ffmpeg

# setup python pakage
!pip install --upgrade --no-cache-dir gdown
!gdown "https://drive.google.com/uc?id=1p062yytbpR4U5Lpr6OE-5lf-J5IWWVjy"
!pip install -r requirements.txt

# setup nlp lib
!python3 -m spacy download en_core_web_sm
!pip install git+https://github.com/openai/CLIP.git
!bash prepare/download_glove.sh

In [None]:
# Download the SMPL files into body_models/ (the archive unpacks to body_models/smpl/).
!mkdir -p body_models
os.chdir("body_models")

!echo -e "The smpl files will be stored in the 'body_models/smpl/' folder\n"
# Two-step Google Drive download: the inner wget saves the session cookie and
# sed extracts the "confirm" token from its response; the outer wget reuses the
# cookie and token to fetch the archive as smpl.zip, then the cookie is removed.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1INYlGA76ak_cKGzvpOV2Pe6RkYTlXTW2' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1INYlGA76ak_cKGzvpOV2Pe6RkYTlXTW2" -O smpl.zip && rm -rf /tmp/cookies.txt
# Drop any previous extraction before unpacking again.
!rm -rf smpl

!unzip smpl.zip
!echo -e "Cleaning\n"
!rm smpl.zip

!echo -e "Downloading done!"
# Back to the repository root for the following cells.
os.chdir("..")

In [None]:
# Download the T2M evaluators archive into the current directory and unpack it
# (the rm below implies the archive contains a t2m/ folder).
!echo -e "Downloading T2M evaluators"
# Same two-step Google Drive pattern as the SMPL download: the inner wget
# obtains the cookie and "confirm" token, the outer wget fetches t2m.zip.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1DSaKqWX2HlwBtVH5l7DdW96jeYUIXsOP' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1DSaKqWX2HlwBtVH5l7DdW96jeYUIXsOP" -O t2m.zip && rm -rf /tmp/cookies.txt
# Drop any previous extraction before unpacking again.
!rm -rf t2m

!unzip t2m.zip
!echo -e "Cleaning\n"
!rm t2m.zip

!echo -e "Downloading done!"

### Data

In [None]:
# Step out of the model repository so HumanML3D is cloned next to it.
os.chdir("..")
!git clone https://github.com/EricGuo5513/HumanML3D.git
# Unpack the text annotations inside the dataset folder.
!unzip ./HumanML3D/HumanML3D/texts.zip -d ./HumanML3D/HumanML3D/
# Copy the dataset folder into the model repository's dataset directory.
!cp -r HumanML3D/HumanML3D motion-diffusion-model/dataset/HumanML3D

### Pretrain model

In [None]:
os.chdir("motion-diffusion-model")
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=FILEID' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1f8Sd_G53nWxDMvkTnzoouY-8xwOSiW0W" -O model.zip && rm -rf /tmp/cookies.txt
!unzip model.zip -d ./save/

## Generate
Generate motion from a single-line text prompt.
- motion_length 動作秒數 2~9.8s
- text_prompt 句子
- num_repetitions 生成數量
- gpu 是否使用 gpu

In [None]:
# Sampling parameters read by the generation cell (and by the glTF export
# cells, which use motion_length to build timestamps).
text = " a man steps forward and does a handstand."  # prompt describing the motion
motion_length = 6  # clip duration in seconds
seed, num_repetitions = 12, 3  # RNG seed / number of clips to generate
gpu = True  # forwarded to --cuda

In [None]:
!mkdir "../gen"
# generate location npy and mp4
!python3 -m sample.generate --model_path ./save/humanml_trans_enc_512/model000200000.pt \
    --seed $seed\
    --cuda $gpu\
    --text_prompt "${text}" \
    --motion_length $motion_length \
    --num_repetitions $num_repetitions \
    --output_dir "../gen"

## Visualize bone video

In [None]:
from ipywidgets import Output, GridspecLayout
from IPython import display

# Address the output folder relative to the repository instead of changing
# directory out and back: an exception in between would otherwise leave the
# notebook in the wrong working directory for every later cell.
gen_dir = os.path.join("..", "gen")
# Sorted so the clips appear in a stable order from run to run.
video_files = sorted(name for name in os.listdir(gen_dir) if name.endswith(".mp4"))

# One row per clip actually present (keeping at least one row so the layout
# can always be built), rather than a row count guessed from num_repetitions.
grid = GridspecLayout(max(len(video_files), 1), 1)
for row, video_file in enumerate(video_files):
    out = Output()
    with out:
        # embed=True stores the video data in the notebook output itself.
        display.display(display.Video(os.path.join(gen_dir, video_file), embed=True))
    grid[row, 0] = out
grid

## Bones npy to gltf
輸出 gltf 檔案
- target 指定輸出第幾個sample

In [None]:
# pygltflib supplies the GLTF2 / Scene classes used by the export cells below.
!pip install pygltflib
import base64
import numpy as np
import pygltflib
from pygltflib import GLTF2, Scene

### Children connect ver

In [None]:
# Skeleton as (joint name, child node indices) rows; the row position is the
# glTF node index. Row 0 is an extra root node sitting above the pelvis joint,
# and rows with no children are leaf joints.
_joint_table = [
    ('Root', [1]),
    ('MidHip', [2, 3, 4]),
    ('LHip', [5]),
    ('RHip', [6]),
    ('spine1', [7]),
    ('LKnee', [8]),
    ('RKnee', [9]),
    ('spine2', [10]),
    ('LAnkle', [11]),
    ('RAnkle', [12]),
    ('spine3', [13, 14, 15]),
    ('LFoot', []),
    ('RFoot', []),
    ('Neck', [16]),
    ('LCollar', [17]),
    ('Rcollar', [18]),
    ('Head', []),
    ('LShoulder', [19]),
    ('RShoulder', [20]),
    ('LElbow', [21]),
    ('RElbow', [22]),
    ('LWrist', []),
    ('RWrist', []),
]

# Leaf joints carry no "children" key at all.
nodes = [
    {"children": kids, 'name': label} if kids else {'name': label}
    for label, kids in _joint_table
]

scene_nodes = [index for index, _ in enumerate(nodes)]

In [None]:
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load results files you generated yourself.
npy = np.load("../gen/results.npy", allow_pickle=True).tolist() # shape (repetitions, points, (x,y,z), frames)
target = 0
sample_motion = np.asarray(npy['motion'][target])  # (points, (x,y,z), frames) per the shape note above
# Frame count of the sample being exported (the entry for `target`, not always entry 0).
frames = int(npy['lengths'][target])

# Add root node and set to (0,0,0). The root track is sized from the data, so
# clips that are not exactly 120 frames long no longer fail here.
root_track = np.zeros((1,) + sample_motion.shape[1:])
motion = np.concatenate([root_track, sample_motion], axis=0)

# Map every child node index to the index of its parent.
parent_dict = {}
for i, node in enumerate(nodes):
    for child in node.get("children", []):
        parent_dict[child] = i

# Keyframes are stored node-major (all frames of node 0, then node 1, ...) and
# expressed relative to the parent joint; the root has no parent and keeps its
# own (zero) position.
points = []
for node in range(len(nodes)):
  joint_xyz = motion[node][:, :frames]
  parent_xyz = motion[parent_dict[node]][:, :frames] if node != 0 else np.zeros_like(joint_xyz)
  points.append((joint_xyz - parent_xyz).T)  # (frames, 3)

points = np.concatenate(points, axis=0).astype(np.float32)
# Exactly one timestamp per keyframe; a float-step arange can come out one
# element long or short, which would desynchronise times and positions.
times = (np.arange(frames) * (motion_length / frames)).astype(np.float32)

In [None]:
def xyz2gltf(scene_nodes, nodes, points, times, save_path):
  """Write a translation-only glTF animation for a parent/child joint hierarchy.

  Args:
    scene_nodes: sequence with one entry per animated node. Only its length is
      used here: node ``i`` is driven by the ``i``-th block of ``points``.
    nodes: node definitions, assigned to ``gltf.nodes`` unchanged.
    points: xyz keyframes stored node-major (all keyframes of node 0, then
      node 1, ...); converted to float32 if necessary.
    times: keyframe timestamps shared by every node; converted to float32 if
      necessary.
    save_path: output path without extension; ``.gltf`` is appended.

  Returns:
    The populated ``GLTF2`` object, after it has been saved.
  """
  # The accessors below declare 32-bit floats (componentType 5126), so the raw
  # bytes must really be tightly packed float32 values.
  times = np.ascontiguousarray(times, dtype=np.float32)
  points = np.ascontiguousarray(points, dtype=np.float32)
  node_count = len(scene_nodes)
  keys_per_node = points.size // (3 * node_count)

  gltf = GLTF2()
  gltf.scene = 0

  # scene: node 0 is the only scene root; the others hang off it via "children"
  scene = Scene(nodes=[0])
  gltf.scenes.append(scene)
  gltf.nodes = nodes

  # buffer: encode timestamps and positions as ONE byte string. Encoding them
  # separately and joining the base64 text is only valid when the first part
  # needs no '=' padding, i.e. when its byte length is a multiple of 3.
  payload = times.tobytes() + points.tobytes()
  uri = "data:application/octet-stream;base64," + base64.b64encode(payload).decode()
  buf = pygltflib.Buffer(uri=uri, byteLength=len(payload))
  gltf.buffers.append(buf)

  # bufferview: timestamps first, positions directly behind them
  buf_view_time = pygltflib.BufferView(buffer=0, byteLength=times.nbytes, name="bufferViewAnimationFloatScalar")
  buf_view_point = pygltflib.BufferView(buffer=0, byteOffset=times.nbytes, byteLength=points.nbytes, name="bufferViewAnimationFloatVec3")
  gltf.bufferViews.append(buf_view_time)
  gltf.bufferViews.append(buf_view_point)

  # accessor 0: the shared time track
  time_accessor = pygltflib.Accessor(
        bufferView = 0,
        componentType = 5126,
        count = times.size,
        max = [
          float(times.max())
        ],
        min = [
          float(times.min())
        ],
        type = "SCALAR",
        name = "accessorAnimationInput"
      )
  gltf.accessors.append(time_accessor)

  # accessors 1..N: one VEC3 track per node, laid out back to back
  track_bytes = keys_per_node * 12  # three float32 values per keyframe
  for i in range(node_count):
    accessor = pygltflib.Accessor(
        bufferView=1,
        byteOffset=i * track_bytes,
        componentType=5126,
        count=keys_per_node,
        type='VEC3',
        name="accessorAnimationPositions"
    )
    gltf.accessors.append(accessor)

  # animation: sampler i reads time accessor 0 and position accessor i+1,
  # and drives the translation of node i
  channels, samplers = [], []
  for i in range(node_count):
    sampler = pygltflib.AnimationSampler(input=0, output=i+1)
    channeltarget = pygltflib.AnimationChannelTarget(node=i, path="translation")
    channel = pygltflib.AnimationChannel(sampler=i, target=channeltarget)

    channels.append(channel)
    samplers.append(sampler)

  animation = pygltflib.Animation(name="All Animations", channels=channels, samplers=samplers)
  gltf.animations.append(animation)

  gltf.save(f"{save_path}.gltf")
  print(f"save at {save_path}")
  return gltf

# Export the parent-relative skeleton animation next to the generated clips
# (xyz2gltf appends the .gltf extension).
gltf = xyz2gltf(
    scene_nodes=scene_nodes,
    nodes=nodes,
    points=points,
    times=times,
    save_path="../gen/result_child",
)

### No children ver

In [None]:
# Joint name -> joint index; the export cell below reads joint i of the
# generated motion for node i, so these indices double as glTF node indices.
AMASS_JOINT_MAP = {
    'MidHip': 0,
    'LHip': 1,
    'LKnee': 4,
    'LAnkle': 7,
    'LFoot': 10,
    'RHip': 2,
    'RKnee': 5,
    'RAnkle': 8,
    'RFoot': 11,
    'LShoulder': 16,
    'LElbow': 18,
    'LWrist': 20,
    'RShoulder': 17,
    'RElbow': 19,
    'RWrist': 21,
    'spine1': 3,
    'spine2': 6,
    'spine3': 9,
    'Neck': 12,
    'Head': 15,
    'LCollar': 13,
    'Rcollar': 14,
}

# Flat skeleton: every joint is its own scene-level node, with no parent/child links.
scene_nodes = list(range(len(AMASS_JOINT_MAP)))
inv_map = dict((index, joint) for joint, index in AMASS_JOINT_MAP.items())
nodes = [{'name': inv_map[index]} for index in scene_nodes]

In [None]:
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load results files you generated yourself.
npy = np.load("../gen/results.npy", allow_pickle=True).tolist() # shape (repetitions, points, (x,y,z), frames)
keys = npy.keys()
motion = npy['motion']
target = 2
# Frame count of the sample being exported (the entry for `target`, not entry 0).
frames = int(npy['lengths'][target])

# One track per node in `nodes`, truncated to the valid frames.
sample_motion = np.asarray(motion[target])[:len(nodes), :, :frames]
# (points, (x,y,z), frames) -> (points, frames, (x,y,z)) -> node-major rows of xyz:
# all frames of node 0, then node 1, ...
points = sample_motion.transpose(0, 2, 1).reshape(-1, 3).astype(np.float32)
# Exactly one timestamp per keyframe; a float-step arange can come out one
# element long or short, which would desynchronise times and positions.
times = (np.arange(frames) * (motion_length / frames)).astype(np.float32)

In [None]:
def xyz2gltf(scene_nodes, nodes, points, times, save_path):
  """Write a translation-only glTF animation for a flat set of joints.

  Args:
    scene_nodes: node indices; every one becomes a scene-level node and
      node ``i`` is driven by the ``i``-th block of ``points``.
    nodes: node definitions, assigned to ``gltf.nodes`` unchanged.
    points: xyz keyframes stored node-major (all keyframes of node 0, then
      node 1, ...); converted to float32 if necessary.
    times: keyframe timestamps shared by every node; converted to float32 if
      necessary.
    save_path: output path without extension; ``.gltf`` is appended.

  Returns:
    The populated ``GLTF2`` object, after it has been saved.
  """
  # The accessors below declare 32-bit floats (componentType 5126), so the raw
  # bytes must really be tightly packed float32 values.
  times = np.ascontiguousarray(times, dtype=np.float32)
  points = np.ascontiguousarray(points, dtype=np.float32)
  node_count = len(scene_nodes)
  keys_per_node = points.size // (3 * node_count)

  gltf = GLTF2()
  gltf.scene = 0

  # scene: all joints are listed as scene roots (no hierarchy in this variant)
  scene = Scene(nodes=scene_nodes)
  gltf.scenes.append(scene)
  gltf.nodes = nodes

  # buffer: encode timestamps and positions as ONE byte string. Encoding them
  # separately and joining the base64 text is only valid when the first part
  # needs no '=' padding, i.e. when its byte length is a multiple of 3.
  payload = times.tobytes() + points.tobytes()
  uri = "data:application/octet-stream;base64," + base64.b64encode(payload).decode()
  buf = pygltflib.Buffer(uri=uri, byteLength=len(payload))
  gltf.buffers.append(buf)

  # bufferview: timestamps first, positions directly behind them
  buf_view_time = pygltflib.BufferView(buffer=0, byteLength=times.nbytes, name="bufferViewAnimationFloatScalar")
  buf_view_point = pygltflib.BufferView(buffer=0, byteOffset=times.nbytes, byteLength=points.nbytes, name="bufferViewAnimationFloatVec3")
  gltf.bufferViews.append(buf_view_time)
  gltf.bufferViews.append(buf_view_point)

  # accessor 0: the shared time track
  time_accessor = pygltflib.Accessor(
        bufferView = 0,
        componentType = 5126,
        count = times.size,
        max = [
          float(times.max())
        ],
        min = [
          float(times.min())
        ],
        type = "SCALAR",
        name = "accessorAnimationInput"
      )
  gltf.accessors.append(time_accessor)

  # accessors 1..N: one VEC3 track per node, laid out back to back
  track_bytes = keys_per_node * 12  # three float32 values per keyframe
  for i in range(node_count):
    accessor = pygltflib.Accessor(
        bufferView=1,
        byteOffset=i * track_bytes,
        componentType=5126,
        count=keys_per_node,
        type='VEC3',
        name="accessorAnimationPositions"
    )
    gltf.accessors.append(accessor)

  # animation: sampler i reads time accessor 0 and position accessor i+1,
  # and drives the translation of node i
  channels, samplers = [], []
  for i in range(node_count):
    sampler = pygltflib.AnimationSampler(input=0, output=i+1)
    channeltarget = pygltflib.AnimationChannelTarget(node=i, path="translation")
    channel = pygltflib.AnimationChannel(sampler=i, target=channeltarget)

    channels.append(channel)
    samplers.append(sampler)

  animation = pygltflib.Animation(name="All Animations", channels=channels, samplers=samplers)
  gltf.animations.append(animation)

  gltf.save(f"{save_path}.gltf")
  print(f"save at {save_path}")
  return gltf

In [None]:
# Export the flat (no hierarchy) joint animation next to the generated clips
# (xyz2gltf appends the .gltf extension).
gltf = xyz2gltf(
    scene_nodes=scene_nodes,
    nodes=nodes,
    points=points,
    times=times,
    save_path="../gen/result_nochild",
)

## E2Pose npy to gltf

In [None]:
# E2Pose skeleton: list position is the glTF node index, "children" holds the
# indices of the joints attached to that joint. Node 0 is an extra root placed
# above the hip joint; entries without "children" are leaf joints.
nodes = [
 {"children": [1], 'name': 'Root'},
 {"children": [2, 5, 8], 'name': 'Hip'},
 {"children": [3], 'name': 'rhip'},
 {"children": [4], 'name': 'rknee'},
 {'name': 'rfoot'},
 {"children": [6], 'name': 'lhip'},
 {"children": [7], 'name': 'lknee'},
 {'name': 'lfoot'},
 {"children": [9], 'name': 'spine'},
 {"children": [10, 12, 15], 'name': 'spine2'},
 # This joint sits between spine2 and head, so it is the neck
 # (it was mislabelled 'knee').
 {"children": [11], 'name': 'neck'},
 {'name': 'head'},
 {"children": [13], 'name': 'lshoulder'},
 {"children": [14], 'name': 'lelbow'},
 {'name': 'lhand'},
 {"children": [16], 'name': 'rshoulder'},
 {"children": [17], 'name': 'relbow'},
 {'name': 'rhand'},
]

scene_nodes = list(range(len(nodes)))

In [None]:
# Exported window of the recording and its playback rate. The timestamps are
# derived from these values so they always match the number of keyframes.
clip_start, clip_end, clip_fps = 300, 900, 30

# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load files you trust.
video23d = np.load("../VideoTo3D_dance.npy", allow_pickle=True)
# Prepend an all-zero joint for the extra root node
# (assumes the array is laid out as (frames, joints, xyz) — confirm against the producer).
video23d = np.append(np.zeros((len(video23d), 1, 3)), video23d, axis=1)

# Map every child node index to the index of its parent.
parent_dict = {}
for i, node in enumerate(nodes):
    for child in node.get("children", []):
        parent_dict[child] = i

# Clamp the window to the frames that exist instead of failing on short recordings.
clip = video23d[clip_start:min(clip_end, len(video23d))]

# Keyframes are stored node-major and expressed relative to the parent joint;
# the y component is negated (parent minus joint) to flip the vertical axis.
points = []
for node in range(len(nodes)):
  joint_xyz = clip[:, node]
  parent_xyz = clip[:, parent_dict[node]] if node != 0 else np.zeros_like(joint_xyz)
  offset = joint_xyz - parent_xyz
  offset[:, 1] = parent_xyz[:, 1] - joint_xyz[:, 1]
  points.append(offset)

points = np.concatenate(points, axis=0).astype(np.float32)
# Exactly one timestamp per exported frame.
times = (np.arange(len(clip)) * (1 / clip_fps)).astype(np.float32)

In [None]:
def xyz2gltf(scene_nodes, nodes, points, times, save_path):
  """Write a translation-only glTF animation for a parent/child joint hierarchy.

  Args:
    scene_nodes: sequence with one entry per animated node. Only its length is
      used here: node ``i`` is driven by the ``i``-th block of ``points``.
    nodes: node definitions, assigned to ``gltf.nodes`` unchanged.
    points: xyz keyframes stored node-major (all keyframes of node 0, then
      node 1, ...); converted to float32 if necessary.
    times: keyframe timestamps shared by every node; converted to float32 if
      necessary.
    save_path: output path without extension; ``.gltf`` is appended.

  Returns:
    The populated ``GLTF2`` object, after it has been saved.
  """
  # The accessors below declare 32-bit floats (componentType 5126), so the raw
  # bytes must really be tightly packed float32 values.
  times = np.ascontiguousarray(times, dtype=np.float32)
  points = np.ascontiguousarray(points, dtype=np.float32)
  node_count = len(scene_nodes)
  keys_per_node = points.size // (3 * node_count)

  gltf = GLTF2()
  gltf.scene = 0

  # scene: node 0 is the only scene root; the others hang off it via "children"
  scene = Scene(nodes=[0])
  gltf.scenes.append(scene)
  gltf.nodes = nodes

  # buffer: encode timestamps and positions as ONE byte string. Encoding them
  # separately and joining the base64 text is only valid when the first part
  # needs no '=' padding, i.e. when its byte length is a multiple of 3.
  payload = times.tobytes() + points.tobytes()
  uri = "data:application/octet-stream;base64," + base64.b64encode(payload).decode()
  buf = pygltflib.Buffer(uri=uri, byteLength=len(payload))
  gltf.buffers.append(buf)

  # bufferview: timestamps first, positions directly behind them
  buf_view_time = pygltflib.BufferView(buffer=0, byteLength=times.nbytes, name="bufferViewAnimationFloatScalar")
  buf_view_point = pygltflib.BufferView(buffer=0, byteOffset=times.nbytes, byteLength=points.nbytes, name="bufferViewAnimationFloatVec3")
  gltf.bufferViews.append(buf_view_time)
  gltf.bufferViews.append(buf_view_point)

  # accessor 0: the shared time track
  time_accessor = pygltflib.Accessor(
        bufferView = 0,
        componentType = 5126,
        count = times.size,
        max = [
          float(times.max())
        ],
        min = [
          float(times.min())
        ],
        type = "SCALAR",
        name = "accessorAnimationInput"
      )
  gltf.accessors.append(time_accessor)

  # accessors 1..N: one VEC3 track per node, laid out back to back
  track_bytes = keys_per_node * 12  # three float32 values per keyframe
  for i in range(node_count):
    accessor = pygltflib.Accessor(
        bufferView=1,
        byteOffset=i * track_bytes,
        componentType=5126,
        count=keys_per_node,
        type='VEC3',
        name="accessorAnimationPositions"
    )
    gltf.accessors.append(accessor)

  # animation: sampler i reads time accessor 0 and position accessor i+1,
  # and drives the translation of node i
  channels, samplers = [], []
  for i in range(node_count):
    sampler = pygltflib.AnimationSampler(input=0, output=i+1)
    channeltarget = pygltflib.AnimationChannelTarget(node=i, path="translation")
    channel = pygltflib.AnimationChannel(sampler=i, target=channeltarget)

    channels.append(channel)
    samplers.append(sampler)

  animation = pygltflib.Animation(name="All Animations", channels=channels, samplers=samplers)
  gltf.animations.append(animation)

  gltf.save(f"{save_path}.gltf")
  print(f"save at {save_path}")
  return gltf

In [None]:
# Export the E2Pose skeleton animation into the generation output folder
# (xyz2gltf appends the .gltf extension).
gltf = xyz2gltf(
    scene_nodes=scene_nodes,
    nodes=nodes,
    points=points,
    times=times,
    save_path="../gen/E2pose",
)

## Generate mesh glb file (optional)

In [None]:
from model.rotation2xyz import Rotation2xyz
import numpy as np
from trimesh import Trimesh
import os
import torch
from visualize.simplify_loc2rot import joints2smpl

class npy2obj:
    """Turn one generated motion entry into per-frame meshes and a parameter dump.

    The constructor loads the results file, selects one (sample, repetition)
    entry, runs it through ``Rotation2xyz`` and keeps the resulting vertices on
    the instance; ``save_obj`` / ``save_npy`` then write them to disk.
    """

    def __init__(self, npy_path, sample_idx, rep_idx, device=0, cuda=True):
        """Load ``npy_path`` and compute the vertices for one entry.

        Args:
            npy_path: path to the results file (``.npy``, or ``.npz`` whose
                ``arr_0`` entry holds the data).
            sample_idx: sample index within one repetition.
            rep_idx: repetition index.
            device: forwarded to ``joints2smpl`` as ``device_id``.
            cuda: forwarded to ``joints2smpl``.
        """
        self.npy_path = npy_path
        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files.
        self.motions = np.load(self.npy_path, allow_pickle=True)
        if self.npy_path.endswith('.npz'):
            self.motions = self.motions['arr_0']
        # Unwrap the loaded array into the object it stores; from here on it is
        # accessed by string keys ('motion', 'lengths', 'num_samples', 'text').
        self.motions = self.motions[None][0]
        # The rotation-to-xyz model is always built on the CPU, independent of `device`/`cuda`.
        self.rot2xyz = Rotation2xyz(device='cpu')
        self.faces = self.rot2xyz.smpl_model.faces
        self.bs, self.njoints, self.nfeats, self.nframes = self.motions['motion'].shape
        self.opt_cache = {}
        self.sample_idx = sample_idx
        self.total_num_samples = self.motions['num_samples']
        self.rep_idx = rep_idx
        # Flat index of the requested entry: rep_idx blocks of `num_samples` entries, then sample_idx.
        self.absl_idx = self.rep_idx*self.total_num_samples + self.sample_idx
        self.num_frames = self.motions['motion'][self.absl_idx].shape[-1]
        self.j2s = joints2smpl(num_frames=self.num_frames, device_id=device, cuda=cuda)

        if self.nfeats == 3:
            # Three features per joint: run joint2smpl on the selected entry; its
            # result replaces the stored motion for everything below.
            print(f'Running SMPLify For sample [{sample_idx}], repetition [{rep_idx}], it may take a few minutes.')
            motion_tensor, opt_dict = self.j2s.joint2smpl(self.motions['motion'][self.absl_idx].transpose(2, 0, 1))  # [nframes, njoints, 3]
            self.motions['motion'] = motion_tensor.cpu().numpy()
        elif self.nfeats == 6:
            # Six features per joint: keep only the selected entry; the list index
            # preserves the leading axis, so this is a batch of one.
            self.motions['motion'] = self.motions['motion'][[self.absl_idx]]
        # Re-read the dimensions, since 'motion' may have been replaced above.
        self.bs, self.njoints, self.nfeats, self.nframes = self.motions['motion'].shape
        self.real_num_frames = self.motions['lengths'][self.absl_idx]

        self.vertices = self.rot2xyz(torch.tensor(self.motions['motion']), mask=None,
                                     pose_rep='rot6d', translation=True, glob=True,
                                     jointstype='vertices',
                                     # jointstype='smpl',  # for joint locations
                                     vertstrans=True)
        # Last entry along the joint axis, first three features; save_npy stores
        # the same slice as 'root_translation'. It is added to every vertex.
        self.root_loc = self.motions['motion'][:, -1, :3, :].reshape(1, 1, 3, -1)
        self.vertices += self.root_loc

    def get_vertices(self, sample_i, frame_i):
        """Return the vertices of entry ``sample_i`` at frame ``frame_i`` as nested lists."""
        return self.vertices[sample_i, :, :, frame_i].squeeze().tolist()

    def get_trimesh(self, sample_i, frame_i):
        """Build a ``Trimesh`` from one frame's vertices and the stored faces."""
        return Trimesh(vertices=self.get_vertices(sample_i, frame_i),
                       faces=self.faces)

    def save_obj(self, save_path, frame_i):
        """Export frame ``frame_i`` of entry 0 to ``{save_path}.glb``.

        Returns ``save_path`` exactly as passed in (without the extension).
        """
        mesh = self.get_trimesh(0, frame_i)
        mesh.export(f'{save_path}.glb')
        return save_path
    
    def save_npy(self, save_path):
        """Save the motion, faces, vertices, text and length of entry 0 to ``save_path``.

        Every per-frame array is truncated to ``real_num_frames``.
        """
        data_dict = {
            'motion': self.motions['motion'][0, :, :, :self.real_num_frames],
            # every joint-axis entry except the last one
            'thetas': self.motions['motion'][0, :-1, :, :self.real_num_frames],
            # last joint-axis entry, first three features
            'root_translation': self.motions['motion'][0, -1, :3, :self.real_num_frames],
            'faces': self.faces,
            'vertices': self.vertices[0, :, :, :self.real_num_frames],
            'text': self.motions['text'][0],
            'length': self.real_num_frames,
        }
        np.save(save_path, data_dict)
        return

In [None]:
import argparse
import os
from visualize import vis_utils
import shutil
from tqdm import tqdm

def render_mesh(input_path, cuda=True, device=0):
  """Export per-frame .glb meshes and SMPL parameters for one generated clip.

  Args:
    input_path: path to a stick-figure mp4 named ``sampleXX_repYY.mp4``; the
      matching ``results.npy`` must be in the same folder.
    cuda: forwarded to the ``npy2obj`` converter.
    device: forwarded to the ``npy2obj`` converter.

  Side effects:
    Recreates the folder ``<input without .mp4>_glb`` and fills it with one
    mesh per frame, and writes ``<input without .mp4>_smpl_params.npy``.
  """
  assert input_path.endswith('.mp4')
  # "sample00_rep00" -> "00_00" -> (0, 0)
  parsed_name = os.path.basename(input_path).replace('.mp4', '').replace('sample', '').replace('rep', '')
  sample_i, rep_i = [int(e) for e in parsed_name.split('_')]
  npy_path = os.path.join(os.path.dirname(input_path), 'results.npy')
  out_npy_path = input_path.replace('.mp4', '_smpl_params.npy')
  assert os.path.exists(npy_path)
  results_dir = input_path.replace('.mp4', '_glb')
  # Start from an empty output folder so frames of an earlier run cannot linger.
  if os.path.exists(results_dir):
      shutil.rmtree(results_dir)
  os.makedirs(results_dir)

  # Use the npy2obj class defined in this notebook: its save_obj writes
  # '<name>.glb', which is what the '_glb' folder and the extension-less frame
  # names below expect. The function previously instantiated
  # vis_utils.npy2obj under a local name that shadowed the notebook class,
  # so the notebook class was never used.
  converter = npy2obj(npy_path, sample_i, rep_i, device=device, cuda=cuda)

  print('Saving obj files to [{}]'.format(os.path.abspath(results_dir)))
  for frame_i in tqdm(range(converter.real_num_frames)):
      converter.save_obj(os.path.join(results_dir, 'frame{:03d}'.format(frame_i)), frame_i)

  print('Saving SMPL params to [{}]'.format(os.path.abspath(out_npy_path)))
  converter.save_npy(out_npy_path)
  return

In [None]:
# Build the meshes for the first sample / first repetition of the generated clips.
render_mesh(input_path="../gen/sample00_rep00.mp4")