In [3]:
%pip install smplx torch open3d


Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
import torch.nn as nn

# Set the device (ensure you are using the same device as during training)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
class TextToMotionTransformer(nn.Module):
    def __init__(self, input_dim, hidden_dim=512, num_layers=4, num_heads=8):
        super(TextToMotionTransformer, self).__init__()
        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=input_dim, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Linear layers for predicting separate motion parameters
        self.fc_root = nn.Linear(input_dim, 3)          # Root pose (3)
        self.fc_body = nn.Linear(input_dim, 63)         # Body pose (63)
        self.fc_left_hand = nn.Linear(input_dim, 45)    # Left hand pose (45)
        self.fc_right_hand = nn.Linear(input_dim, 45)   # Right hand pose (45)

    def forward(self, token_embeddings, attention_mask):
        # token_embeddings: (batch_size, seq_length, input_dim)
        # attention_mask: (batch_size, seq_length)

        # Permute token_embeddings to match Transformer input requirements
        token_embeddings = token_embeddings.permute(1, 0, 2)  # (seq_length, batch_size, input_dim)

        # Convert attention mask to key_padding_mask (True for padding tokens)
        key_padding_mask = attention_mask == 0  # Shape: (batch_size, seq_length)

        # Pass through Transformer Encoder
        transformer_output = self.transformer_encoder(
            token_embeddings,
            src_key_padding_mask=key_padding_mask  # (batch_size, seq_length)
        )  # (seq_length, batch_size, input_dim)

        # Transpose back to (batch_size, seq_length, input_dim)
        transformer_output = transformer_output.permute(1, 0, 2)

        # Predict motion parameters
        root_pose = self.fc_root(transformer_output)           # (batch_size, seq_length, 3)
        body_pose = self.fc_body(transformer_output)           # (batch_size, seq_length, 63)
        left_hand_pose = self.fc_left_hand(transformer_output) # (batch_size, seq_length, 45)
        right_hand_pose = self.fc_right_hand(transformer_output) # (batch_size, seq_length, 45)

        return root_pose, body_pose, left_hand_pose, right_hand_pose

# Function to load the trained model and embeddings
def load_model_and_embeddings(model_path, embedding_path):
    # Load the trained Transformer model
    model = TextToMotionTransformer(input_dim=768)  # Ensure input_dim matches the one used during training
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()
    
    # Load embeddings and attention masks
    saved_data = torch.load(embedding_path, map_location=device)
    train_embeddings = saved_data['train_embeddings']
    train_attention_masks = saved_data['train_attention_masks']
    test_embeddings = saved_data['test_embeddings']
    test_attention_masks = saved_data['test_attention_masks']
    validation_embeddings = saved_data['validation_embeddings']
    validation_attention_masks = saved_data['validation_attention_masks']
    
    return model, train_embeddings, train_attention_masks, test_embeddings, test_attention_masks, validation_embeddings, validation_attention_masks

# Specify the paths for the model and embeddings
#model_path = r"C:\\Users\Admin\\Desktop\\text to motion transformer\\text_to_motion_transformer.pth"
#embedding_path = r"C:\\Users\\Admin\\Desktop\\text to motion transformer\\embeddings_and_masks.pth"  # Update with actual path to your embeddings

# Load the model and embeddings
#model, train_embeddings, train_attention_masks, test_embeddings, test_attention_masks, validation_embeddings, validation_attention_masks = load_model_and_embeddings(model_path, embedding_path)

Using device: cuda


In [5]:
# File paths for the saved model and embeddings
model_path = "text_to_motion_transformer.pth"
embedding_path = "embeddings_and_masks.pth"

# Load the model and embeddings
model, train_embeddings, train_attention_masks, test_embeddings, test_attention_masks, validation_embeddings, validation_attention_masks = load_model_and_embeddings(model_path, embedding_path)
print("Model and embeddings loaded successfully!")


  model.load_state_dict(torch.load(model_path, map_location=device))
  saved_data = torch.load(embedding_path, map_location=device)


Model and embeddings loaded successfully!


In [6]:
# Load the BERT tokenizer and model (same as used during training)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.to(device)
bert_model.eval()

# Function to generate BERT embeddings for a given text input
def generate_bert_embeddings(text):
    # Tokenize and generate token embeddings
    inputs = tokenizer(
        text,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=128,  # Ensure this matches the MAX_SEQ_LENGTH used during training
        return_attention_mask=True
    )
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = bert_model(**inputs)

    # Get the token embeddings (last hidden state)
    token_embeddings = outputs.last_hidden_state.squeeze(0)  # Remove batch dimension
    attention_mask = inputs['attention_mask'].squeeze(0)     # Remove batch dimension

    return token_embeddings, attention_mask




In [7]:
# Example text input
text_input = "And just let those fingers relax"

# Generate BERT embeddings for the input text
text_embeddings, attention_mask = generate_bert_embeddings(text_input)

# Reshape embeddings and attention mask to match the expected input format
text_embeddings = text_embeddings.unsqueeze(0)  # Shape: (1, seq_length, hidden_size)
attention_mask = attention_mask.unsqueeze(0)    # Shape: (1, seq_length)

# Move to the device
text_embeddings = text_embeddings.to(device)
attention_mask = attention_mask.to(device)

# Pass through the model to generate motion parameters
with torch.no_grad():
    root_pose_pred, body_pose_pred, left_hand_pred, right_hand_pred = model(text_embeddings, attention_mask)

# Print the predicted motion parameters
print("Left Hand Pose Prediction:\n", left_hand_pred.cpu().numpy())
print("Right Hand Pose Prediction:\n", right_hand_pred.cpu().numpy())
print("Root Pose Prediction:\n", root_pose_pred.cpu().numpy())
print("Body Pose Prediction:\n", body_pose_pred.cpu().numpy())

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Left Hand Pose Prediction:
 [[[-2.1505181e-03  1.1478206e-03 -4.1114371e-03 ... -8.7986584e-05
    6.2221894e-05  9.1558543e-04]
  [-2.1505186e-03  1.1478201e-03 -4.1114353e-03 ... -8.7985885e-05
    6.2221778e-05  9.1558555e-04]
  [-2.1505181e-03  1.1478201e-03 -4.1114353e-03 ... -8.7986351e-05
    6.2221778e-05  9.1558555e-04]
  ...
  [-2.1505181e-03  1.1478201e-03 -4.1114353e-03 ... -8.7986235e-05
    6.2221545e-05  9.1558549e-04]
  [-2.1505184e-03  1.1478201e-03 -4.1114353e-03 ... -8.7986002e-05
    6.2221894e-05  9.1558538e-04]
  [-2.1505184e-03  1.1478201e-03 -4.1114353e-03 ... -8.7986118e-05
    6.2221894e-05  9.1558549e-04]]]
Right Hand Pose Prediction:
 [[[-0.00162534  0.00117521  0.00440383 ...  0.0003007   0.00041492
   -0.00052171]
  [-0.00162534  0.00117521  0.00440383 ...  0.0003007   0.00041492
   -0.00052171]
  [-0.00162534  0.00117521  0.00440383 ...  0.0003007   0.00041492
   -0.00052171]
  ...
  [-0.00162534  0.00117521  0.00440383 ...  0.0003007   0.00041492
   -0.0

In [8]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import torch

# Function to visualize the motion sequence in 3D with debugging
def visualize_motion_sequence_debug(root_pose, body_pose, left_hand_pose, right_hand_pose):
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    
    # Print out the values for debugging
    print("Root Pose Sample Frame Values:\n", root_pose[0].cpu().numpy())
    print("Body Pose Sample Frame Values:\n", body_pose[0].cpu().numpy())
    print("Left Hand Pose Sample Frame Values:\n", left_hand_pose[0].cpu().numpy())
    print("Right Hand Pose Sample Frame Values:\n", right_hand_pose[0].cpu().numpy())
    
    # Iterate over frames to plot motion sequence
    for frame_idx in range(root_pose.shape[0]):
        ax.clear()

        # Convert tensors to CPU and then to NumPy for visualization
        root_pos_cpu = root_pose[frame_idx].cpu().numpy()
        body_pos_cpu = body_pose[frame_idx].cpu().numpy()
        left_hand_pos_cpu = left_hand_pose[frame_idx].cpu().numpy()
        right_hand_pos_cpu = right_hand_pose[frame_idx].cpu().numpy()

        # Plot root position (increase point size 's')
        ax.scatter(root_pos_cpu[0], root_pos_cpu[1], root_pos_cpu[2], c='r', s=100, label='Root Pose')

        # Plot body pose (every 3rd value to get x, y, z positions)
        ax.scatter(body_pos_cpu[::3], body_pos_cpu[1::3], body_pos_cpu[2::3], c='b', s=50, label='Body Pose')

        # Plot left hand pose (every 3rd value)
        ax.scatter(left_hand_pos_cpu[::3], left_hand_pos_cpu[1::3], left_hand_pos_cpu[2::3], c='g', s=50, label='Left Hand Pose')

        # Plot right hand pose (every 3rd value)
        ax.scatter(right_hand_pos_cpu[::3], right_hand_pos_cpu[1::3], right_hand_pos_cpu[2::3], c='m', s=50, label='Right Hand Pose')

        # Set expanded axis limits to ensure visibility
        ax.set_xlim([-1, 1])  # Adjust based on your data range
        ax.set_ylim([-1, 1])  # Adjust based on your data range
        ax.set_zlim([-1, 1])  # Adjust based on your data range
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_zlabel('Z')
        ax.set_title(f"Frame {frame_idx + 1}")
        plt.legend()

        # Pause to create an animation effect
        plt.pause(0.5)  # Increase pause time for better observation

    #plt.show()

# Call the visualization function
# visualize_motion_sequence_debug(
#     root_pose_pred[0],        # Shape: (seq_length, 3)
#     body_pose_pred[0],        # Shape: (seq_length, 63)
#     left_hand_pred[0],        # Shape: (seq_length, 45)
#     right_hand_pred[0]        # Shape: (seq_length, 45)
# )


In [10]:
import smplx
import torch

# Set paths and device
model_path = "C:\\Users\\Admin\\Desktop\\text to motion transformer\\models\\smplx\\SMPLX_MALE.pkl"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load SMPL-X model
#model = smplx.SMPLX(model_path=model_path, gender='neutral', use_pca=False)
model = smplx.SMPLX(model_path=model_path, gender='neutral', use_pca=False, ext='pkl')
model = model.to(device)


In [21]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import cv2
import smplx

# Example: Text input
text_input = "And just let those fingers relax"

# Generate BERT embeddings for the input text
text_embeddings, attention_mask = generate_bert_embeddings(text_input)

# Reshape embeddings to match the expected format
text_embeddings = text_embeddings.unsqueeze(0)  # Shape: (1, seq_len, hidden_size)
attention_mask = attention_mask.unsqueeze(0)    # Shape: (1, seq_len)

# Check shapes
print(f"Text Embeddings Shape: {text_embeddings.shape}")
print(f"Attention Mask Shape: {attention_mask.shape}")

# Load SMPL-X model (adjust the path as necessary)
model_path = "C:\\Users\\Admin\\Desktop\\text to motion transformer\\models\\smplx\\SMPLX_MALE.pkl"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = smplx.SMPLX(model_path=model_path, gender='neutral', use_pca=False, ext='pkl')
model = model.to(device)

# Pass through the model to generate motion parameters
with torch.no_grad():
    root_pose_pred, body_pose_pred, left_hand_pred, right_hand_pred = model(text_embeddings, attention_mask)

# Check shapes for predicted pose parameters
print(f"Root Pose Shape: {root_pose_pred.shape}")        # Shape: (seq_len, 3)
print(f"Body Pose Shape: {body_pose_pred.shape}")        # Shape: (seq_len, 63)
print(f"Left Hand Pose Shape: {left_hand_pred.shape}")   # Shape: (seq_len, 45)
print(f"Right Hand Pose Shape: {right_hand_pred.shape}") # Shape: (seq_len, 45)

# Video generation function
def generate_motion_video(root_pose, body_pose, left_hand_pose, right_hand_pose):
    canvas_width, canvas_height = 600, 600
    video_filename = "motion_video.avi"
    
    # Set up video writer (OpenCV)
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    video_writer = cv2.VideoWriter(video_filename, fourcc, 20.0, (640, 480))

    for frame_idx in range(root_pose.shape[0]):
        fig = plt.figure(figsize=(6, 6))
        ax = fig.add_subplot(111, projection='3d')

        # Plotting root pose, body pose, and hand poses
        ax.scatter(root_pose[frame_idx, 0], root_pose[frame_idx, 1], root_pose[frame_idx, 2], c='r', s=100, label='Root Pose')
        ax.scatter(body_pose[frame_idx, ::3], body_pose[frame_idx, 1::3], body_pose[frame_idx, 2::3], c='b', s=50, label='Body Pose')
        ax.scatter(left_hand_pose[frame_idx, ::3], left_hand_pose[frame_idx, 1::3], left_hand_pose[frame_idx, 2::3], c='g', s=50, label='Left Hand Pose')
        ax.scatter(right_hand_pose[frame_idx, ::3], right_hand_pose[frame_idx, 1::3], right_hand_pose[frame_idx, 2::3], c='m', s=50, label='Right Hand Pose')

        ax.set_xlim([-1, 1])
        ax.set_ylim([-1, 1])
        ax.set_zlim([-1, 1])
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_zlabel('Z')
        ax.set_title(f"Frame {frame_idx + 1}")
        plt.legend()

        # Capture the current frame as an image
        plt_img = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        plt_img = plt_img.reshape(canvas_height, canvas_width, 3)

        print(f"Captured image size: {plt_img.size}, Expected size: {canvas_height * canvas_width * 3}")

        # Resize image to match the video frame size
        resized_img = cv2.resize(plt_img, (640, 480))

        # Write the resized image to the video
        video_writer.write(resized_img)
        
        # Close the plot
        plt.close(fig)

    # Release the video writer
    video_writer.release()
    print("Video generation complete.")

# Example: Call the function with predicted pose data
generate_motion_video(root_pose_pred.cpu().numpy(), body_pose_pred.cpu().numpy(), left_hand_pred.cpu().numpy(), right_hand_pred.cpu().numpy())


Text Embeddings Shape: torch.Size([1, 1, 8, 768])
Attention Mask Shape: torch.Size([1, 1, 8])


RuntimeError: shape '[-1, 1, 3]' is invalid for input of size 8