用torch构建一个transformer分类网络，输入是一张人脸的468个landmark，每个landmark有x,y,z，输出为八个表情中的一个，比如[0,1,0,0,0,0,0,0]。 使用 Xavier 正态初始化模型参数。数据集里用76个实例训练这个网络，用24个实例测试这个网络，最后用网络预测一个新的实例的表情结果

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class TransformerClassifier(nn.Module):
    def __init__(self, input_dim, num_classes, num_heads, num_layers, hidden_dim):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Linear(input_dim, hidden_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.softmax = nn.Softmax(dim=1)
        
        # Xavier 正态初始化
        nn.init.xavier_normal_(self.embedding.weight)
        nn.init.xavier_normal_(self.fc.weight)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x)
        x = x.mean(dim=1)  # 全局平均池化
        x = self.fc(x)
        return x # 返回 logits

# 参数设置
input_dim = 3  # 每个 landmark 的特征维度 (x, y, z)
num_classes = 8  # 表情类别数
num_heads = 8  # 多头注意力机制的头数
num_layers = 6  # Transformer 编码器层数
hidden_dim = 256  # 隐藏层维度

In [100]:
import os
import cv2
import mediapipe as mp
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelBinarizer

# 初始化列表来存储数据
data = []

# 初始化 Mediapipe 的 Face Landmarker
mp_face_mesh = mp.solutions.face_mesh
face_mesh = mp_face_mesh.FaceMesh(
    static_image_mode=False,
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)

# 遍历数据集目录
dataset_dir = r'C:\Users\larei\Desktop\face\facial_emotion_dataset-main'
for dataset_type in ['train', 'test']:
    dataset_type_dir = os.path.join(dataset_dir, dataset_type)
    for emotion in os.listdir(dataset_type_dir):
        emotion_dir = os.path.join(dataset_type_dir, emotion)
        if os.path.isdir(emotion_dir):
            for img_name in os.listdir(emotion_dir):
                img_path = os.path.join(emotion_dir, img_name)
                
                # 读取图像
                img = cv2.imread(img_path)
                img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                
                # 获取landmarks
                results = face_mesh.process(img_rgb)
                
                # 提取landmarks坐标
                if results.multi_face_landmarks:
                    landmarks = []
                    for face_landmarks in results.multi_face_landmarks:
                        # 提取每个面部的landmarks (x, y, z)
                        landmarks_per_face = [(lm.x, lm.y, lm.z) for lm in face_landmarks.landmark]
                        
                        # 计算该面部的平均x, y, z坐标  以鼻尖代替
                        avg_x = landmarks_per_face[30][0]
                        avg_y = landmarks_per_face[30][1]
                        avg_z = landmarks_per_face[30][2]
                        
                        # 计算每个关键点相对于鼻尖坐标的差值,扩大一千倍
                        normalized_landmarks = [(100*(lm[0] - avg_x), 100*(lm[1] - avg_y), 100*(lm[2] - avg_z)) for lm in landmarks_per_face]
                        
                        # 将差值landmarks保存
                        landmarks.extend(normalized_landmarks)
                    
                    # 如果没有检测到人脸，landmarks 为 None
                    landmarks = landmarks
                else:
                    landmarks = None
                
                # 将数据添加到列表
                data.append([dataset_type, img_name, img_path, emotion, landmarks])

# 创建DataFrame
df = pd.DataFrame(data, columns=['dataset_type', 'image_name', 'image_path', 'emotion', 'landmarks'])
df.dropna(inplace=True)  # 删除没有检测到人脸的数据




In [101]:
print(df.head())

  dataset_type    image_name  \
0        train   Anger_0.jpg   
1        train   Anger_1.jpg   
3        train  Anger_11.jpg   
5        train  Anger_13.jpg   
6        train   Anger_2.jpg   

                                          image_path emotion  \
0  C:\Users\larei\Desktop\face\facial_emotion_dat...   angry   
1  C:\Users\larei\Desktop\face\facial_emotion_dat...   angry   
3  C:\Users\larei\Desktop\face\facial_emotion_dat...   angry   
5  C:\Users\larei\Desktop\face\facial_emotion_dat...   angry   
6  C:\Users\larei\Desktop\face\facial_emotion_dat...   angry   

                                           landmarks  
0  [(18.31095516681671, 21.317780017852783, -4.59...  
1  [(14.778375625610352, 14.856821298599243, -1.0...  
3  [(25.27611255645752, 22.1016526222229, -6.2335...  
5  [(14.655089378356934, 25.069576501846313, -1.4...  
6  [(14.81524109840393, 22.831368446350098, 1.255...  


In [102]:
# 使用 LabelBinarizer 将情感标签转换为独热编码
lb = LabelBinarizer()
df['emotion_labels'] = lb.fit_transform(df['emotion']).tolist()

# 分离训练集和测试集
train_data = df[df['dataset_type'] == 'train']
test_data = df[df['dataset_type'] == 'test']

# 提取 landmarks 和 labels
train_landmarks = train_data['landmarks'].tolist()
train_labels = train_data['emotion_labels'].tolist()
test_landmarks = test_data['landmarks'].tolist()
test_labels = test_data['emotion_labels'].tolist()

# 创建自定义数据集类
class EmotionDataset(Dataset):
    def __init__(self, landmarks, labels):
        self.landmarks = landmarks
        self.labels = labels

    def __len__(self):
        return len(self.landmarks)

    def __getitem__(self, idx):
        landmark = self.landmarks[idx]
        label = self.labels[idx]
        return torch.tensor(landmark, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)

# 创建训练和测试数据集
train_dataset = EmotionDataset(train_landmarks, train_labels)
test_dataset = EmotionDataset(test_landmarks, test_labels)

# 创建数据加载器
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# 打印前几行数据

In [103]:
print(train_dataset[1])

(tensor([[14.7784, 14.8568, -1.0138],
        [18.1213, 11.6289, -6.3756],
        [17.1621, 12.7439, -1.8006],
        ...,
        [22.6101, -2.2847,  4.6187],
        [20.5004, -0.6265,  4.6187],
        [22.9680,  0.7595,  4.6187]]), tensor([0., 1., 0., 0., 0., 0., 0., 0.]))


In [104]:
# 初始化模型
model = TransformerClassifier(input_dim, num_classes, num_heads, num_layers, hidden_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

TransformerClassifier(
  (embedding): Linear(in_features=3, out_features=256, bias=True)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc): Linear(in_features=256, out_features=8, bias=True)
  (softmax): Softmax(dim=1)
)

In [105]:
# 设置优化器和损失函数
optimizer = optim.Adam(model.parameters(), lr=0.0001)
loss_fn = nn.CrossEntropyLoss()

# 训练过程
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for landmarks, labels in train_loader:
        landmarks, labels = landmarks.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        # 前向传播
        outputs = model(landmarks)
        loss = loss_fn(outputs, labels)
        
        # 反向传播
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

Epoch [1/10], Loss: 2.8472
Epoch [2/10], Loss: 2.6119
Epoch [3/10], Loss: 2.4026
Epoch [4/10], Loss: 2.2710
Epoch [5/10], Loss: 2.2211
Epoch [6/10], Loss: 2.1738
Epoch [7/10], Loss: 2.1247
Epoch [8/10], Loss: 2.1699
Epoch [9/10], Loss: 2.1680
Epoch [10/10], Loss: 2.1963


In [107]:
import numpy as np

# 测试过程
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for imgs, labels in test_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        
        outputs = model(imgs)
        _, preds = torch.max(outputs, 1)
        
        all_preds.append(preds.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# 打印预测结果和真实标签
print("Predictions:", all_preds)
print("True Labels:", np.argmax(all_labels, axis=1))

Predictions: [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
True Labels: [1 1 1 1 0 0 0 2 2 3 3 3 4 4 4 5 5 5 6 6 6 6 7 7]


In [22]:
# 假设 new_instance 是一个新的实例
new_instance = torch.randn(1, 468, 3).to(device)

# 使用模型进行预测
model.eval()
with torch.no_grad():
    output = model(new_instance)
    _, predicted = torch.max(output, 1)

print("Predicted class for new instance:", predicted.item())

Predicted class for new instance: 2


In [124]:
import cv2
import time
import torch
import mediapipe as mp

# 情感类别
emotions = ['Anger', 'Disgust', 'Fear', 'Happiness', 'Sadness', 'Surprise', 'Neutral']
# 初始化 Mediapipe 的 Face Landmarker
mp_face_mesh = mp.solutions.face_mesh
face_mesh = mp_face_mesh.FaceMesh(
    static_image_mode=False,
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)

# 捕捉摄像头视频流
cap = cv2.VideoCapture(0)

# 获取当前时间
last_capture_time = time.time()
    

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    # 获取当前时间
    current_time = time.time()
    # 每5秒取一次视频帧
    if current_time - last_capture_time >= 5:
        last_capture_time = current_time

        # 将图像转为 RGB 格式（MediaPipe 要求）
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = face_mesh.process(rgb_frame)

        if result.multi_face_landmarks:
            for face_landmarks in result.multi_face_landmarks:
                # 提取landmarks坐标
                landmarks = [(lm.x, lm.y, lm.z) for lm in face_landmarks.landmark]
                # 计算该面部的平均x, y, z坐标  以鼻尖代替
                avg_x = landmarks[30][0]
                avg_y = landmarks[30][1]
                avg_z = landmarks[30][2]
                            
                # 计算每个关键点相对于鼻尖坐标的差值
                normalized_landmarks =  [ (100*(lm[0] - avg_x), (100*(lm[1] - avg_y)), (100*(lm[2] - avg_z))) for lm in landmarks_per_face ]
                print(normalized_landmarks)
                normalized_landmarks = torch.tensor(normalized_landmarks, dtype=torch.float32).unsqueeze(0).to(device)
                
                # 预测类别
                with torch.no_grad():
                    outputs = model(normalized_landmarks)
                    _, preds = torch.max(outputs, dim=1)
                # 显示预测的情感
                predicted_emotion = emotions[preds.item()]
                print(f"Predicted Emotion: {predicted_emotion}")
                # 在视频中显示预测的情感
                cv2.putText(frame, f'Emotion: {predicted_emotion}', (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
    
    # 显示视频
    cv2.imshow("Emotion Detection", frame)

    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

# 释放资源
cap.release()
cv2.destroyAllWindows()


[(17.621922492980957, -1.0776817798614502, -2.2910410771146417), (17.34093427658081, -5.472433567047119, -4.439623304642737), (17.384380102157593, -4.1757166385650635, -2.2607352817431092), (16.62866473197937, -10.7977956533432, -3.565204911865294), (17.323148250579834, -7.029712200164795, -4.771327623166144), (17.310088872909546, -9.205469489097595, -4.516464495100081), (17.27951169013977, -14.789032936096191, -2.5489514926448464), (11.418253183364868, -15.502440929412842, 0.462838145904243), (17.19226837158203, -19.697070121765137, -2.1857243264093995), (17.15678572654724, -22.001361846923828, -2.4666125187650323), (17.05150604248047, -29.993468523025513, -2.099346392787993), (17.632973194122314, -0.6149768829345703, -2.1848526084795594), (17.64354109764099, -0.16070008277893066, -1.889075548388064), (17.652064561843872, 0.11895298957824707, -1.4360968256369233), (17.613816261291504, 3.89711856842041, -0.9408043930307031), (17.63109564781189, 4.6218931674957275, -0.9328703628852963),

KeyboardInterrupt: 

: 