In [1]:
import logging
import os
import random
import sys

import torch
import torch.nn as nn
import torch.optim as optim
from tensorboardX import SummaryWriter
from torch.nn.modules.loss import CrossEntropyLoss
from torch.utils.data import DataLoader, ConcatDataset
from torchvision import transforms
from tqdm import tqdm
import argparse
import os
import random
import numpy as np
import time
import gc
# from networks.vision_transformer import SwinUnet as ViT_seg

# from utils import DiceLoss
# from config import get_config
# from WG_dataset2 import PosNegDataset
from dataset import USDataset

import csv
# from model.encoder import DualImageContrastiveModel
# from model.accVoice_swinUnet import SwinTransformerSys as SwinUnet
# from model.CrossViT3 import ImageCrossViT
from model.stage2_model1 import Stage2Model
# from model.AccSeal_model_stage1 import AccSealModelStage1
# import networks.accVoice_vitUnet as VitUnet
from utils.loss import InfoNCELoss2, InfoNCELoss, BatchInfoNCELoss, FocalLoss
from matplotlib import pyplot as plt

# Single-GPU configuration: use the first CUDA device when one is available,
# otherwise fall back to the CPU.
ngpu = 1
use_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if (use_gpu and ngpu > 0) else "cpu")

# Multi-GPU configuration: candidate device ids; the first one is made the
# current CUDA device.
gpus = [0,1]
# Only select a CUDA device when CUDA actually exists. An unconditional
# set_device call raises on CPU-only machines, which would defeat the CPU
# fallback computed for `device` above.
if use_gpu:
    torch.cuda.set_device('cuda:{}'.format(gpus[0]))


def test_model(model,
               data_path=r"E:\dataset\ultrasound_video_audio\DATA\dataset\test_dataset.csv",
               output_path=r"E:\dataset\ultrasound_video_audio\record\stage2_test_dataset",
               model_path="./checkpoints/stage2/best_model.pth",
               batch_size=384):
    """Run inference over the test dataset and write per-sample results to disk.

    For every sample the first model output (``speaker_feature``) is saved as
    ``<output_path>/<parent folder of the sample>/<sample name>.npy`` and the
    second output (``user_verification``) is written as text to a ``.txt``
    file with the same stem. Per-batch inference time is printed.

    Args:
        model: Network to evaluate. It is called as
            ``model(ultrasound, video)`` and must return two tensors.
        data_path: CSV file passed to ``USDataset``.
        output_path: Root directory for the saved results.
        model_path: Checkpoint whose state dict is loaded into ``model``.
        batch_size: Number of samples per inference batch.

    The defaults reproduce the values that used to be hard-coded here, so
    ``test_model(model)`` behaves as before.
    """
    test_dataset = USDataset(data_path)
    print("====== Test Dataset Count: =======", len(test_dataset))

    # Single-device evaluation: move the model first, then load the weights
    # mapped onto that same device so a checkpoint saved on GPU can also be
    # restored on a CPU-only machine.
    model = model.to(device)
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    model.load_state_dict(torch.load(model_path, map_location=device))

    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    # Original file path and name of every sample, indexed in dataset order.
    # The loader does not shuffle, so a running index stays aligned with them.
    sample_paths = [test_dataset.get_audio_path(i) for i in range(len(test_dataset))]
    sample_names = [test_dataset.get_audio_name(i) for i in range(len(test_dataset))]

    on_cuda = device.type == "cuda"

    model.eval()
    with torch.no_grad():
        save_path_index = 0
        for batch in test_loader:
            # Move inputs to the same device as the model. non_blocking=True
            # allows an asynchronous host-to-device copy where supported.
            # (ultrasound is presumably [N, 1, 80, 500] -- TODO confirm.)
            modal_video = batch['video'].to(device, non_blocking=True)
            modal_ultrasound = batch['ultrasound'].to(device, non_blocking=True)

            # CUDA kernels launch asynchronously; synchronize before reading
            # the clock so the interval covers the real forward pass rather
            # than only the kernel launches.
            if on_cuda:
                torch.cuda.synchronize()
            time_start = time.perf_counter()
            speaker_feature, user_verification = model(modal_ultrasound, modal_video)
            if on_cuda:
                torch.cuda.synchronize()
            time_end = time.perf_counter()

            print("inference time: ", time_end - time_start)
            print("average inference time per sample: ", (time_end - time_start)/speaker_feature.shape[0])

            # squeeze(dim=1) only drops axis 1 when it has size 1.
            output = speaker_feature.squeeze(dim=1).cpu().numpy()
            output2 = user_verification.squeeze(dim=1).cpu().numpy()

            print("output shape: ", output.shape)
            for j in range(output.shape[0]):
                if save_path_index >= len(sample_names):  # ran past the end of the dataset
                    break
                print(f"Processing sample {save_path_index + 1}/{len(sample_names)}: {sample_names[save_path_index]}")

                # Mirror the sample's immediate parent folder under output_path.
                # Accept both '\\' and '/' separators so the split does not
                # depend on how the paths were written in the CSV.
                parts = sample_paths[save_path_index].replace('/', '\\').split('\\')
                folder2 = parts[-2] if len(parts) >= 2 else ""

                nested_dir = os.path.join(output_path, folder2)
                os.makedirs(nested_dir, exist_ok=True)

                output_file = os.path.join(nested_dir, f"{sample_names[save_path_index]}.npy")
                output_file2 = os.path.join(nested_dir, f"{sample_names[save_path_index]}.txt")

                # Save the speaker feature as .npy and the verification output
                # as its string representation.
                np.save(output_file, output[j])
                with open(output_file2, "w") as file:
                    file.write(str(output2[j]))

                save_path_index += 1

def main_test_CrossViT(feature_dim=512):
    """Build a ``Stage2Model`` and run the test-set inference pass on it.

    Args:
        feature_dim: Forwarded to ``Stage2Model(feature_dim=...)``. Defaults
            to 512, the value previously hard-coded in this function.
    """
    # The former locals T, batch_size and frames_per_sec were never read, so
    # they are gone; the loader batch size is decided inside test_model.
    model = Stage2Model(feature_dim=feature_dim)
    test_model(model)

In [2]:
# Run the evaluation: builds the Stage2Model and runs test_model on it.
main_test_CrossViT()

dataset path:  E:\dataset\ultrasound_video_audio\DATA\dataset\test_dataset.csv
inference time:  0.5846295356750488
average inference time per sample:  0.0038462469452305846
output shape:  (152, 50, 512)
Processing sample 1/152: db_seg_0149_us
Processing sample 2/152: whh_seg_0257_us
Processing sample 3/152: syk_seg_0110_us
Processing sample 4/152: db_seg_0048_us
Processing sample 5/152: whh_seg_0145_us
Processing sample 6/152: pmz_seg_0290_us
Processing sample 7/152: whh_seg_0219_us
Processing sample 8/152: ldx_seg_0051_us
Processing sample 9/152: whh_seg_0272_us
Processing sample 10/152: whh_seg_0286_us
Processing sample 11/152: db_seg_0257_us
Processing sample 12/152: zyc_seg_0033_us
Processing sample 13/152: ldx_seg_0060_us
Processing sample 14/152: whh_seg_0170_us
Processing sample 15/152: whh_seg_0107_us
Processing sample 16/152: whh_seg_0264_us
Processing sample 17/152: syk_seg_0206_us
Processing sample 18/152: syk_seg_0055_us
Processing sample 19/152: syk_seg_0006_us
Processing 