## Extract Multi-modal Emotion Features

### Config Setting

In [None]:
import clip
import torch
import os

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Name of the dataset the fine-tuned model was trained on. It selects both the
# checkpoint directory and the feature output directory below.
DATASET = "ESD"  # other values: "RAF", "MELD", "MEADTTS"

# Project root: two directory levels above the current working directory, so
# the result depends on where the notebook is launched from. os.path.dirname
# is used instead of splitting on os.sep and re-joining under '/', which only
# produces a valid absolute path on POSIX systems.
PROJECT_PATH = os.path.dirname(os.path.dirname(os.getcwd()))
# Name of the pretrained CLIP model, e.g. ViT-B/32.
PRETRAIN_MODEL = "ViT-B/32"
# Directory holding (or receiving) the pretrained weights,
# e.g. EPAlign/ckpt/base.
PRETRAIN_MODEL_PATH = f"{PROJECT_PATH}/EPAlign/ckpt/base"
# Path of the fine-tuned checkpoint, e.g. EPAlign/ckpt/ESD/best_model.pt.
FINETUNE_MODEL = f"{PROJECT_PATH}/EPAlign/ckpt/{DATASET}/best_model.pt"
# Directory the extracted emotion features are written to,
# e.g. EPAlign/mmefeature/ESD. Created up front so later saves cannot fail on
# a missing directory.
EMO_FEATURE_SAVE_PATH = f"{PROJECT_PATH}/EPAlign/mmefeature/{DATASET}"
os.makedirs(EMO_FEATURE_SAVE_PATH, exist_ok=True)

### Load Model

In [None]:
# Build the base CLIP model. download_root is the directory clip.load uses for
# the pretrained weights; jit=False requests the regular (non-TorchScript)
# model so that a state dict can be loaded into it.
model, preprocess = clip.load(
    PRETRAIN_MODEL,
    device=device,
    jit=False,
    download_root=PRETRAIN_MODEL_PATH,
)
# Overwrite the base weights with the fine-tuned checkpoint. map_location
# remaps the stored tensors onto the selected device; without it a checkpoint
# saved from a GPU cannot be deserialized on a CPU-only machine, even though
# `device` is chosen with a CPU fallback.
model.load_state_dict(torch.load(FINETUNE_MODEL, map_location=device))

### Prompt Setting

In [None]:
# Emotion categories to encode. Every entry becomes one text prompt.
# NOTE(review): this five-class set presumably corresponds to DATASET = "ESD";
# confirm, and switch to one of the alternatives below for other datasets.
labels = ["neutral", "happy", "sad", "angry", "surprise"]

# Alternative 1: intensity-prefixed labels.
# NOTE(review): "strong clam" looks like a typo for "strong calm" — confirm
# before re-enabling.
# labels = ["normal neutral", "normal calm", "normal happy", "normal sad",
#           "normal angry", "normal fearful", "normal disgust",
#           "normal surprised", "strong clam", "strong happy", "strong sad",
#           "strong angry", "strong fearful", "strong disgust",
#           "strong surprised"]

# Alternative 2: three-level labels. Each "<emotion>_level_<n>" name is turned
# into "<adverb> <emotion>", e.g. "angry_level_3" -> "extremely angry".
# labels_old = ['angry_level_1', 'angry_level_2', 'angry_level_3',
#               'contempt_level_1', 'contempt_level_2', 'contempt_level_3',
#               'disgusted_level_1', 'disgusted_level_2', 'disgusted_level_3',
#               'fear_level_1', 'fear_level_2', 'fear_level_3',
#               'happy_level_1', 'happy_level_2', 'happy_level_3',
#               'neutral_level_1',
#               'sad_level_1', 'sad_level_2', 'sad_level_3',
#               'surprised_level_1', 'surprised_level_2', 'surprised_level_3']
# level2describe = {
#     "level_1": "slightly",
#     "level_2": "moderately",
#     "level_3": "extremely"
# }
# labels = [f"{level2describe[emotion.split('_')[1]+'_'+emotion.split('_')[2]]} {emotion.split('_')[0]}" for emotion in labels_old]

# Alternative prompt wording:
# prompt = [f'talk in {label} emotion' for label in labels]

# Sentence template wrapped around each label to form the text prompt.
PROMPT_TEMPLATE = 'A person speaking with a feeling of {label}.'
prompt = [PROMPT_TEMPLATE.format(label=name) for name in labels]

### Extract & Save mmefeature

In [None]:
# Tokenize all prompts in one batch and move the token tensor to the device
# the model runs on.
emo_prompt = clip.tokenize(prompt).to(device)

# Inference only: no gradients are needed for feature extraction.
with torch.no_grad():
    emo_prompt_features = model.encode_text(emo_prompt)
    # Scale every prompt embedding to unit L2 norm along the last dimension.
    emo_prompt_features /= emo_prompt_features.norm(dim=-1, keepdim=True)

# One chunk of size 1 along dim 0 per prompt, in the same order as `labels`.
emo_prompt_features = emo_prompt_features.split(1, dim=0)

for label, feature in zip(labels, emo_prompt_features):
    # The file is named after the last space-separated word of the label, so
    # two labels sharing their last word would overwrite each other.
    file_stem = label.split(' ')[-1]
    # split()/squeeze() return views, and torch.save serializes a view's whole
    # underlying storage. clone() gives each feature its own storage so every
    # file holds just one vector rather than the full feature matrix.
    torch.save(feature.squeeze().clone(), f"{EMO_FEATURE_SAVE_PATH}/{file_stem}.pt")

### Check Feature Shape

In [None]:
import os
# Sanity check: print the name of every saved feature file whose first
# dimension is not 512.
# NOTE(review): 512 is presumably the embedding width of the ViT-B/32 model
# configured above — update it if PRETRAIN_MODEL changes.
# Sorted for a reproducible listing order.
for file in sorted(os.listdir(EMO_FEATURE_SAVE_PATH)):
    # Only feature files are checked; sub-directories or unrelated files in
    # the folder would otherwise make torch.load raise.
    if not file.endswith(".pt"):
        continue
    # Load onto the CPU so the check also works without a GPU.
    efeature = torch.load(f"{EMO_FEATURE_SAVE_PATH}/{file}", map_location=torch.device('cpu'))
    if efeature.shape[0] != 512:
        print(file)