-
Notifications
You must be signed in to change notification settings - Fork 1
/
inference.py
81 lines (73 loc) · 2.72 KB
/
inference.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from exmcr.exmcr_model import Ex_MCR
from exmcr.exmcr_model import ModalityType, MCRType
import torch
input = {ModalityType.VISION: ['assets/toilet.jpeg',
'assets/BBQ.jpeg',
'assets/train.jpeg'],
ModalityType.TEXT: ['a toilet',
'BBQ',
'a train'],
ModalityType.AUDIO:['assets/toilet.wav',
'assets/BBQ.wav',
'assets/train.wav'],
ModalityType.PC:['assets/toilet.npy',
'assets/BBQ.npy',
'assets/train.npy']
}
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = Ex_MCR(device=device)
# you can get single modality embeddings by using these functions
# v_emb = model.get_vision_embedding(input)
# t_emb = model.get_text_embedding(input)
# a_emb = model.get_audio_embedding(input)
# p_emb = model.get_3d_embedding(input)
embeddings = model.get_embeddings_in_base_mcr(input)
print(
"Vision x Text:\n",
torch.softmax(embeddings[ModalityType.VISION] @ embeddings[ModalityType.TEXT].T * 10.0, dim=-1)
)
print(
"Audio x Text:\n",
torch.softmax(embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T * 10.0, dim=-1)
)
print(
"3D x VISION:\n",
torch.softmax(embeddings[ModalityType.PC] @ embeddings[ModalityType.VISION].T * 10.0, dim=-1)
)
print(
"Audio x Vision:\n",
torch.softmax(embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.VISION].T * 10.0, dim=-1)
)
print(
"3D x Text:\n",
torch.softmax(embeddings[ModalityType.PC] @ embeddings[ModalityType.TEXT].T * 10.0, dim=-1)
)
print(
"Audio x 3D:\n",
torch.softmax(embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.PC].T * 10.0, dim=-1)
)
# Expected output
# Vision x Text:
# tensor([[0.5737, 0.2242, 0.2021],
# [0.2454, 0.4756, 0.2790],
# [0.2541, 0.2530, 0.4929]], device='cuda:3')
# Audio x Text:
# tensor([[0.9115, 0.0369, 0.0516],
# [0.1443, 0.7265, 0.1292],
# [0.1000, 0.0960, 0.8040]], device='cuda:3')
# 3D x VISION:
# tensor([[0.8635, 0.0850, 0.0515],
# [0.0761, 0.9125, 0.0114],
# [0.1239, 0.0781, 0.7980]], device='cuda:3')
# Audio x Vision:
# tensor([[0.9060, 0.0329, 0.0611],
# [0.0743, 0.8742, 0.0515],
# [0.0648, 0.0227, 0.9126]], device='cuda:3')
# 3D x Text:
# tensor([[0.8052, 0.0909, 0.1039],
# [0.2447, 0.6035, 0.1518],
# [0.1757, 0.2094, 0.6149]], device='cuda:3')
# Audio x 3D:
# tensor([[0.9860, 0.0038, 0.0102],
# [0.0078, 0.9875, 0.0048],
# [0.0204, 0.0021, 0.9775]], device='cuda:3')