-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataset.py
142 lines (103 loc) · 4.63 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import os
from ast import literal_eval
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence
from PIL import Image
class MemeExpDataset(Dataset):
    """Dataset of memes: image + OCR text + context sentences + precomputed
    knowledge-graph embeddings.

    Each row of the CSV at ``dataset_path`` is expected to provide the
    columns ``ocr_text`` (string, possibly NaN), ``sentences`` (stringified
    list of strings), ``image`` (file name under ``image_dir``) and
    ``labels`` (stringified list).
    """

    def __init__(self, dataset_path, tokenizer, transform, image_dir, ftr_dir):
        """
        Args:
            dataset_path: path to the annotations CSV.
            tokenizer: HuggingFace-style tokenizer callable returning
                ``input_ids`` / ``attention_mask`` / ``token_type_ids``.
            transform: albumentations-style transform (invoked as
                ``transform(image=...)``) or ``None``.
            image_dir: directory containing the meme images.
            ftr_dir: directory with one ``{idx}.pt`` embedding per row.
        """
        self.dataset_path = dataset_path
        self.tokenizer = tokenizer
        self.transform = transform
        self.image_dir = image_dir
        self.ftr_dir = ftr_dir
        self.data = pd.read_csv(dataset_path)
        # Preload every KG embedding once, avoiding per-item disk reads.
        # NOTE(review): torch.load unpickles arbitrary objects; pass
        # weights_only=True if these files could come from an untrusted source.
        self.embeddings = [
            torch.load(os.path.join(self.ftr_dir, f"{idx}.pt"))
            for idx in range(len(self.data))
        ]

    def __len__(self):
        return len(self.data)

    def get_encoded_text(self, text: str):
        """Tokenize ``text``, padded/truncated to a fixed length of 80.

        Returns:
            Tuple ``(input_ids, attention_mask, token_type_ids)`` of 1-D
            tensors (the batch dimension is squeezed away).
        """
        encoded_inputs = self.tokenizer(
            text,
            padding="max_length",
            max_length=80,
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoded_inputs['input_ids'].squeeze()
        attention_mask = encoded_inputs['attention_mask'].squeeze()
        token_type_ids = encoded_inputs['token_type_ids'].squeeze()
        return input_ids, attention_mask, token_type_ids

    def __getitem__(self, idx):
        # Guard against a NaN/non-string cell BEFORE calling .split():
        # the original checked isinstance() after the join, where it always
        # passed, and a float NaN cell crashed earlier on .split('\n').
        raw_ocr = self.data.iloc[idx]['ocr_text']
        ocr_text = ' '.join(raw_ocr.split('\n')) if isinstance(raw_ocr, str) else ""
        texts = literal_eval(self.data.iloc[idx]['sentences'])
        texts = [text if isinstance(text, str) else "" for text in texts]
        kg_embs = self.embeddings[idx]
        # Image data
        image_name = self.data.iloc[idx]['image']
        image_path = os.path.join(self.image_dir, image_name)
        image = np.array(Image.open(image_path).convert('RGB'))
        if self.transform is not None:
            image = self.transform(image=image)['image']
        # Encoded OCR text
        input_ids, attention_mask, token_type_ids = self.get_encoded_text(ocr_text)
        # Encoded context sentences (variable count; padded later in collate_fn)
        encoded_texts = [self.get_encoded_text(text) for text in texts]
        context_input_ids = [ids for ids, _, _ in encoded_texts]
        context_attention_mask = [mask for _, mask, _ in encoded_texts]
        # Output label
        label = literal_eval(self.data.iloc[idx]['labels'])
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'context_input_ids': context_input_ids,
            'context_attention_mask': context_attention_mask,
            'label': label,
            'image': image,
            'num_sents': len(encoded_texts),
            'kg_embs': kg_embs,
        }
def collate_fn(batch):
    """Collate dataset items into batched tensors, padding each item's
    variable-length context-sentence list up to the batch-wide maximum.

    Args:
        batch: list of dicts as produced by ``MemeExpDataset.__getitem__``.

    Returns:
        dict of batched tensors; ``num_sents`` and ``label`` remain
        plain Python lists.
    """
    # Padding rows must match the real input_ids in BOTH shape and dtype.
    # The original torch.zeros(shape).int() made int32 pads, which
    # torch.stack rejects next to the tokenizer's int64 ids whenever a
    # batch actually needs padding.
    dummy_ids = torch.zeros_like(batch[0]['input_ids'])
    max_num_sents = max(item['num_sents'] for item in batch)
    # Pad every item's context inputs to max_num_sents sentences.
    ctx_input_ids_list = []
    ctx_attention_mask_list = []
    for item in batch:
        pad = [dummy_ids] * (max_num_sents - item['num_sents'])
        ctx_input_ids_list.append(torch.stack(item['context_input_ids'] + pad))
        ctx_attention_mask_list.append(torch.stack(item['context_attention_mask'] + pad))
    return {
        'input_ids': torch.stack([item['input_ids'] for item in batch]),
        'attention_mask': torch.stack([item['attention_mask'] for item in batch]),
        'token_type_ids': torch.stack([item['token_type_ids'] for item in batch]),
        'ctx_input_ids': torch.stack(ctx_input_ids_list),
        'ctx_attention_mask': torch.stack(ctx_attention_mask_list),
        'num_sents': [item['num_sents'] for item in batch],
        'image': torch.stack([item['image'] for item in batch]),
        'label': [item['label'] for item in batch],
        'kg_embs': torch.stack([item['kg_embs'] for item in batch]),
    }