In [1]:
import json
import os
import urllib
import urllib.parse
from pathlib import Path

from tqdm import tqdm

import paths

In [2]:
# Root output folder for the BLINK-format export; every file this notebook writes goes beneath it.
blink_data_folder_path = paths.OUTPUT_DATASETS_PATH / 'melart_blink'

In [3]:
# Folder scanned for per-candidate `.json` files (the file name, minus extension, is used as the candidate QID).
candidates_folder = paths.CANDIDATES_FOLDER_PATH

In [4]:
# Read every candidate JSON file from the candidates folder and build the
# entity catalogue (one document per candidate) plus lookup tables.
candidate_counter=0
candidate_objects=[]   # documents, in the order they are written to documents.jsonl
candidate_index={}     # QID string -> document dict
qid2id={}              # QID string -> position of the document in candidate_objects

# iterdir() yields entries in arbitrary, filesystem-dependent order. Sorting
# makes the ids assigned below reproducible from one run to the next.
for file in tqdm(sorted(candidates_folder.iterdir())):
    if not file.name.endswith('.json'):
        continue
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(file, 'r', encoding='utf-8') as f:
        candidate = json.load(f)
    qid = file.name.split('.')[0]

    # Build a document shaped like this:
    # {
    #     "document_id": 909036,
    #     "title": "Elon Musk",
    #     "text": "Elon Musk. <description>. Types: <comma separated type labels>",
    #     "type": "<comma separated type labels>"
    # }

    # Title: the main label, falling back to the first alternative label.
    labels = candidate.get('labels') or {}
    title = labels.get('main') or ''
    if not title:
        alt_labels = labels.get('alt') or []
        title = alt_labels[0] if alt_labels else ''

    # A null/missing description becomes an empty string rather than the
    # literal text "None" inside the document text.
    desc_text = candidate.get('description') or ''
    types_str = ", ".join((candidate.get('types') or {}).values())

    obj = {
        'document_id': int(qid[1:]),  # numeric part of the QID (leading letter dropped)
        'title': title,
        'text': f"{title}. {desc_text}. Types: {types_str}",
        'type': types_str,
    }
    candidate_objects.append(obj)
    candidate_index[qid]=obj
    qid2id[qid]=candidate_counter
    candidate_counter+=1

# NOTE: despite its name, this table is keyed by the numeric document_id,
# not by the QID string.
qid2line_number={}

# Write the candidate documents as JSON Lines, one document per line.
documents_folder_path = blink_data_folder_path / 'documents'
documents_folder_path.mkdir(parents=True, exist_ok=True)
counter=0
with open(documents_folder_path / 'documents.jsonl', 'w', encoding='utf-8') as f:
    for obj in candidate_objects:
        qid2line_number[obj['document_id']]=counter
        f.write(json.dumps(obj))
        f.write('\n')
        counter+=1

100%|██████████| 53901/53901 [00:01<00:00, 45729.42it/s]


In [5]:
# Number of candidate documents that were indexed by the loop above.
candidate_counter

53901

Read the file with the paintings and the sentence mentions

In [6]:
# Load the combined painting annotations into `paintings`.
# Presumably a mapping of painting QID -> annotation object; later cells
# iterate it with `paintings.items()`.
paintings_file = paths.COMBINED_ANNOTATIONS_PATH
# Read as UTF-8 explicitly instead of relying on the platform default encoding.
with open(paintings_file, 'r', encoding='utf-8') as f:
    paintings = json.load(f)

In [7]:
# Folder in which painting image files are looked up by their (URL-decoded) file name.
painting_images_path = paths.MELART_IMAGES_PATH

In [8]:
# Build one mention record per entity-linking match found in the painting
# annotations. Paintings without a usable local image are skipped, and so
# are matches whose target entity is not in the candidate catalogue.
mention_objects=[]            # every mention record, across all paintings
paintings_mentions_index={}   # painting QID -> list of its mention records

# Each match field is index-aligned with the field that holds its sentences.
sentence_field_by_match_field = {
    "visual_el_matches": "visual_sentences",
    "contextual_el_matches": "contextual_sentences",
}

for qid,painting_obj in tqdm(paintings.items()):
    img_url=painting_obj.get("img_url",None)
    if not img_url: # this check is to make the MIMIC and BLINK data compatible
        print(f'no image for {qid}')
        continue
    # The image file name is the last URL segment, percent-decoded.
    img_file_name=img_url.split('/')[-1]
    img_path=painting_images_path / Path(urllib.parse.unquote(img_file_name))
    if not img_path.exists():
        print(f'no image in dict for {qid} ({img_file_name})')
        continue

    for matches_field,sentences_field in sentence_field_by_match_field.items():
        for i,el_matches in enumerate(painting_obj[matches_field]):
            if not el_matches:
                continue
            sentence=painting_obj[sentences_field][i]
            for el_match in el_matches:
                # The match QID is the last path segment of the entity reference.
                match_qid=el_match['qid'].split('/')[-1]
                # Explicit membership test rather than a bare `except`, so that
                # unrelated errors (and Ctrl-C) are no longer silently swallowed.
                if match_qid not in qid2id:
                    print(f'no candidate for {qid} trying to match {match_qid}')
                    continue
                candidate_doc=candidate_index[match_qid]
                obj={
                    "label_id": qid2id[match_qid],
                    'mention': el_match['text'],
                    'label': candidate_doc['text'],
                    'label_title': candidate_doc['title'],
                    # Context is the sentence text on either side of the mention span.
                    'context_left': sentence[:el_match["start"]],
                    'context_right': sentence[el_match["end"]:],
                    'world': "undefined",
                }
                mention_objects.append(obj)
                paintings_mentions_index.setdefault(qid,[]).append(obj)

100%|██████████| 1616/1616 [00:00<00:00, 22549.27it/s]


In [9]:
# split the paintings according to the split property in the painting objects
train_paintings = {}
dev_paintings = {}
test_paintings = {}

for qid, painting_obj in paintings.items():
    if painting_obj['split'] == 'train':
        train_paintings[qid] = painting_obj
    elif painting_obj['split'] == 'val':
        dev_paintings[qid] = painting_obj
    elif painting_obj['split'] == 'test':
        test_paintings[qid] = painting_obj

# report sizes
print(f'train: {len(train_paintings)}, dev: {len(dev_paintings)}, test: {len(test_paintings)}')


train: 1188, dev: 328, test: 100


In [10]:
# write 3 json files called MELART_train.json, MELART_dev.json, MELART_test.json inside the MELART folder, using the paintings_mentions_index
blink_format_folder_path = blink_data_folder_path / 'blink_format'
blink_format_folder_path.mkdir(parents=True, exist_ok=True)
with open(blink_format_folder_path / 'train.jsonl', 'w') as f:
    #find all mentions for the train paintings
    train_mentions=[]
    counter=0
    for qid,painting_obj in train_paintings.items():
        mentions=paintings_mentions_index.get(qid,[])
        if len(mentions)>0:
            train_mentions.extend(mentions)
            counter+=1
    print(f'found {counter} paintings with mentions for train')
    for mention in train_mentions:
        f.write(json.dumps(mention))
        f.write('\n')
with open(blink_format_folder_path / 'valid.jsonl', 'w') as f:
    #find all mentions for the dev paintings
    dev_mentions=[]
    counter=0
    for qid,painting_obj in dev_paintings.items():
        mentions=paintings_mentions_index.get(qid,[])
        if len(mentions)>0:
            dev_mentions.extend(mentions)
            counter+=1
    print(f'found {counter} paintings with mentions for dev')
    for mention in dev_mentions:
        f.write(json.dumps(mention))
        f.write('\n')
with open(blink_format_folder_path / 'test.jsonl', 'w') as f:
    #find all mentions for the test paintings
    test_mentions=[]
    counter=0
    for qid,painting_obj in test_paintings.items():
        mentions=paintings_mentions_index.get(qid,[])
        if len(mentions)>0:
            test_mentions.extend(mentions)
            counter+=1
    print(f'found {counter} paintings with mentions for test')
    for mention in test_mentions:
        f.write(json.dumps(mention))
        f.write('\n')


found 1188 paintings with mentions for train
found 328 paintings with mentions for dev
found 100 paintings with mentions for test


In [11]:
#print sizes
print(f'train: {len(train_mentions)}, dev: {len(dev_mentions)}, test: {len(test_mentions)}')

train: 4632, dev: 1308, test: 645
