In [6]:
# Identifier of the article to process. It is matched as a substring against the
# keys of literature_metadata.json and used as the base name of the mapping CSV.
pdf_name = "120_Rouiller_2004"

In [7]:
import json
import os

# Load the literature metadata. JSON is UTF-8 by specification, so the encoding
# is stated explicitly instead of relying on the platform default.
fn = os.path.join('.', 'metadata_extraction', 'literature_metadata.json')
with open(fn, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Select the first entry (in file order) whose key contains pdf_name.
# Note this is a substring match, not an exact key lookup.
for key in json_data:
    if pdf_name in key:
        article_data = json_data[key]
        break
else:
    # for/else: this branch runs only when the loop ended without `break`,
    # i.e. no key matched. Failing here prevents a stale `article_data` from an
    # earlier run of the notebook being reused silently.
    raise Exception(f"pdf name not found: no key containing {pdf_name!r} in {fn}")

# Article-level fields used later to fill the header of the mapping table.
article_info = article_data["Article_Info"]
DOI_URL = article_info["DOI-URL"]
Title = article_info["Title"]
Injection_and_Labeling = article_data["Injection_and_Labeling"]

# One author-given injection site name per injection entry (duplicates kept for now).
injection_sites = [
    entry["Injection"]["InjectionSite_by_Author"]
    for entry in Injection_and_Labeling
]

# Every author-given labeled site name, flattened across all injection entries.
labeled_sites = [
    target["LabeledSite_by_Author"]
    for entry in Injection_and_Labeling
    for target in entry["LabeledSites"]
]

# De-duplicate the injection sites while keeping their first-seen order.
injection_sites = list(dict.fromkeys(injection_sites))

# De-duplicate the labeled sites and order them alphabetically.
labeled_sites = sorted(dict.fromkeys(labeled_sites))

# Names that occur both as an injection site and as a labeled site.
both_sites = list(set(injection_sites) & set(labeled_sites))

# Such names stay in injection_sites only. Each of them is present exactly once
# in the de-duplicated labeled_sites, so a plain remove() cannot fail.
for site in both_sites:
    labeled_sites.remove(site)

# Final ordering: injection sites first, then the remaining labeled sites.
list_of_sites = [*injection_sites, *labeled_sites]
        
# Write the site names, one per line, to areas_to_map.txt in the current working
# directory. UTF-8 is stated explicitly because the names come from a JSON file
# and may contain non-ASCII characters.
# NOTE(review): an unused path built from 'metadata_extraction', pdf_name and
# '.txt' as separate components (i.e. "<pdf_name>/.txt") was removed here. If a
# per-article output file is wanted instead, build it with pdf_name + '.txt' and
# open that path - confirm the intended destination.
with open('areas_to_map.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(list_of_sites))

# labeled_sites no longer contains the names that are also injection sites, so
# the first count is the sum of the other two.
print("Number of sites:", len(list_of_sites))
print("Number of injection sites:", len(injection_sites))
print("Number of labeled sites:", len(labeled_sites))

7
1
6


In [8]:
# write the data into csv file
import pandas as pd
import sys

# Column labels applied to the header-less, tab-separated mapping file.
csv_columns = ['Area name', 'Area name explained', 'Area type (Injection site/Labeled site/Both sites)', 'Relation (part of/same as/sum of)', 'Mapped area name', 'Mapping confidence (high/medium/low)', 'Mapping references', 'Mapped by (D/R/A)', 'Comment', 'Figures', 'PRs']
AREA_TYPE_COLUMN = 'Area type (Injection site/Labeled site/Both sites)'

# Row label of the first site entry; rows above it are not touched by the loop.
# NOTE(review): presumably those rows hold the article-level header of the
# mapping template - confirm against the template file.
FIRST_SITE_ROW = 10

# The mapping file is read, updated and written back to the same path.
fn = os.path.join('.', 'mapping', pdf_name + '.csv')
data_frame = pd.read_csv(fn, header=None, sep="\t")

# Raises ValueError if the file does not have exactly len(csv_columns) columns.
data_frame.columns = csv_columns

# Second column of the first two rows receives the DOI URL and the title.
data_frame.iloc[0, 1] = DOI_URL
data_frame.iloc[1, 1] = Title

for row, site_name in enumerate(list_of_sites, start=FIRST_SITE_ROW):
    # Classify before writing so a failure does not leave a half-filled row.
    if site_name in both_sites:
        area_type = 'Both sites'
    elif site_name in injection_sites:
        area_type = 'Injection site'
    elif site_name in labeled_sites:
        area_type = 'Labeled site'
    else:
        # Report the site actually being processed, not a loop variable left
        # over from another cell.
        raise Exception(f"The site: {site_name} is not in injection_sites or labeled_sites or both_sites")

    # .loc with a row label that does not exist yet appends that row.
    data_frame.loc[row, 'Area name'] = site_name
    data_frame.loc[row, 'Mapped by (D/R/A)'] = 'D'
    data_frame.loc[row, AREA_TYPE_COLUMN] = area_type

# Write the table back in the same format it was read (no header, no index).
data_frame.to_csv(fn, index=False, header=False, sep="\t")

In [5]:
# # read string from txt file
# with open('injection_labeled_sites.txt', 'r') as f:
#     given_string = f.read()
    
# # separate string using \n
# list_of_sites = given_string.splitlines()

# # remove empty strings
# list_of_sites = [x for x in list_of_sites if x]

# # find all injection sites as a list that appear as the next element of "InjectionSite:"
# injection_site = []
# for i in range(len(list_of_sites)):
#     if list_of_sites[i] == 'InjectionSite:':
#         injection_site.append(list_of_sites[i+1])
        
# # remove the duplicates in the injection_site
# injection_site = list(dict.fromkeys(injection_site))
        
# # find all labeled sites as a list that appear as the next elements after "LabeledSites:" and before "Injection:" or end of the list
# labeled_site = []
# for i in range(len(list_of_sites)):
#     if list_of_sites[i] == 'LabeledSites:':
#         for j in range(i+1, len(list_of_sites)):
#             if list_of_sites[j] == 'Injection:':
#                 break
#             else:
#                 labeled_site.append(list_of_sites[j])

# # remove "R:" and "A:" in the labeled list
# for ele in labeled_site:
#     if ele == 'R:':
#         labeled_site.remove('R:')
#     elif ele == 'A:':
#         labeled_site.remove('A:')
#     else:
#         pass

# # remove the duplicates in the labeled_site
# labeled_site = list(dict.fromkeys(labeled_site))

# # rank the labeled_site alphabetically
# labeled_site.sort()

# # merge injection_site and labeled_site
# list_of_sites = injection_site + labeled_site

# print(list_of_sites)
# print(len(list_of_sites))
# print(len(injection_site))
# print(len(labeled_site))

# # merge all elements in the list, separated by \n, into a string and write it back to the txt file
# with open('areas_to_map.txt', 'w') as f:
#     f.write('\n'.join(list_of_sites))