In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common import NoSuchElementException, StaleElementReferenceException
from typing import Optional
import pickle
import csv
from selenium.webdriver.chrome.service import Service

In [4]:
import json
path = 'task1_train.jsonl'
data = []
with open(path, 'r') as fl:
    for line in fl:
        data.append(json.loads(line))

In [8]:
driver = webdriver.Edge()


In [9]:
def get_metadata(id: str):
    url = f'https://www.zora.uzh.ch/id/eprint/{id}/'
    print(f"navigating to {url}")
    driver.get(url)
    info = {}

    print("finding tables..")
    tables = driver.find_elements(By.CLASS_NAME, 'ep_block')
    print("finding rows..")
    table = tables[0]
    rows = table.find_elements(By.TAG_NAME, 'tr')

    # Iterate over the rows to extract the metadata.
    i = 0
    for row in rows:  # Skip the header row
        print(f"row {i} text: \'{row.text}\'.")
        # this is the key
        head = row.find_elements(By.TAG_NAME, 'th')[0].text
        # and this is the value
        cell = row.find_elements(By.TAG_NAME, 'td')[0].text
        info[head]=cell
        print(f"{head}: {cell}")
        i += 1
        # break
    return info

In [10]:
meta_data_fname = f'goals_metadata.json'

In [11]:
import os

if os.path.exists(meta_data_fname):
    with open(meta_data_fname, "r") as f:
        metas = json.load(f)
else:
    metas = {}
    
for i in range(len(data)):
    t = data[i]
    if metas.get(id) != None:
        continue
        
    id = t["ID"]
    actual_id = id.replace("oai:www.zora.uzh.ch:","")
    print(f"Processing paper {id} ({actual_id})")
    meta = get_metadata(actual_id)
    metas[id] = meta
    with open(meta_data_fname, "w") as fl:
        json.dump(metas, fl, indent=3)

In [12]:
len(metas)

430

In [13]:
# for each goal, gather the papers' categories that associate with that goal.

s = {}
for i in range(18):
    s[i]={}
    
for id, meta in metas.items():
    
    clf = [i for i in data if i["ID"] == id][0]
    sdg = int(clf["SDG"])
    
    for ml, mv in meta.items():
        if s[sdg].get(ml) is None:
            s[sdg][ml] = []
        
        b = mv.split('\n')
        for be in b:
            s[sdg][ml].append(be)


In [14]:
# now eliminate duplicate categories by turning the lists into sets.
for i in range(18):
    for ml, mvs in s[i].items():
        s[i][ml] = set(s[i][ml])

In [15]:
def my_add(w,k,v):
    if w.get(k) is None:
        w[k]={}
    
    w[k].append(v)

In [16]:
def rec_add(o, i, o2):
    k = o2[i]
    if o.get(k) is None:
        o[k]={}
        
    if i+1 >= len(o2):
        return o[k]
    else:
        return rec_add(o[k],i+1,o2)
    
def fill_for_goal(goal: int):
    G1 = s[goal]["Communities & Collections:"]
    G2 = s[goal]["Dewey Decimal Classification:"]
    G = list(G1)+list(G2)
    my_cats = {}
    for thing in G:
        parts = thing.split(' > ')
        rec_add(my_cats,0,parts)
    return my_cats

goals_categories = {}
for i in range(18):
    goals_categories[i] = fill_for_goal(i)


In [17]:
def rec_print(d: dict, i: str):
    next_i = i+"---"
    for k,v in d.items():
        print(i+k)
        rec_print(v,next_i)
def print_for_goal(goal: int):
    print(f"categories of papers of goal {goal}:")
    rec_print(goals_categories[goal],"")
print_for_goal(0)

categories of papers of goal 0:
03 Faculty of Economics
---Department of Informatics
---Department of Business Administration
---Department of Banking and Finance
---Center for Corporate Responsibility and Sustainability
---Department of Economics
---UBS Center for Economics in Society
07 Faculty of Science
---Institute of Molecular Cancer Research
---Department of Evolutionary Anthropology
---Zurich-Basel Plant Science Center
---Department of Chemistry
---Institute of Geography
---Institute of Molecular Life Sciences
---Physics Institute
---Department of Systematic and Evolutionary Botany
---Institute of Mathematics
---Department of Plant and Microbial Biology
---Institute of Pharmacology and Toxicology
---Institute for Computational Science
---Institute of Evolutionary Biology and Environmental Studies
06 Faculty of Arts
---Institute of Sociology
---Linguistic Research Infrastructure (LiRI)
---Institute of Psychology
---English Department
---Department of Communication and Media Rese

In [19]:
goals_cat_fname = "goals_categories.json"
with open(goals_cat_fname, "w") as f:
    json.dump(goals_categories, f, indent=3)