In [None]:
import json
from tqdm import tqdm
from pathlib import Path
from collections import defaultdict
# Directory containing the dataset files that the cells below open via Path(DATA_PATH, ...).
DATA_PATH = Path("data", "bibliometric_dataset")

In [None]:
# Parse one JSON document per line of the publication dump.
with open(Path(DATA_PATH, "ai_dataset.jsonl"), 'r') as f:
    pubs = [json.loads(record) for record in tqdm(f, desc="Reading dataset ...")]

# Same line-by-line parsing for the matched person records.
with open(Path(DATA_PATH, "persons_matched.jsonl"), 'r') as f:
    authors = [json.loads(record) for record in tqdm(f, desc="Loading authors ...")]

# Keep only the records whose field is present and not None.
schol_affi = [author for author in authors if author.get('schol_affiliations') is not None]
dblp_affi = [author for author in authors if author.get('note') is not None]

print(f"Having {len(schol_affi)} authors with schol affiliations and {len(dblp_affi)} authors with dblp affiliations.")

In [None]:
# Input publication dump, citation CSV, and the temporary output file.
source = Path(DATA_PATH, 'ai_dataset.jsonl')
citation_path = Path(DATA_PATH, 'semantic_citations.csv')
target = Path(DATA_PATH, 'ai_dataset.tmp.jsonl')

# Group the second CSV column by the first one (both whitespace-stripped).
citations = defaultdict(list)
with open(citation_path, "r", encoding="utf-8") as f:
    for row in tqdm(f, desc="\tReading semantic_scholar citations ..."):
        fields = row.split(",")
        lookup_key = fields[0].strip()
        citations[lookup_key].append(fields[1].strip())

# Re-read the publications so this cell does not depend on an earlier one.
with open(source, "r", encoding="utf-8") as f:
    pubs = [json.loads(record) for record in f]

# Position of each publication that carries a 'corpusid'.
# NOTE(review): this mapping is not read anywhere in the visible cells — confirm whether
# the citation lookup below was meant to go through it.
corpusids = {pub['corpusid']: position for position, pub in enumerate(pubs) if 'corpusid' in pub}

# Attach the collected citation list to every publication whose 'key' has one,
# then write all publications (changed or not) to the temporary file.
with open(target, 'w', encoding="utf-8") as out_f:
    for pub in tqdm(pubs, desc="\tAdding citations ..."):
        found = citations.get(pub['key'])
        if found is not None:
            pub['citations'] = found
        out_f.write(json.dumps(pub) + "\n")

In [None]:
## Change DBLP affiliations to list

# Load every person record, number it by its line position and make sure
# the 'author' field is always a list.
persons = []
with open(Path(DATA_PATH, "persons_matched.jsonl"), 'r') as f:
    for idx, raw in enumerate(tqdm(f, desc="Loading authors ...")):
        person = json.loads(raw)
        person['id'] = idx
        names = person['author']
        if isinstance(names, str):
            person['author'] = [names]
        persons.append(person)

# Map both the first and the second ';'-separated column of each row to the
# first column, so either spelling resolves to the same value.
country_domains = dict()
with open(Path(DATA_PATH, "country_domains.csv"), 'r') as f:
    for row in f:
        fields = [part.strip() for part in row.split(";")]
        canonical = fields[0]
        country_domains[canonical] = canonical
        country_domains[fields[1]] = canonical
        

In [None]:
# Every country value matched for at least one person.
all_countries = set()
for person in tqdm(persons, desc="Changing affiliations ..."):
    note = person.get('note')
    if note is None:
        # Missing or empty note: the record is left untouched.
        continue

    # A note is either a single string or a list; non-string list items are dropped.
    if isinstance(note, str):
        affi = [note]
    elif isinstance(note, list):
        affi = [entry for entry in note if isinstance(entry, str)]
    else:
        affi = []
    person['dblp_affiliations'] = affi

    # The text after the last comma of an affiliation is looked up in country_domains.
    countries = set()
    for affiliation in affi:
        tail = affiliation.split(",")[-1].strip()
        if tail in country_domains:
            countries.add(country_domains[tail])
    all_countries.update(countries)
    person['countries'] = list(countries)

    # The raw note is replaced by the two derived fields.
    del person['note']


In [None]:
# Serialise every person as one JSON document per line into the temporary file.
with open(Path(DATA_PATH, "persons_matched.tmp.jsonl"), 'w') as f:
    f.writelines(
        json.dumps(person) + "\n"
        for person in tqdm(persons, desc="Writing authors ...")
    )

In [None]:
import requests

def get_geocodes(address, timeout=10, user_agent="bibliometric-dataset-notebook"):
    """Look up an address with the Nominatim (OpenStreetMap) search API.

    Parameters
    ----------
    address : str
        Free-text query sent as the ``q`` parameter.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` raises,
        so a stalled connection cannot hang a long geocoding loop.
    user_agent : str, optional
        Value of the ``User-Agent`` header. The Nominatim usage policy asks
        clients to identify themselves instead of using a library default.

    Returns
    -------
    tuple
        ``(display_name, lat, lon)`` of the first hit, with the values taken
        unchanged from the JSON response. ``(address, None, None)`` when the
        service returns no hit or a response that cannot be interpreted, so
        callers can always unpack three values and test ``lat is None``.

    Raises
    ------
    requests.RequestException
        Connection problems and timeouts are not caught here.
    """
    endpoint = "https://nominatim.openstreetmap.org/search"
    params = {
        "q": address,
        "format": "json",
        "limit": 1
    }
    response = requests.get(
        endpoint,
        params=params,
        headers={"User-Agent": user_agent},
        timeout=timeout,
    )
    try:
        data = response.json()
        if len(data) == 0:
            return (address, None, None)
        top_hit = data[0]
        return (top_hit["display_name"], top_hit["lat"], top_hit["lon"])
    except (ValueError, KeyError, IndexError, TypeError):
        # Non-JSON body (e.g. an HTML error page) or an unexpected JSON shape.
        # Show what came back, then report "no result" with the same tuple
        # shape as the success path instead of returning None.
        print(response.text)
        return (address, None, None)
        

In [None]:
# Rebuild the lookup from the CSV so this cell can run on its own: both
# ';'-separated columns of a row map to the value of the first column.
country_domains = dict()
with open(Path(DATA_PATH, "country_domains.csv"), 'r') as f:
    for row in f:
        fields = [part.strip() for part in row.split(";")]
        canonical = fields[0]
        country_domains[canonical] = canonical
        country_domains[fields[1]] = canonical

# Distinct first-column values; each one is geocoded once.
country_long = set(country_domains.values())
country_to_coord = {}
for country in country_long:
    display_name, lat, lon = get_geocodes(country)
    print(f"{country}: {lat}, {lon}")
    country_to_coord[country] = (lat, lon)

# Store the mapping as a single JSON object.
with open(Path(DATA_PATH, "country_to_coord.json"), 'w') as f:
    f.write(json.dumps(country_to_coord))

In [None]:
# One affiliation string per line, surrounding whitespace removed.
affiliations = []
with open(Path(DATA_PATH, "dblp_affiliations.txt"), 'r') as f:
    for line in f:
        affiliations.append(line.strip())

# Index of the first affiliation to process.
# NOTE(review): presumably the resume point of an earlier interrupted run, since
# the output file is opened in append mode — confirm, and set to 0 for a full run.
RESUME_FROM = 14896

with open(Path(DATA_PATH, "dblp_affiliations_coords.jsonl"), 'a') as f:
    for aff in tqdm(affiliations[RESUME_FROM:], desc="Getting geocodes ..."):
        # Query the first comma-separated segment and, if that yields no
        # coordinates, fall back to the second one. Each candidate is looked
        # up exactly once.
        candidates = [part.strip() for part in aff.split(",")][:2]
        for query in candidates:
            if not query:
                # An empty segment cannot be geocoded; skip the request.
                continue
            name, lat, lon = get_geocodes(query)
            if lat is not None:
                f.write(json.dumps({"affiliation": aff, "openstreetname": name, "lat": lat, "lon": lon}) + "\n")
                break

In [None]:
# Change Deepwalk embeddings to jsonl

# Parse "<node id> <v1> <v2> ..." lines into {node id: [floats]}.
graph_embeddings = {}
with open(Path(DATA_PATH, "graph.embeddings"), 'r') as f:
    for line_no, line in enumerate(tqdm(f, desc="Loading embeddings ...")):
        # split() without an argument tolerates trailing spaces and the newline.
        curr = line.split()
        if not curr:
            continue
        if line_no == 0 and len(curr) == 2:
            # DeepWalk writes a "<number of nodes> <dimensions>" header first;
            # it must not be stored as a node with a one-element embedding.
            continue
        graph_embeddings[int(curr[0])] = [float(x) for x in curr[1:]]

# Map each person id to the first entry of its 'author' field.
authors = {}
with open(Path(DATA_PATH, "persons_matched.jsonl"), 'r') as f:
    for line in tqdm(f, desc="Loading authors ..."):
        curr = json.loads(line)
        authors[curr['id']] = curr['author'][0]

# Write one JSON document per embedding whose id has a known author and
# count the embeddings that have none.
counter = 0
with open(Path(DATA_PATH, "graph_embeddings.jsonl"), 'w') as f:
    for idx, emb in tqdm(graph_embeddings.items(), desc="Writing embeddings ..."):
        if idx in authors:
            f.write(json.dumps({"author": authors[idx], "embedding": emb}) + "\n")
        else:
            counter += 1

# Number of embeddings without a matching author (shown as the cell output).
counter

In [51]:
## Analyse saved hypothesis matrices

import numpy as np
from scipy.sparse import csr_matrix
from pathlib import Path

# Separate constant for the matrix directory: rebinding DATA_PATH here would make
# every earlier cell read from data/matrices if it is re-run after this one.
MATRICES_PATH = Path("data", "matrices")

# The .npy file holds a single pickled object that .item() unwraps before it is
# converted to CSR.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only use it on files produced by this project.
hypothesis = csr_matrix(np.load(Path(MATRICES_PATH, "prev_co_authors.npy"), allow_pickle=True).item())


In [52]:
# Dense view of the first row of the matrix.
hypothesis[0].toarray()

array([[nan, nan, nan, ..., nan, nan, nan]])