### Imports

In [None]:
import math
import json
import numpy as np
from timeit import default_timer as timer

from dblputils import IOUtil, CitationDataset
from dblpfeatures import DataUtil
from namedisambiguation_v1 import AuthorNameDisambiguation, MergeInfo

### Global variables

In [None]:
# Path names
# Every file produced by this notebook lives under ./datasets/<version>/
version = "v11_reduced_300k"
# full v11 dataset restricted to the reduced set of columns
LESS_COLS = "./datasets/v11/dblp_papers_v11_less-cols.txt"
# publications dataset for the selected version
DATA_NAME = f"./datasets/{version}/dblp_papers_{version}.txt"
# author-centric view of the same dataset
AUTHORS = f"./datasets/{version}/dblp_authors_{version}.txt"
# same layout as AUTHORS, but written after AuthorNameDisambiguation
MERGED = f"./datasets/{version}/dblp_authors_merged_{version}.txt"

# --- auxiliary data structures ---
# every publication id together with the list of its authors
CO_AUTH = f"./datasets/{version}/data_structures/pub_ids_auths.txt"
# every author id together with the author's name
AUTHS_DICT = f"./datasets/{version}/data_structures/auths_dict.txt"
# every year appearing in the dataset, each mapped to an ordinal number
YEARS_DICT = f"./datasets/{version}/data_structures/years_dict.txt"
# every raw venue appearing in the dataset, each mapped to an ordinal number
VENUES_DICT = f"./datasets/{version}/data_structures/venues_dict.txt"
# titles translated to English and cleaned with clear_text() (title i belongs to publication i)
TITLES = f"./datasets/{version}/data_structures/titles.txt"
# for each author, the list of the ids of all of his publications
PUBS = f"./datasets/{version}/data_structures/auth_pubs.txt"

# --- name disambiguation outputs ---
nd_version = "v1"
# one row per author: the author's id followed by the ids of the string-matching authors
STEP1_DUPL = f"./datasets/{version}/name_disambiguation/{nd_version}/step1_matches.txt"
# sparse author-publication matrix: a_ij=1 if author i wrote publication j, a_ij=0 otherwise
STEP2_Map = f"./datasets/{version}/name_disambiguation/{nd_version}/step2_Map.npz"
# sparse matrix of pairwise author similarities based on meta-path APAPA
STEP2_Mapapa = f"./datasets/{version}/name_disambiguation/{nd_version}/step2_Mapapa.npz"
# sparse matrix of pairwise author similarities based on meta-path APYPA
STEP2_May = f"./datasets/{version}/name_disambiguation/{nd_version}/step2_May.npz"
# sparse matrix of pairwise author similarities based on meta-path APTPA
STEP2_Mat = f"./datasets/{version}/name_disambiguation/{nd_version}/step2_Mat.npz"
# sparse matrix of pairwise author similarities based on meta-path AVA
STEP2_Mav = f"./datasets/{version}/name_disambiguation/{nd_version}/step2_Mav.npz"
# one row per author having at least one possible duplicate: the author's id followed by the
# similarity values (one for each of his possible duplicates)
STEP2_SIM = f"./datasets/{version}/name_disambiguation/{nd_version}/step2_similarities_uniform.txt"
# one row per author with matched duplicates: the author's id followed by the ids of the
# matched authors
STEP3_DUPL = f"./datasets/{version}/name_disambiguation/{nd_version}/step3_duplicates.txt"
# dictionary keyed by every author id of the complete authors dataset; the value is the same id
# when the author is nobody's duplicate, otherwise the id of the author he duplicates
MERGED_IDMAP = f"./datasets/{version}/name_disambiguation/{nd_version}/step3_idmap.txt"

In [None]:
# Every field of a json object loaded from the dataset
all_columns = np.asarray((
    "id", "title", "authors", "venue", "year", "keywords", "fos", "references",
    "n_citation", "page_start", "page_end", "doc_type", "lang", "publisher",
    "volume", "issue", "issn", "isbn", "doi", "pdf", "url", "abstract", "indexed_abstract",
))
# Reduced set of fields of a json object loaded from the dataset
reduced_columns = np.asarray(("id", "title", "authors", "venue", "year", "references"))
# Columns of the authors dataset
author_columns = np.asarray(("id", "name", "org", "pubs"))
# Inner keys of each entry of authors['pubs']
author_pub_columns = np.asarray(("id", "title", "year", "venue", "references"))
# total number of entries in the original dataset
dataset_lines = 1425148      #4107340
# total number of entries in the authors dataset
authors_lines = 1145383      #3655052

In [None]:
# Instances of some classes or methods
# These objects are shared by all the following cells.
ioutil = IOUtil()
datautil = DataUtil()
# both the disambiguation and the merge helpers receive the same IOUtil instance
authorNameDisambiguation = AuthorNameDisambiguation(ioutil)
mergeinfo = MergeInfo(ioutil)

### Operations on the original dataset

In [None]:
# Wrap the complete v11 dump; the second argument (4107340) is presumably its number of
# entries -- TODO confirm against CitationDataset
publications = CitationDataset("./datasets/v11/dblp_papers_v11.txt", 4107340)
# reducing the number of columns in the whole dataset: `reduced_columns` is passed as the
# column selection and LESS_COLS as the output file
ioutil.dumpLinesFromJson(publications, LESS_COLS, reduced_columns)

In [None]:
# Load the reduced-columns dataset (LESS_COLS) and wrap the result in a NumPy array
publications = np.array(ioutil.loadAsJson(LESS_COLS))
# selecting 300000 random publications from the dataset; DATA_NAME is passed as the output file
ioutil.selectRandomPro(publications, 300000, DATA_NAME)

In [None]:
# Keys handed to extractAuthorsDataset: author-level keys, publication-level keys and a
# nested publication key ("venue" -> "raw")
auth_keys = ["name", "org"]
pub_keys = ["title", "year"]
pub_nested = {"venue":"raw"}
# Authors dict structure:
# dict := {<author_id>:<author_info>}
#   <author_id> := str
#   <author_info> := {"id":str, "name":str, "org":str, "pubs":list<pub_info>}
#     <pub_info> := {"id":str, "title":str, "year":str, "venue":str, "references":list<str>}
# extracting a dataset representation which focuses on authors instead of publications
publications = CitationDataset(DATA_NAME, dataset_lines)
authors = ioutil.extractAuthorsDataset(publications, auth_keys, pub_keys, pub_nested)
# only the <author_info> values are dumped (the dict keys are dropped); AUTHORS is the target file
ioutil.dumpLinesFromJson(list(authors.values()), AUTHORS)

In [None]:
# Build the auxiliary data structures from the authors and publications datasets.
# Each call made with save=True is given the file name of the corresponding structure.
authors = CitationDataset(AUTHORS, authors_lines)
publications = CitationDataset(DATA_NAME, dataset_lines)
datautil.extractAuthors(authors, save=True, filename=AUTHS_DICT)
datautil.extractCoAuthors(publications, save=True, filename=CO_AUTH)
datautil.extractPublications(authors, save=True, filename=PUBS)
datautil.extractYears(publications, save=True, filename=YEARS_DICT)
datautil.extractVenues(publications, save=True, filename=VENUES_DICT)
# titles are not saved here: the returned `specials` are post-processed by the next cell
# NOTE(review): presumably extractTitles also fills datautil.titles_list, which the next
# cell reads -- confirm in DataUtil
specials = datautil.extractTitles(publications)

In [None]:
# referring to "300k" dataset
# Post-processing of the titles in `specials`:
#   1) detect the language of each title with langid;
#   2) clean the English titles with datautil.clear_title();
#   3) translate the remaining titles with googletrans, then clean them;
#   4) copy the processed titles back over the "+" placeholders of datautil.titles_list;
#   5) write datautil.titles_list to TITLES, one title per line.
import langid
from googletrans import Translator
lang_predictor = langid.classify
lang_translator = Translator()

# Number of leading non-English titles that step 3 skips.
# NOTE(review): 3622 looks like the resume point of an interrupted run; the skipped titles are
# neither translated nor cleaned here, so on a fresh run this should presumably be 0 -- confirm.
already_translated = 3622

# 1) positions (in `specials`) of the titles that are not classified as English
not_en = []
for i, title in enumerate(specials):
    if lang_predictor(title)[0] != 'en':
        not_en.append(i)
    if i % 10 == 0:
        # "\r" rewinds to the start of the line so the progress is overwritten in place
        print("\rClassified: %d/%d" % (i, len(specials)), end='', flush=True)
# the total is printed from len() so that an empty list does not raise a NameError
print("\rClassified: %d/%d" % (len(specials), len(specials)))

# positions of the English titles: every position that is not in `not_en`
not_en_positions = set(not_en)
en = [i for i in range(len(specials)) if i not in not_en_positions]

# 2) English titles only need to be cleaned
for done, i in enumerate(en):
    specials[i] = datautil.clear_title(specials[i])
    if done % 100 == 0:
        print("\rCorrected: %d/%d" % (done, len(en)), end='', flush=True)
print("\rCorrected: %d/%d" % (len(en), len(en)))

# 3) the other titles are translated first, then cleaned
for done, i in enumerate(not_en):
    if done < already_translated:
        continue
    specials[i] = datautil.clear_title(lang_translator.translate(specials[i]).text)
    if done % 100 == 0:
        print("\rCorrected: %d/%d" % (done, len(not_en)), end='', flush=True)
print("\rCorrected: %d/%d" % (len(not_en), len(not_en)))

# 4) each "+" placeholder takes the next processed title, in order
total_titles = len(datautil.titles_list)
copied = 0
for i, title in enumerate(datautil.titles_list):
    if title == "+":
        datautil.titles_list[i] = specials[copied]
        copied += 1
    if i % 10000 == 0:
        print("\rSecured: %d/%d" % (i, total_titles), end='', flush=True)
print("\rSecured: %d/%d" % (total_titles, total_titles))

# 5) one title per line, without a trailing newline after the last one
with open(TITLES, 'w', encoding='utf-8') as file:
    for i, title in enumerate(datautil.titles_list):
        file.write("%s%s" % (title, "" if i == total_titles - 1 else "\n"))
        if i % 10000 == 0:
            print("\rLines written: %d/%d" % (i, total_titles), end='', flush=True)
    print("\rLines written: %d/%d" % (total_titles, total_titles))

Read everything from here:

In [None]:
# Reading all the dictionaries
# (the file names are the ones the previous cells wrote the data structures to)
datautil.loadAttributes(authors=AUTHS_DICT, coAuthors=CO_AUTH, publications=PUBS, 
                        venues=VENUES_DICT, years=YEARS_DICT, titles=TITLES)
# Preparing convenient data structures for the next part
# NOTE(review): presumably these build datautil.auth_id_ind / datautil.pub_id_ind, which the
# disambiguation cells below read -- confirm in DataUtil
datautil.computeAuthorsInd()
datautil.computePublicationsInd()

### Name Disambiguation Problem
Source: *Ranking-Based Name Matching for Author Disambiguation in Bibliographic Data*<br>
Adaptation: 3 steps -> r-step, p-step, m-step

***r-step***: *Improving the recall; given an author ID, one should find as many potential duplicates as possible via String-based consideration.*

In [None]:
# Step 1: for each author find all possible duplicates using RatcliffObershelp similarity as criterion. A list 
# of names is also stored: if a name is exactly equal to one already seen (homonym), then the iteration is 
# avoided and the list of possible duplicates is copied from the homonym.
# NOTE(review): start=527540 is presumably the author index from which an interrupted run was
# resumed, and "+11681 mins" the time already spent -- confirm before a fresh run
authorNameDisambiguation.r_step(datautil.authors_dict, STEP1_DUPL, start=527540) # +11681 mins

***p-step***: *Improving the precision; once the potential duplicates of each candidate author name have been found, a critical task is to infer the real author entity shared by one or more author IDs. To accomplish this task, the idea of meta-paths is used [AVA (same venue), APAPA (co-authors of my co-authors), APTPA (title similarities), APYPA (same year)].*

In [None]:
# Step 2: builds the author-publication matrix and the four meta-path similarity matrices,
# then combines them into one similarity value per possible duplicate. Every call is given the
# file name of the structure it computes.
# Step 2.0: computing the adjacency matrix Map (author-publication) where a_ij=1 if the author i wrote the 
# publication j, a_ij=0 otherwise
authorNameDisambiguation.computeMap(datautil.auth_id_ind, datautil.pub_id_ind, datautil.publications_dict, filename=STEP2_Map)
# Step 2.1: computing meta-path APAPA for each author (applying l2-norm)
authorNameDisambiguation.computeMapapa(filename=STEP2_Mapapa)
# Step 2.2: computing meta-path APYPA for each author (applying l2-norm)
# (the publications path constant is DATA_NAME; the previous spelling `DATA_Name` was undefined)
authorNameDisambiguation.computeMay(CitationDataset(DATA_NAME, dataset_lines), datautil.pub_id_ind, datautil.years_dict,
                                    filename=STEP2_May)
# Step 2.3: computing meta-path APTPA for each author (applying l2-norm)
authorNameDisambiguation.computeMat(datautil.titles_list, filename=STEP2_Mat)
# Step 2.4: computing meta-path AVA for each author (applying PathSim)
authorNameDisambiguation.computeMav(CitationDataset(AUTHORS, authors_lines), datautil.venues_dict, datautil.auth_id_ind,
                                    filename=STEP2_Mav)
# Step 2.5: computing the sum of similarity value of each meta-path for each author that has at least one possible duplicate
# (the first argument holds four equal weights of 1/4, presumably one per meta-path -- confirm)
authorNameDisambiguation.p_step(np.array([1/4]*4), datautil.auth_id_ind, filename=STEP2_SIM)

In [None]:
# RUN FROM HERE
authorNameDisambiguation.loadPossibleDuplicates(STEP1_DUPL)
authorNameDisambiguation.loadMatrices(Mapapa=STEP2_Mapapa, May=STEP2_May, Mat=STEP2_Mat, Mav=STEP2_Mav)
authorNameDisambiguation.p_step(np.array([1/4]*4), datautil.auth_id_ind, filename=STEP2_SIM)

***m-step***: *Merging the resulting duplicates, based on the similarity values obtained in the r-step and the p-step.*

In [None]:
# Step 3: finding the actual duplicates for each author that passed the first step, i.e. the possible duplicate authors 
# with a similarity value (meta-path based) over a threshold
# NOTE(review): the result is presumably exposed as authorNameDisambiguation.duplicates, which
# the next cell reads -- confirm in AuthorNameDisambiguation
authorNameDisambiguation.m_step(filename=STEP3_DUPL)

In [None]:
# saving the dataset with the merged authors
authors = ioutil.loadAsJson(AUTHORS)
tmp = dict()
for entry in authors:
    tmp[entry['id']] = entry
authors = tmp
del tmp
mergeinfo.computeMergeDataset(authors, authorNameDisambiguation.duplicates, filename=MERGED)
mergeinfo.computeIdMap(CitationDataset(AUTHORS, authors_lines), authorNameDisambiguation.duplicates, filename=MERGED_IDMAP)