### Imports

In [None]:
import pandas as pd
import numpy as np
from scipy import sparse
from scipy.stats import zscore
import matplotlib.pyplot as plt
import seaborn as sn

from dblputils import IOUtil, CitationDataset
from dblpfeatures import DataUtil
from namedisambiguation_v2 import AuthorNameDisambiguation, MergeInfo
from citerank import CitationGraph, CiteRank

### Global variables

Path names:

In [None]:
# ---------------------------------------------------------------- datasets
# tag of the dataset release; every path below is derived from it
version = "v11_reduced"
# private prefixes, only used to assemble the paths of this cell
_ROOT = f"./datasets/{version}"
_STRUCT_DIR = f"{_ROOT}/data_structures"

# publication-centred dataset
DATA_NAME = f"{_ROOT}/dblp_papers_{version}.txt"
# same data, but one record per author instead of one per publication
AUTHORS = f"{_ROOT}/dblp_authors_{version}.txt"
# the authors dataset after AuthorNameDisambiguation merged the duplicates
MERGED = f"{_ROOT}/dblp_authors_merged_{version}.txt"

# --------------------------------------------------------- data structures
# publication ids, each with the list of its authors
CO_AUTH = f"{_STRUCT_DIR}/pub_ids_auths.txt"
# author ids and names
AUTHS_DICT = f"{_STRUCT_DIR}/auths_dict.txt"
# every year found in the dataset, mapped to an ordinal number
YEARS_DICT = f"{_STRUCT_DIR}/years_dict.txt"
# every raw venue found in the dataset, mapped to an ordinal number
VENUES_DICT = f"{_STRUCT_DIR}/venues_dict.txt"
# titles translated to English and cleaned with clear_text() (title i belongs to publication i)
TITLES = f"{_STRUCT_DIR}/titles.txt"
# for each author, the ids of all of his publications
PUBS = f"{_STRUCT_DIR}/auth_pubs.txt"

# ------------------------------------------------ author name disambiguation
nd_version = "v2"
_ND_DIR = f"{_ROOT}/name_disambiguation/{nd_version}"
# sparse author-publication matrix: a_ij=1 if author i wrote publication j, 0 otherwise
STEP1_Map = f"{_ND_DIR}/step1_Map.npz"
# sparse matrix of pairwise author similarities along the meta-path APAPA
STEP1_Mapapa = f"{_ND_DIR}/step1_Mapapa.npz"
# sparse matrix of pairwise author similarities along the meta-path APYPA
STEP1_May = f"{_ND_DIR}/step1_May.npz"
# sparse matrix of pairwise author similarities along the meta-path APTPA
STEP1_Mat = f"{_ND_DIR}/step1_Mat.npz"
# sparse matrix of pairwise author similarities along the meta-path AVA
STEP1_Mav = f"{_ND_DIR}/step1_Mav.npz"
# diagonal of the dot product between Mav and Mav.T
STEP1_Mav_Diag = f"{_ND_DIR}/step1_Mav_Diagonal.npy"
# sparse matrix with the final similarity value of each pair of authors
STEP1_SIM = f"{_ND_DIR}/step1_Sims.npz"
# one key per author with at least one possible duplicate (result of step 1); the value
# is a list of <id,similarity_value> pairs, one per possible duplicate of the key
STEP2_SIM = f"{_ND_DIR}/step2_similarities.txt"
# one row per author with matched duplicates: the author's id followed by the ids of
# the matched authors
STEP3_DUPL = f"{_ND_DIR}/step3_duplicates.txt"
# dictionary keyed by every author id of the complete authors dataset; the value is the
# same id when the author is nobody's duplicate, otherwise the id of the author he duplicates
MERGED_IDMAP = f"{_ND_DIR}/step3_idmap.txt"

# ---------------------------------------------------------------- CiteRank
cg_version = "v1"
_CG_DIR = f"{_ROOT}/citations_graph/{cg_version}"
# a[i,j]=1 if a link exists from i to j, 0 otherwise (for each i and j)
ADJ_MATRIX = f"{_CG_DIR}/adjacency_matrix.npz"
# a[i,j]=k where k is the number of collaborations between i and j (for each i and j)
COL_MATRIX = f"{_CG_DIR}/collaboration_matrix.npz"
# a[i,j]=k where k is the number of existing links from i to j (for each i and j)
CIT_MATRIX = f"{_CG_DIR}/citation_matrix.npz"
# files handed to citegraph.weight_collaborations / weight_citations2 / weight_2loops /
# weight_orgs respectively
Wcol_MATRIX = f"{_CG_DIR}/Wco_matrix.npz"
Wcit_MATRIX = f"{_CG_DIR}/Wcit_matrix.npz"
Wlp_MATRIX = f"{_CG_DIR}/Wlp_matrix.npz"
Worg_MATRIX = f"{_CG_DIR}/Worg_matrix.npz"
# folder of the combined weight matrices ("wam_(...).npz")
W_MATRIX_LOC = f"{_CG_DIR}/weight_matrices/"
# folder of the saved rank vectors
RANKS_LOC = f"{_CG_DIR}/ranks/"

In [None]:
# Every field a JSON object loaded from the dataset can carry
all_columns = np.array([
    "id", "title", "authors", "venue", "year", "keywords", "fos", "references",
    "n_citation", "page_start", "page_end", "doc_type", "lang", "publisher",
    "volume", "issue", "issn", "isbn", "doi", "pdf", "url", "abstract",
    "indexed_abstract",
])
# The subset of fields kept in the reduced dataset
reduced_columns = np.array([
    "id", "title", "authors", "venue", "year", "references",
])
# Fields of a record of the authors dataset
author_columns = np.array(["id", "name", "org", "pubs"])
# Keys of each entry stored under authors['pubs']
author_pub_columns = np.array(["id", "title", "year", "venue", "references"])

# Number of entries of the publications dataset in use
# (4107340 is the count used for the complete v11 dump)
dataset_lines = 1_425_148
# Number of entries of the authors dataset in use
# (3655052 — presumably the count for the complete dump, TODO confirm)
authors_lines = 1_145_383

In [None]:
# Instances of some classes
# Shared helper objects used by every cell below. All classes come from the
# project modules imported at the top of the notebook; their internals are not
# visible here.
ioutil = IOUtil()
datautil = DataUtil()
# both disambiguation helpers are handed the same IOUtil instance
authorNameDisambiguation = AuthorNameDisambiguation(ioutil)
mergeinfo = MergeInfo(ioutil)
citegraph = CitationGraph()
citerank = CiteRank()

### Operations on the original dataset

In [None]:
# Wrap the complete v11 dump; 4107340 is passed as its number of entries
publications = CitationDataset("./datasets/v11/dblp_papers_v11.txt", 4107340)
# reducing the number of columns in the whole dataset
# (presumably only the fields listed in reduced_columns are written to the
# "less-cols" file — TODO confirm against IOUtil.dumpLinesFromJson)
ioutil.dumpLinesFromJson(publications, "./datasets/v11/dblp_papers_v11_less-cols.txt", reduced_columns)

In [None]:
# Load the column-reduced dump produced by the previous cell into an array
# NOTE(review): this holds the whole file in memory — confirm it fits
publications = np.array(ioutil.loadAsJson("./datasets/v11/dblp_papers_v11_less-cols.txt"))
# selecting 300000 random publications from the dataset
# (the result is written to DATA_NAME; the exact sampling rule lives in
# IOUtil.selectRandomPro and is not visible here)
ioutil.selectRandomPro(publications, 300000, DATA_NAME)

In [None]:
# Derive a dataset centred on authors rather than on publications.
#
# Layout of the dictionary returned by extractAuthorsDataset:
#   dict          := {<author_id>: <author_info>}
#   <author_id>   := str
#   <author_info> := {"id": str, "name": str, "org": str, "pubs": list<pub_info>}
#   <pub_info>    := {"id": str, "title": str, "year": str, "venue": str, "references": list<str>}
publications = CitationDataset(DATA_NAME, dataset_lines)

# fields copied from each author entry / from each publication entry
auth_keys, pub_keys = ["name", "org"], ["title", "year"]
# nested publication field: the "raw" key of "venue"
pub_nested = {"venue": "raw"}

authors = ioutil.extractAuthorsDataset(publications, auth_keys, pub_keys, pub_nested)
# one record per author, without the dictionary keys
author_records = list(authors.values())
ioutil.dumpLinesFromJson(author_records, AUTHORS)

In [None]:
# Fill the lookup structures kept inside `datautil`; the calls given
# save=True/filename are presumably also persisted to that file.
# NOTE(review): the sequence looks order-sensitive (e.g. computeAuthorsInd
# directly after extractAuthors) — check dblpfeatures.DataUtil before reordering.
authors = CitationDataset(AUTHORS, authors_lines)
publications = CitationDataset(DATA_NAME, dataset_lines)
datautil.extractAuthors(authors, save=True, filename=AUTHS_DICT)
datautil.computeAuthorsInd()
datautil.extractCoAuthors(publications, save=True, filename=CO_AUTH)
datautil.computePublicationsInd()
datautil.extractPublications(authors, save=True, filename=PUBS)
datautil.extractYears(publications, save=True, filename=YEARS_DICT)
datautil.extractVenues(publications, save=True, filename=VENUES_DICT)
# `specials` is consumed by the next cell, which cleans/translates these titles
# and copies them back over the "+" placeholders of datautil.titles_list
specials = datautil.extractTitles(publications)

In [None]:
# Clean every title in `specials` (returned by datautil.extractTitles):
#   1. classify the language of each title;
#   2. clean the English titles directly;
#   3. translate the non-English titles, then clean them;
#   4. copy the results over the "+" placeholders of datautil.titles_list;
#   5. write the complete list to TITLES, one title per line.
import langid
from googletrans import Translator
lang_predictor = langid.classify
lang_translator = Translator()

# Number of leading non-English titles that are skipped (neither translated
# nor cleaned). The value refers to the reduced dataset and may change if the
# dataset changes.
# NOTE(review): looks like a resume point left over from an interrupted run;
# set it to 0 to process every title on a fresh run — confirm before changing.
resume_from = 3622

# 1. language detection: split the indices of `specials` in two groups
n_specials = len(specials)
en = []
not_en = []
for i, title in enumerate(specials):
    if lang_predictor(title)[0] != 'en':
        not_en.append(i)
    else:
        en.append(i)
    if i % 10 == 0:
        # "\r" rewinds the line so that the counter is updated in place
        print("\rClassified: %d/%d" % (i, n_specials), end='', flush=True)
# the totals come from len() so that an empty input cannot raise a NameError
print("\rClassified: %d/%d" % (n_specials, n_specials))

# 2. English titles only need cleaning
for done, i in enumerate(en):
    specials[i] = datautil.clear_title(specials[i])
    if done % 100 == 0:
        print("\rCorrected: %d/%d" % (done, len(en)), end='', flush=True)
print("\rCorrected: %d/%d" % (len(en), len(en)))

# 3. the other titles are translated first (the first `resume_from` are skipped)
for done, i in enumerate(not_en):
    if done < resume_from:
        continue
    specials[i] = datautil.clear_title(lang_translator.translate(specials[i]).text)
    if done % 100 == 0:
        print("\rCorrected: %d/%d" % (done, len(not_en)), end='', flush=True)
print("\rCorrected: %d/%d" % (len(not_en), len(not_en)))

# 4. every "+" placeholder receives the next entry of `specials`, in order
n_titles = len(datautil.titles_list)
copied = 0
for i, title in enumerate(datautil.titles_list):
    if title == "+":
        datautil.titles_list[i] = specials[copied]
        copied += 1
    if i % 10000 == 0:
        print("\rSecured: %d/%d" % (i, n_titles), end='', flush=True)
print("\rSecured: %d/%d" % (n_titles, n_titles))

# 5. one title per line, without a newline after the last one
with open(TITLES, 'w', encoding='utf-8') as out:
    for i, title in enumerate(datautil.titles_list):
        out.write("%s%s" % (title, "" if i == n_titles - 1 else "\n"))
        if i % 10000 == 0:
            print("\rLines written: %d/%d" % (i, n_titles), end='', flush=True)
    print("\rLines written: %d/%d" % (n_titles, n_titles))

### Name Disambiguation Problem
Source: Ranking-Based Name Matching for Author Disambiguation in Bibliographic Data<br>
Adaptation: 3 steps -> r-step, p-step, m-step

***r-step***: *Improving the recall. To accomplish this task, the idea of meta-paths is used [AVA (same venue), APAPA (co-authors of my co-authors), APTPA (title similarities), APYPA (same year)].*

In [None]:
# r-step of the name disambiguation. The calls are kept in the original order;
# each one receives the file dedicated to its result through `filename`.

# Step 1.0: computing the adjacency matrix Map (author-publication) where a_ij=1 if the author i wrote the
# publication j, a_ij=0 otherwise
authorNameDisambiguation.computeMap(datautil.auth_id_ind, datautil.pub_id_ind, datautil.publications_dict, filename=STEP1_Map)
# Step 1.1: computing meta-path APAPA for each author (applying l2-norm)
authorNameDisambiguation.computeMapapa(filename=STEP1_Mapapa)
# Step 1.2: computing meta-path APYPA for each author (applying l2-norm)
# (the dataset path is DATA_NAME: the previous spelling "DATA_Name" is not defined
# anywhere and raised a NameError)
authorNameDisambiguation.computeMay(CitationDataset(DATA_NAME, dataset_lines), datautil.pub_id_ind, datautil.years_dict,
                                    filename=STEP1_May)
# Step 1.3: computing meta-path APTPA for each author (applying l2-norm)
authorNameDisambiguation.computeMat(datautil.titles_list, filename=STEP1_Mat)
# Step 1.4: computing meta-path AVA for each author (applying PathSim)
authorNameDisambiguation.computeMav(CitationDataset(AUTHORS, authors_lines), datautil.venues_dict, datautil.auth_id_ind,
                                    filename=STEP1_Mav)
# Step 1.5: computing the diagonal of the dot product between Mav and Mav.T
authorNameDisambiguation.computeMavDiagonal(filename=STEP1_Mav_Diag)
# Step 1.6: computing the similarity value based on meta-paths for each pair of authors;
# the four entries of the array are equal (1/4 each)
authorNameDisambiguation.r_step(np.array([1/4]*4), filename=STEP1_SIM)

***p-step***: *Improving the precision; given an author ID and the previous results, its real duplicates are found through a string-based similarity.*

In [None]:
# Step 2: for each author that came out of step 1 (r-step) with at least one potential duplicate,
# compute the string based similarity value with his potential duplicates.
# STEP2_SIM is the file dedicated to these <id,similarity_value> lists.
authorNameDisambiguation.p_step(datautil.authors_dict, STEP2_SIM)

***m-step***: *Merging the resulting duplicates, based on the similarity values obtained in the r-step and the p-step.*

In [None]:
# Step 3: finding the actual duplicates for each author that passed the first step, i.e. the possible duplicate authors 
# with a similarity value (string based) over a threshold
# (the threshold itself is not set here; it is presumably a default of m_step — TODO confirm)
authorNameDisambiguation.m_step(filename=STEP3_DUPL)

In [None]:
# Save the dataset in which duplicated authors are merged.
# The author records are first indexed by their own id.
authors = {record['id']: record for record in ioutil.loadAsJson(AUTHORS)}

found_duplicates = authorNameDisambiguation.duplicates
mergeinfo.computeMergeDataset(authors, datautil.authors_pubs, found_duplicates, filename=MERGED)
# id map built from the unmerged authors dataset and the same duplicates
mergeinfo.computeIdMap(CitationDataset(AUTHORS, authors_lines), found_duplicates, filename=MERGED_IDMAP)

### Building the citation graph

In [None]:
# Rebuild the author-id -> row-index map on the merged authors dataset:
# each author receives the position at which it is read.
authors = CitationDataset(MERGED, 1142584)
datautil.auth_id_ind = {}
i = 0  # number of authors examined so far (stays 0 if nothing is read)
for position, author in enumerate(authors):
    datautil.auth_id_ind[author['id']] = position
    if position % 10000 == 0:
        print("\rExamined: %d/%d" % (position, len(authors)), end='', flush=True)
    i = position + 1
print("\rExamined: %d/%d" % (i, len(authors)))

In [None]:
# Adjacency, collaboration and citation matrices, built in this order from
# the same inputs; each builder receives its own output file.
for build_matrix, target in (
    (citegraph.adjacencyMatrix, ADJ_MATRIX),
    (citegraph.collaborationMatrix, COL_MATRIX),
    (citegraph.citationMatrix, CIT_MATRIX),
):
    build_matrix(authors, datautil.auth_id_ind, datautil.publications_dict, mergeinfo.idMap,
                 filename=target)

In [None]:
# Share of merged authors whose 'org' field is the empty string, shown as a pie chart.
authors = CitationDataset(MERGED, 1142584)
orgs_list = np.array([None]*len(authors))
i = 0  # number of authors read so far
for position, author in enumerate(authors):
    orgs_list[position] = author['org']
    if position % 10000 == 0:
        print("\rCompleted: %d/%d" % (position, len(authors)), end='', flush=True)
    i = position + 1
print("\rCompleted: %d/%d" % (i, len(authors)))

# fractions of missing (empty string) and valid organizations, in this order
n_orgs = orgs_list.shape[0]
n_missing = np.count_nonzero(orgs_list == '')
n_valid = np.count_nonzero(orgs_list != '')
orgs_values = [n_missing/n_orgs, n_valid/n_orgs]

plt.figure(figsize=(6,6))
# the values are already fractions, hence normalize=False
slice_labels = ["%.2f%s" % (share*100, "%") for share in orgs_values]
plt.pie(orgs_values, normalize=False, labels=slice_labels, explode=(0.0, 0.1))
plt.legend(['Missing values', 'Valid values'], loc='upper left')
plt.show()

In [None]:
# One weight matrix per criterion (collaborations, citations, 2-loops,
# organizations), each given its own file.
# NOTE(review): these presumably rely on the matrices built earlier by
# citegraph (adjacency/collaboration/citation) — confirm in citerank.CitationGraph.
citegraph.weight_collaborations(filename=Wcol_MATRIX)
citegraph.weight_citations2(filename=Wcit_MATRIX)
citegraph.weight_2loops(filename=Wlp_MATRIX)
# `authors` is the merged authors dataset; sigma is fixed at 0.75 here
citegraph.weight_orgs(authors, sigma=0.75, filename=Worg_MATRIX)

In [None]:
# Combined weight matrices for five beta vectors. Position k of a vector
# weighs, in order: collaboration, citation, 2-loops, organization.
# The arrays are created with dtype=float (i.e. float64): the former
# `.astype(np.float)` relied on an alias that NumPy deprecated in 1.20 and
# removed in 1.24, where it raises AttributeError.
W_col_only = citegraph.sum_weight_matrices(np.array([1, 0, 0, 0], dtype=float), filename=W_MATRIX_LOC+"wam_(1,0,0,0).npz")
W_cit_only = citegraph.sum_weight_matrices(np.array([0, 1, 0, 0], dtype=float), filename=W_MATRIX_LOC+"wam_(0,1,0,0).npz")
W_lp_only  = citegraph.sum_weight_matrices(np.array([0, 0, 1, 0], dtype=float), filename=W_MATRIX_LOC+"wam_(0,0,1,0).npz")
W_org_only = citegraph.sum_weight_matrices(np.array([0, 0, 0, 1], dtype=float), filename=W_MATRIX_LOC+"wam_(0,0,0,1).npz")
# uniform distribution: every criterion weighs 1/4
W_uniform = citegraph.sum_weight_matrices(np.array([1/4]*4), filename=W_MATRIX_LOC+"wam_(0.25,0.25,0.25,0.25).npz")

### CiteRank

**Testing values for alpha**

In [None]:
# Baseline: ranking computed on the plain citation matrix
r = citerank.pagerank(citegraph.Cit, tol=1.0e-8)
# Weighted variants: the citation matrix is multiplied element-wise by each
# combined weight matrix before ranking (same tolerance, same order as before)
r_uniform, r_col_only, r_cit_only, r_lp_only, r_org_only = (
    citerank.pagerank(weights.multiply(citegraph.Cit), tol=1.0e-8)
    for weights in (W_uniform, W_col_only, W_cit_only, W_lp_only, W_org_only)
)

In [None]:
# Scatter plots of the baseline ranking (x axis) against each weighted ranking
# (y axis): one overview panel with all variants, then one panel per variant.

# largest rank over all vectors: common upper bound of both axes
max_val = max(ranks.max() for ranks in (r, r_uniform, r_col_only, r_cit_only, r_lp_only, r_org_only))
# distance between two consecutive ticks
delta = 0.0001

_, axes = plt.subplots(3, 2, figsize=(17,20))
ticks = np.arange(0.0, max_val+delta, delta)

# (ranking, label in the overview legend, title of the dedicated panel)
variants = (
    (r_uniform, "Uniform distribution", "CiteRank with uniform distribution of beta values"),
    (r_col_only, "Collaboration weights only", "CiteRank with only collaboration weight beta set to 1.0"),
    (r_cit_only, "Citation weights only", "CiteRank with only citation weight beta set to 1.0"),
    (r_lp_only, "2-loops weights only", "CiteRank with only 2-loops weight beta set to 1.0"),
    (r_org_only, "Organization weights only", "CiteRank with only organization weight beta set to 1.0"),
)

# overview panel (top left): every variant, slightly more opaque, with a legend
overview = axes[0][0]
for ranks, legend_label, _panel_title in variants:
    overview.plot(r, ranks, 'o', markersize=2, alpha=0.7, label=legend_label)
overview.set_xticks(ticks)
overview.set_yticks(ticks)
overview.set_title("Relation between PageRank and CiteRank with different beta values")
overview.set_xlabel("PageRank")
overview.set_ylabel("CiteRank")
overview.legend()
overview.grid()

# remaining panels, row by row: one variant each
for ax, (ranks, _legend_label, panel_title) in zip(axes.ravel()[1:], variants):
    ax.plot(r, ranks, 'o', markersize=2, alpha=0.5)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_title(panel_title)
    ax.set_xlabel("PageRank")
    ax.set_ylabel("CiteRank")
    ax.grid()

plt.show()

In [None]:
# Compare four hand-picked beta vectors against the baseline ranking `r`.
#
# r0..r3 (and W3, needed by the cell that saves the final ranks) were not
# defined by any cell of the notebook, so a fresh run stopped here with a
# NameError. They are rebuilt below from the beta values shown in the plot
# titles, with the same calls used for the single-criterion matrices.
# NOTE(review): the values are reconstructed from the titles — confirm they are
# the ones originally used. Passing `filename` follows the other
# sum_weight_matrices calls; it adds one "wam_(...).npz" file per vector.
# beta order: collaboration, citation, 2-loops, organization
W0 = citegraph.sum_weight_matrices(np.array([0.12, 0.28, 0.4, 0.2]),
                                   filename=W_MATRIX_LOC+"wam_(0.12,0.28,0.4,0.2).npz")
W1 = citegraph.sum_weight_matrices(np.array([0.12, 0.23, 0.45, 0.2]),
                                   filename=W_MATRIX_LOC+"wam_(0.12,0.23,0.45,0.2).npz")
W2 = citegraph.sum_weight_matrices(np.array([0.05, 0.15, 0.6, 0.2]),
                                   filename=W_MATRIX_LOC+"wam_(0.05,0.15,0.6,0.2).npz")
W3 = citegraph.sum_weight_matrices(np.array([0.05, 0.3, 0.6, 0.05]),
                                   filename=W_MATRIX_LOC+"wam_(0.05,0.3,0.6,0.05).npz")
r0, r1, r2, r3 = (
    citerank.pagerank(weights.multiply(citegraph.Cit), tol=1.0e-8)
    for weights in (W0, W1, W2, W3)
)

_, axes = plt.subplots(2, 2, figsize=(15,15))
# same tick grid on both axes (max_val and delta come from the previous plotting cell)
ticks = np.arange(0.0, max_val+delta, delta)

# one panel per candidate, filled row by row
candidates = (
    (r0, "beta = [0.12, 0.28, 0.4, 0.2]"),
    (r1, "beta = [0.12, 0.23, 0.45, 0.2]"),
    (r2, "beta = [0.05, 0.15, 0.6, 0.2]"),
    (r3, "beta = [0.05, 0.3, 0.6, 0.05]"),
)
for ax, (ranks, panel_title) in zip(axes.ravel(), candidates):
    ax.plot(r, ranks, 'o', markersize=2, alpha=0.5)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_title(panel_title)
    ax.set_xlabel("PageRank")
    ax.set_ylabel("CiteRank")
    ax.grid()

plt.show()

In [None]:
# Keep the candidate number 3 as the final configuration: its weight matrix
# becomes W and its ranking becomes cr (the CiteRank vector).
W, cr = W3, r3
sparse.save_npz(W_MATRIX_LOC + "wam_(0.05,0.3,0.6,0.05).npz", W)
# the final ranking and the baseline ranking are stored side by side
for rank_file, rank_vector in (("citerank.npy", cr), ("pagerank.npy", r)):
    np.save(RANKS_LOC + rank_file, rank_vector)

**Computing the score**

In [None]:
# Collect the author ids in the order in which the merged dataset is read
authors = CitationDataset(MERGED, 1142584)
ids = np.array([None]*len(authors))
i = 0  # number of authors read so far
for position, author in enumerate(authors):
    ids[position] = author['id']
    if position % 10000 == 0:
        print("\rExamined: %d/%d" % (position, len(authors)), end='', flush=True)
    i = position + 1
print("\rExamined: %d/%d" % (i, len(authors)))

# positions of every author in the two rankings, plus their differences
both_ranks = np.array([r, cr])
ranks_r, ranks_cr, diffs = citerank.sorted_ranks(both_ranks, ids, diffs=True)
# column sums of the citation matrix, flattened to one dimension
received_cits = np.asarray(citegraph.Cit.sum(axis=0)).ravel()

In [None]:
# Four candidate scores, each rescaled with citerank.minMaxScaler.

# standard scores of the two rankings, shared by the formulas below
zscore_r, zscore_cr = zscore(r), zscore(cr)

# difference of the base-10 logarithms
log_score = citerank.minMaxScaler(np.log10(r) - np.log10(cr))

# difference of the positions in the two rankings
pos_score = citerank.minMaxScaler(diffs)

# difference of the standard scores
z_score = citerank.minMaxScaler(zscore_cr - zscore_r)

# original formula: np.log10(np.sqrt(np.abs(zscore_r*zscore_cr)))+(1-ranks_r/len(ranks_r))+(1-ranks_cr/len(ranks_cr))
# (1/2)*log10(x) replaces log10(sqrt(x)) and the two position terms are merged
magnitude_term = (1/2)*np.log10(np.abs(zscore_r*zscore_cr))
position_term = 2-(ranks_r+ranks_cr)/len(ranks_r)
s = citerank.minMaxScaler(magnitude_term+position_term)

# discarded alternative:
#s2 = citerank.minMaxScaler(np.log10(np.abs(zscore_cr-zscore_r))+(2-(ranks_r+ranks_cr)/len(ranks_r)))

In [None]:
# Each score against the number of received citations. Every panel shows the
# points, the mean (red), mean +/- one standard deviation (dashed green) and
# an annotation with the number of points that fall inside that band.
_, axes = plt.subplots(2, 2, figsize=(15,15))

# horizontal extent of the reference lines
x_span = [0, received_cits.max()]

# (score, style of the mean line, gap between the upper band and the arrow tip,
#  height of the annotation text, title) - panels are filled row by row
panels = (
    (log_score, 'r', 0.02, 0.3, "Score = log10(PageRank) - log10(CiteRank)"),
    (pos_score, 'r', 0.02, 0.8, "Score = pos(PageRank) - pos(CiteRank)"),
    (z_score, 'r--', 0.01, 0.5, "Score = zscore(CiteRank) - zscore(PageRank)"),
    (s, 'r--', 0.05, 0.6, "Score = f(PageRank, CiteRank)"),
)

for ax, (score, mean_style, arrow_gap, text_y, panel_title) in zip(axes.ravel(), panels):
    mean, std = score.mean(), score.std()
    upper, lower = mean + std, mean - std

    ax.plot(received_cits, score, 'o', alpha=0.5)
    ax.plot(x_span, [mean, mean], mean_style)
    ax.plot(x_span, [upper, upper], 'g--')
    ax.plot(x_span, [lower, lower], 'g--')

    # points within one standard deviation of the mean (bounds included)
    n_inside = np.count_nonzero((score >= lower) & (score <= upper))
    ax.annotate("%d points" % n_inside, xy=(50000, upper - arrow_gap), xytext=(50000, text_y),
                arrowprops={'arrowstyle':'->'})

    ax.set_title(panel_title)
    ax.set_xlabel("n° of received citations")
    ax.set_ylabel("Score")

plt.show()

In [None]:
# Summary table: rankings and scores of nine selected authors plus the first,
# second and last author of the CiteRank ranking.

# ids of the selected authors ("Docente Unical 0" .. "Docente Unical 8")
prof_unical_ids = ['3821842', '184075056', '2071564828', '2075460159', '2294106506',
                   '2143117249', '2163236697', '273425128', '1976489361']
# position of each id inside `ids`; an id that is not present raises IndexError
prof_unical = [np.where(ids == author_id)[0][0] for author_id in prof_unical_ids]

# authors ordered by decreasing CiteRank (sorted once and reused)
by_citerank = np.argsort(-cr)
first_cls, second_cls, last_cls = by_citerank[0], by_citerank[1], by_citerank[-1]

selected = prof_unical + [first_cls, second_cls, last_cls]
row_labels = (['Docente Unical %d' % k for k in range(len(prof_unical))]
              + ['First classified', 'Second classified', 'Last classified'])

# one column per metric, in the order of `columns` below
metrics = (r, cr, ranks_r, ranks_cr, log_score, pos_score, z_score, s)
df = pd.DataFrame([[metric[author] for metric in metrics] for author in selected],
                  index=row_labels,
                  columns=['PageRank', 'CiteRank', 'Pos_PageRank',
                           'Pos_CiteRank', 'Log_S', 'Pos_S', 'Z_S', 'S'])
df.head(12)

**Searching correlations**

In [None]:
# Gather, for every author: number of publications, received citations, mean
# "tolerance" and score, into one DataFrame.
s = np.load(RANKS_LOC+"scores.npy")
citegraph.loadSelfMatrices(Cit=CIT_MATRIX, Col=COL_MATRIX, A=ADJ_MATRIX)
citegraph.computePubsNum(CitationDataset(MERGED, 1142584))
# column sums of the citation matrix
received_cits = np.array(citegraph.Cit.sum(axis=0)).flatten()

# entries of A that are non-zero in both directions (element-wise A * A.T)
A_loops = citegraph.A.multiply(citegraph.A.T)
# per row: stored entries of A_loops over stored entries of A (an empty row divides by 1)
links_per_row = citegraph.A.getnnz(axis=1)
attitudes = A_loops.getnnz(axis=1)/np.where(links_per_row==0.0, 1.0, links_per_row)
# row i of A_loops scaled by (1 - attitudes[i]), then averaged over the row
D = sparse.spdiags(attitudes, 0, *A_loops.shape, format="csr")
Tol = A_loops - (D * A_loops)
mean_tolerances = np.array(Tol.mean(axis=1)).flatten()

# one column per quantity, in the order of `columns`
data = np.hstack([values.reshape((-1,1))
                  for values in (citegraph.pubs_num, received_cits, mean_tolerances, s)])
columns = ['Pubs', 'Cits', 'Tol', 'Score']
df = pd.DataFrame(data, columns=columns)

In [None]:
# Correlation matrices of the DataFrame, one heatmap per coefficient
_, axes = plt.subplots(2, 1, figsize=(9,15))
heatmaps = (
    ('pearson', "Correlation matrix (Pearson's coefficient)"),
    ('spearman', "Correlation matrix (Spearman's coefficient)"),
)
for ax, (method, heatmap_title) in zip(axes, heatmaps):
    # reversed "Blues" colormap
    colors = sn.color_palette("Blues", as_cmap=True).reversed()
    sn.heatmap(df.corr(method=method), annot=True, cmap=colors, ax=ax)
    # keep the row labels horizontal
    for tick in ax.get_yticklabels():
        tick.set_rotation(0)
    ax.set_title(heatmap_title)
plt.show()