# 1. Initialize and import libraries

In [1]:
%matplotlib inline
import sys
import pandas as pd
import numpy as np
import string
import networkx as nx
import pickle
from sklearn.preprocessing import MinMaxScaler

In [2]:
#From within a python file, you can add pathto the default path by adding the following lines. Now the pmi_utils is added to the library path
sys.path.insert(0, 'pmi_utils')

In [3]:
import shared_variables
from shared_variables import *

In [4]:
shared_variables = reload(shared_variables)

In [5]:
#read the preprocessed dblp json file into a numpy dataframe
df_json_dblp = pd.read_json(f_json_dblp)
# the rows represent the number of articles
# the columns represent the features. Feature/column names are added later
df_json_dblp.shape

(6141934, 9)

In [6]:
#The dataframe is without column names.
#For the sake of order columns are given proper names
# The variable columns resides in shared_variables.py
df_json_dblp.columns = columns

# 2. Unpack Authors
- The set of authors is included in one feature of the dataframe.
- In order to assign a value to one author, the author string needs to be splitted.
- The function _unpack__authors_ is doing this

In [7]:
def unpack_authors(df):
    df_dblp_paper_author = []
    for idx, row in df.iterrows():
        authors = row[lbl_authors]
        for a in authors:
            a = a.strip()
            df_dblp_paper_author.append([row[lbl_paper_id], a])
    return pd.DataFrame(df_dblp_paper_author, columns=[lbl_paper_id, lbl_author])

In [8]:
#If the preprocessed file exists, take the file
# Otherwise unpack the authors from the original dataset
try:
    with open(f_json_by_author, 'r') as f:
        pass
    df_json_by_author = pd.read_csv(f_json_by_author)
except:
    df_json_by_author = unpack_authors(df_json_dblp)
    df_json_by_author.to_csv(f_json_by_author, sep=',', index=False, encoding='utf-8')

In [9]:
#The structure of the json by author. Please be advised that several authors for one paper result in several lines.
df_json_by_author.head()

Unnamed: 0,paper_id,author
0,journals/acta/Saxena96,Sanjeev Saxena
1,journals/acta/Simon83,Hans Ulrich Simon
2,journals/acta/GoodmanS83,Nathan Goodman
3,journals/acta/GoodmanS83,Oded Shmueli
4,journals/acta/Blum82,Norbert Blum


In [10]:
#The first number represent the number of article-author pairs
df_json_by_author.shape

(13855925, 2)

In [11]:
#Merge the full article set with the article-author pairs.
#Every line represents one article with one author.
#Articles with several authors are represented by several lines, for which only the author field is different
df_json_dblp_by_author = pd.merge(df_json_dblp, df_json_by_author, on=lbl_paper_id)
df_json_dblp_by_author.sort_values(by=lbl_paper_id).head()

Unnamed: 0,paper_id,papertype,title,authors,journal,booktitle,year,ee,url,author
5039065,books/acm/0082477,book,The no-nonsense guide to computing careers.,[Marc Rettig],,,1992.0,,,Marc Rettig
5039033,books/acm/kim95/AnnevelinkACFHK95,incollection,Object SQL - A Language for the Design and Imp...,"[Jurgen Annevelink, Rafiul Ahad, Amelia Carlso...",,Modern Database Systems,1995.0,,db/books/collections/kim95.html#AnnevelinkACFHK95,Amelia Carlson
5039036,books/acm/kim95/AnnevelinkACFHK95,incollection,Object SQL - A Language for the Design and Imp...,"[Jurgen Annevelink, Rafiul Ahad, Amelia Carlso...",,Modern Database Systems,1995.0,,db/books/collections/kim95.html#AnnevelinkACFHK95,William Kent
5039035,books/acm/kim95/AnnevelinkACFHK95,incollection,Object SQL - A Language for the Design and Imp...,"[Jurgen Annevelink, Rafiul Ahad, Amelia Carlso...",,Modern Database Systems,1995.0,,db/books/collections/kim95.html#AnnevelinkACFHK95,Michael L. Heytens
5039034,books/acm/kim95/AnnevelinkACFHK95,incollection,Object SQL - A Language for the Design and Imp...,"[Jurgen Annevelink, Rafiul Ahad, Amelia Carlso...",,Modern Database Systems,1995.0,,db/books/collections/kim95.html#AnnevelinkACFHK95,Daniel H. Fishman


pickle.dump(df_json_dblp_by_author, open(f_dblp_by_author, 'w'))

# 3. Authors' level of expertise by topic
- In order to assess the author's level of expertise per topic, the following KPIs are calculated for an author on a specific cluster
 1. Number of publications
 1. The average number of recent publications per year (for the last 3 years)
 1. Total years of experience
- *paper_clusters.csv* includes the mapping of which paper (on paper_id) is assigned which cluster

In [12]:
df_paper_clusters = pd.read_csv(f_paper_clusters, sep=',')
df_paper_clusters.head()

Unnamed: 0,paper_id,cluster
0,journals/corr/abs-1101-0906,network_multihop_wireless
1,journals/corr/SpithourakisPR16,semantic_textual_document
2,journals/corr/PromayonFBDFHLPSSSVCT13,interactive_interaction_mixedreality
3,journals/corr/abs-1711-08963,antichains_graphs_multigraphs
4,journals/corr/ChaliseSZK17,channel_multiantenna_mimo


In [13]:
# The assigned cluster is added to the general dataframe, of article-author pairs
df_json_dblp_by_author = pd.merge(df_json_dblp_by_author, df_paper_clusters, on=lbl_paper_id)
df_json_dblp_by_author.head()

Unnamed: 0,paper_id,papertype,title,authors,journal,booktitle,year,ee,url,author,cluster
0,journals/acta/HuangL87,article,The Derivation of Systolic Implementations of ...,"[Chua-Huang Huang, Christian Lengauer]",Acta Inf.,,1987.0,https://doi.org/10.1007/BF00282618,db/journals/acta/acta24.html#HuangL87,Chua-Huang Huang,hardware_multiprocessor_multicore
1,journals/acta/HuangL87,article,The Derivation of Systolic Implementations of ...,"[Chua-Huang Huang, Christian Lengauer]",Acta Inf.,,1987.0,https://doi.org/10.1007/BF00282618,db/journals/acta/acta24.html#HuangL87,Christian Lengauer,hardware_multiprocessor_multicore
2,journals/acta/FinkelC87,article,Fifo Nets Without Order Deadlock.,"[Alain Finkel, Annie Choquet]",Acta Inf.,,1988.0,https://doi.org/10.1007/BF00268843,db/journals/acta/acta25.html#FinkelC87,Alain Finkel,scheduling_qos_routing
3,journals/acta/FinkelC87,article,Fifo Nets Without Order Deadlock.,"[Alain Finkel, Annie Choquet]",Acta Inf.,,1988.0,https://doi.org/10.1007/BF00268843,db/journals/acta/acta25.html#FinkelC87,Annie Choquet,scheduling_qos_routing
4,journals/acta/Hesselink13,article,Verifying a simplification of mutual exclusion...,[Wim H. Hesselink],Acta Inf.,,2013.0,https://doi.org/10.1007/s00236-013-0178-2,db/journals/acta/acta50.html#Hesselink13,Wim H. Hesselink,infinitary_finitary_equational


In [14]:
# num_pubs includes the number of publication per author in a certain cluster
num_pubs = df_json_dblp_by_author.groupby([lbl_author, lbl_cluster], as_index=False)[lbl_paper_id].count()
num_pubs.rename(columns={lbl_paper_id: 'num_pubs'}, inplace=True)
num_pubs.head()

Unnamed: 0,author,cluster,num_pubs
0,(David) Jing Dai,budapest_conference_workshop,1
1,A Lun,llamada_restent_keinen,1
2,A Min Tjoa,analysis_methodology_modeling,1
3,A Min Tjoa,data_metadata_database,6
4,A Min Tjoa,educational_elearning_education,2


In [15]:
# Average year publication rate in the last 3 years
pub_rate = df_json_dblp_by_author.query('year >= 2015')\
                .groupby([lbl_author, lbl_cluster, lbl_year], as_index=False)[lbl_paper_id].count() \
                .groupby([lbl_author, lbl_cluster], as_index=False)[lbl_paper_id].sum()
pub_rate[lbl_paper_id] /= 3.
pub_rate.rename(columns={lbl_paper_id: 'pub_rate'}, inplace=True)
pub_rate.head()

Unnamed: 0,author,cluster,pub_rate
0,(David) Jing Dai,budapest_conference_workshop,0.333333
1,A Min Tjoa,analysis_methodology_modeling,0.333333
2,A Min Tjoa,data_metadata_database,0.333333
3,A Min Tjoa,educational_elearning_education,0.333333
4,A-Long Jin,channel_multiantenna_mimo,0.333333


In [16]:
# Years of experience
df_years_exp = df_json_dblp_by_author.groupby([lbl_author, lbl_cluster], as_index=False)[lbl_year].agg([min, max])
# Plus 1 because if used as factor then it annihilates the product
years_exp = (df_years_exp['max'] - df_years_exp['min'] + 1)
del df_years_exp
years_exp = years_exp.reset_index().rename(columns={0: lbl_paper_id})
years_exp.rename(columns={lbl_paper_id: 'years_exp'}, inplace=True)
years_exp.head()

Unnamed: 0,author,cluster,years_exp
0,(David) Jing Dai,budapest_conference_workshop,1.0
1,A Lun,llamada_restent_keinen,1.0
2,A Min Tjoa,analysis_methodology_modeling,1.0
3,A Min Tjoa,data_metadata_database,21.0
4,A Min Tjoa,educational_elearning_education,11.0


Now merge the above calculated KPIs into a unique DataFrame

In [17]:
aut_kpis = pd.merge(num_pubs, pub_rate, on=[lbl_author, lbl_cluster])
aut_kpis.head()

Unnamed: 0,author,cluster,num_pubs,pub_rate
0,(David) Jing Dai,budapest_conference_workshop,1,0.333333
1,A Min Tjoa,analysis_methodology_modeling,1,0.333333
2,A Min Tjoa,data_metadata_database,6,0.333333
3,A Min Tjoa,educational_elearning_education,2,0.333333
4,A-Long Jin,channel_multiantenna_mimo,1,0.333333


In [18]:
aut_kpis = pd.merge(aut_kpis, years_exp, on=[lbl_author, lbl_cluster])
aut_kpis.head()

Unnamed: 0,author,cluster,num_pubs,pub_rate,years_exp
0,(David) Jing Dai,budapest_conference_workshop,1,0.333333,1.0
1,A Min Tjoa,analysis_methodology_modeling,1,0.333333,1.0
2,A Min Tjoa,data_metadata_database,6,0.333333,21.0
3,A Min Tjoa,educational_elearning_education,2,0.333333,11.0
4,A-Long Jin,channel_multiantenna_mimo,1,0.333333,1.0


Now all the three KPIs are included in the aut_kpi dataframe

This has to be added to the pagerank result, which is calculated next.

# 4. Data Augmentation with data from Citation Network
- The idea is to create a directed network of papers that cite other papers.
- A network centrality could be applied in order to assess the importance of a paper.
- This paper importance could then be added to an author to have another KPI of an author.
- The chosen centrality is called PageRank (developed by Google to assess importance of a website, that refers to other websites).

In [19]:
json_cit_parts = range(4)
df_json_cit = pd.DataFrame()
for i in json_cit_parts:
    f_json_cit = path + f_dblp_ref  % i
    df_json_cit = pd.concat([df_json_cit,
                             pd.read_json(f_json_cit, lines=True)])

In [20]:
# the total number of citations are
df_json_cit.shape

(3079007, 7)

In [21]:
#important columns here are the title, id and references
# a reference is when the id of another paper is mentioned in the reference column of one paper
df_json_cit.head()

Unnamed: 0,abstract,authors,id,references,title,venue,year
0,The purpose of this study is to develop a lear...,"[Makoto Satoh, Ryo Muramatsu, Mizue Kayama, Ka...",00127ee2-cb05-48ce-bc49-9de556b93346,"[51c7e02e-f5ed-431a-8cf5-f761f266d4be, 69b625b...",Preliminary Design of a Network Protocol Learn...,international conference on human-computer int...,2013
1,This paper describes the design and implementa...,"[Gareth Beale, Graeme Earl]",001c58d3-26ad-46b3-ab3a-c1e557d16821,"[10482dd3-4642-4193-842f-85f3b70fcf65, 3133714...",A methodology for the physically accurate visu...,visual analytics science and technology,2011
2,This article applied GARCH model instead AR or...,"[Altaf Hossain, Faisal Zaman, Mohammed Nasser,...",001c8744-73c4-4b04-9364-22d31a10dbf1,"[2d84c0f2-e656-4ce7-b018-90eda1c132fe, a083a1b...","Comparison of GARCH, Neural Network and Suppor...",pattern recognition and machine intelligence,2009
3,,"[Jea-Bum Park, Byungmok Kim, Jian Shen, Sun-Yo...",00338203-9eb3-40c5-9f31-cbac73a519ec,"[8c78e4b0-632b-4293-b491-85b1976675e6, 9cdc54f...",Development of Remote Monitoring and Control D...,,2011
4,,"[Giovanna Guerrini, Isabella Merlo]",0040b022-1472-4f70-a753-74832df65266,,Reasonig about Set-Oriented Methods in Object ...,,1998


Filter by matching the title exactly with our dblp dataset

In [22]:
df_merged = pd.merge(df_json_cit, df_json_dblp_by_author, on='title')
df_merged.shape
df_merged.head()

Unnamed: 0,abstract,authors_x,id,references,title,venue,year_x,paper_id,papertype,authors_y,journal,booktitle,year_y,ee,url,author
0,,"[Giovanna Guerrini, Isabella Merlo]",0040b022-1472-4f70-a753-74832df65266,,Reasonig about Set-Oriented Methods in Object ...,,1998,conf/sebd/GuerriniM98,inproceedings,"[Giovanna Guerrini, Isabella Merlo]",,SEBD,1998.0,,db/conf/sebd/sebd1998.html#GuerriniM98,Giovanna Guerrini
1,,"[Giovanna Guerrini, Isabella Merlo]",0040b022-1472-4f70-a753-74832df65266,,Reasonig about Set-Oriented Methods in Object ...,,1998,conf/sebd/GuerriniM98,inproceedings,"[Giovanna Guerrini, Isabella Merlo]",,SEBD,1998.0,,db/conf/sebd/sebd1998.html#GuerriniM98,Isabella Merlo
2,,"[Jovan Dj. Golic, Guglielmo Morgari]",00638a94-23bf-4fa6-b5ce-40d799c65da7,,Vectorial fast correlation attacks.,,2004,journals/iacr/GolicM04,article,"[Jovan Dj. Golic, Guglielmo Morgari]",IACR Cryptology ePrint Archive,,2004.0,http://eprint.iacr.org/2004/247,db/journals/iacr/iacr2004.html#GolicM04,Jovan Dj. Golic
3,,"[Jovan Dj. Golic, Guglielmo Morgari]",00638a94-23bf-4fa6-b5ce-40d799c65da7,,Vectorial fast correlation attacks.,,2004,journals/iacr/GolicM04,article,"[Jovan Dj. Golic, Guglielmo Morgari]",IACR Cryptology ePrint Archive,,2004.0,http://eprint.iacr.org/2004/247,db/journals/iacr/iacr2004.html#GolicM04,Guglielmo Morgari
4,,"[Pranay Chaudhuri, Hussein Thompson]",00745041-3636-4d18-bbec-783c4278c40d,,A Self-Stabilizing Algorithm for Finding the C...,parallel and distributed processing techniques...,2003,conf/pdpta/ChaudhuriT03,inproceedings,"[Pranay Chaudhuri, Hussein Thompson]",,PDPTA,2003.0,,db/conf/pdpta/pdpta2003-3.html#ChaudhuriT03,Pranay Chaudhuri


# 5. Construct the citation network
- In order to calculate the Pagerank Centrality for every author (=node in network), we need the edges between them (=citations)
- For that, we first create an adjacency list.
    - The adjacency list exists of paper ID's, constructed from the rows in our dataframe.
    - If there is a citation from paper A to paper B, the adjacency list looks as follows: (id_A , id_B)
    - If there is a paper without citation, there is only its id, (id_A)

In [23]:
#if there are references, the adjacency list element is a tuple like (id_A, id_B)
#if there are no references, the adjacency list element is a sinlge value (id_A)
def adjacency_list(row):
    try:
        return row[lbl_id] + ',' + ','.join(row[lbl_references])
    except:
        return row[lbl_id]

In [24]:
adj_list = df_merged.apply(adjacency_list, axis=1)
adj_list[:10]

0    0040b022-1472-4f70-a753-74832df65266
1    0040b022-1472-4f70-a753-74832df65266
2    00638a94-23bf-4fa6-b5ce-40d799c65da7
3    00638a94-23bf-4fa6-b5ce-40d799c65da7
4    00745041-3636-4d18-bbec-783c4278c40d
5    00745041-3636-4d18-bbec-783c4278c40d
6    00dc2bba-3237-4d4e-b541-1205b97df981
7    00dc2bba-3237-4d4e-b541-1205b97df981
8    00dc2bba-3237-4d4e-b541-1205b97df981
9    00e02aeb-b424-4ca8-b3ca-6e18e322f79e
dtype: object

In [25]:
#transform the adjacency list to a network
G = nx.parse_adjlist(adj_list.values, nodetype=str, delimiter=',')

In [26]:
# The number of authors in the network
len(G.nodes())

599340

In [27]:
# The number of edges in the network
len(G.edges())

927016

In [28]:
# Calculate the pagerank centrality for every node (author)
pr = nx.pagerank(G, alpha=0.9, max_iter=10)

In [29]:
# Pagerank is a value between 1e-6 and 1e-2
pd.Series(pr).describe()

count    5.993400e+05
mean     1.668502e-06
std      1.646517e-05
min      2.067470e-07
25%      4.665886e-07
50%      6.368175e-07
75%      1.282507e-06
max      1.247568e-02
dtype: float64

In [30]:
# reset the index
# give the columns a name
df_ranks = pd.DataFrame(pd.Series(pr))
df_ranks.reset_index(inplace=True)
df_ranks.columns = [lbl_id, lbl_rank]
df_ranks.head()

Unnamed: 0,id,cite_rank
0,,0.01247568
1,00001301-0e56-4c9d-94aa-f776580aed87,5.632725e-07
2,00001d52-f2bd-4137-8d6b-b34e65a972bf,2.06747e-07
3,000020ae-0106-46dd-a3c9-610e3bd9e4e4,2.06747e-07
4,000024b3-0d71-41ff-a66d-12a72e47af9b,2.06747e-07


In [31]:
# merge this pagerank (or citerank) into our paper dataframe
df_merged_pr = pd.merge(df_merged, df_ranks, on=lbl_id)
df_merged_pr.sample(n=5)

Unnamed: 0,abstract,authors_x,id,references,title,venue,year_x,paper_id,papertype,authors_y,journal,booktitle,year_y,ee,url,author,cite_rank
604230,,[Geoffroy Peeters],ea40186d-fa9e-4b37-b221-7c8643cf9a2f,"[04653811-8458-4ae1-b3fb-ca3fdae90216, 1a52008...",Sequence Representation of Music Structure Usi...,international symposium/conference on music in...,2007,conf/ismir/Peeters07,inproceedings,[Geoffroy Peeters],,ISMIR,2007.0,http://ismir2007.ismir.net/proceedings/ISMIR20...,db/conf/ismir/ismir2007.html#Peeters07,Geoffroy Peeters,4.215206e-06
598911,,"[Gábor Korchmáros, Angelo Sonnino]",e1b37c62-fcc1-4533-9a94-ca785b95c4be,,Doubly transitive parabolic ovals in affine pl...,Ars Combinatoria,2012,journals/arscom/KorchmarosS12,article,"[Gábor Korchmáros, Angelo Sonnino]",Ars Comb.,,2012.0,,db/journals/arscom/arscom105.html#KorchmarosS12,Angelo Sonnino,2.06747e-07
191444,,"[Martin E. Hellman, Ehud D. Karnin, Justin M. ...",8164deb5-8262-4426-bdb3-55c6fd900c3d,,On the Necessity of Exhaustive Search for Syst...,international cryptology conference,1981,conf/crypto/HellmanKR81,inproceedings,"[Martin E. Hellman, Ehud D. Karnin, Justin M. ...",,CRYPTO,1981.0,,db/conf/crypto/crypto81.html#HellmanKR81,Justin M. Reyneri,2.06747e-07
688708,,"[Manfred Ueberall, Christoph Dorsch, Stefan Pf...",c0a203fa-3264-4f5f-8999-3647b29ce109,[860b0199-14c9-4634-9b66-b35a3ee1d523],E2E-Prozessverbesserung auf Betriebsmodellebene.,,2015,journals/wm/UeberallDPRW15,article,"[Manfred Ueberall, Christoph Dorsch, Stefan Pf...",Wirtschaftsinformatik & Management,,2015.0,https://doi.org/10.1007/s35764-015-0520-2,db/journals/wm/wm7.html#UeberallDPRW15,Christoph Dorsch,1.998885e-06
285708,Obtaining a good load balance is a significant...,"[Derek Groen, David Abou Chacra, Rupert W. Nas...",7331bb35-945b-4cea-87cb-453b00c46c9f,"[0cf8b89b-1372-4f75-b3c9-471f888c079e, 5830947...",Weighted Decomposition in High-Performance Lat...,"arXiv: Distributed, Parallel, and Cluster Comp...",2014,conf/easc/GroenCNJBC14,inproceedings,"[Derek Groen, David Abou Chacra, Rupert W. Nas...",,EASC,2014.0,https://doi.org/10.1007/978-3-319-15976-8_2,db/conf/easc/easc2014.html#GroenCNJBC14,Derek Groen,7.55448e-06


In [32]:
# because not all papers from arxiv where in our dataset
# the distribution of citerank is slightly different
df_merged_pr[lbl_rank].describe()

count    7.411400e+05
mean     2.830532e-06
std      5.048981e-06
min      2.067470e-07
25%      2.067470e-07
50%      5.632725e-07
75%      3.819895e-06
max      2.881912e-04
Name: cite_rank, dtype: float64

In [33]:
#save paper ranks to a file
df_paper_ranks = df_merged_pr[[lbl_paper_id, lbl_rank]]
df_paper_ranks.to_csv(f_paper_ranks, sep=',', index=False)

Merge paper ranks with their inferred topics from clusters.

In [34]:
df_paper_ranks_clusters = pd.merge(df_paper_ranks, df_paper_clusters, on=lbl_paper_id)
df_paper_ranks_clusters.head()

Unnamed: 0,paper_id,cite_rank,cluster
0,conf/interspeech/PorteleGEKTV03,2e-06,interactive_interaction_mixedreality
1,conf/interspeech/PorteleGEKTV03,2e-06,interactive_interaction_mixedreality
2,conf/interspeech/PorteleGEKTV03,2e-06,interactive_interaction_mixedreality
3,conf/interspeech/PorteleGEKTV03,2e-06,interactive_interaction_mixedreality
4,conf/interspeech/PorteleGEKTV03,2e-06,interactive_interaction_mixedreality


Now merge with unpacked authors in order to have authors' ranks.

In [35]:
df_json_by_author.head()

Unnamed: 0,paper_id,author
0,journals/acta/Saxena96,Sanjeev Saxena
1,journals/acta/Simon83,Hans Ulrich Simon
2,journals/acta/GoodmanS83,Nathan Goodman
3,journals/acta/GoodmanS83,Oded Shmueli
4,journals/acta/Blum82,Norbert Blum


In [36]:
df_authors_ranks = pd.merge(df_json_by_author, df_paper_ranks_clusters, on=lbl_paper_id)

In [37]:
df_authors_ranks.shape

(310148, 4)

In [38]:
df_authors_ranks.head()

Unnamed: 0,paper_id,author,cite_rank,cluster
0,journals/acta/Mahmoud04a,Hosam M. Mahmoud,2e-06,antichains_graphs_multigraphs
1,journals/acta/BoassonCN73,Luc Boasson,5e-06,llamada_restent_keinen
2,journals/acta/BoassonCN73,Luc Boasson,5e-06,llamada_restent_keinen
3,journals/acta/BoassonCN73,Luc Boasson,5e-06,llamada_restent_keinen
4,journals/acta/BoassonCN73,J. P. Crestin,5e-06,llamada_restent_keinen


In [39]:
# sum of the citeranks to have a total per author per cluster
df_authors_ranks = df_authors_ranks.groupby([lbl_author, lbl_cluster],
                                            as_index=False)[lbl_rank].sum()
df_authors_ranks.describe()

Unnamed: 0,cite_rank
count,69490.0
mean,1.461818e-05
std,6.184222e-05
min,2.06747e-07
25%,8.269881e-07
50%,2.269895e-06
75%,1.486213e-05
max,0.002158726


In [40]:
df_authors_ranks.head()

Unnamed: 0,author,cluster,cite_rank
0,A Min Tjoa,data_metadata_database,3.1e-05
1,A Min Tjoa,educational_elearning_education,1e-06
2,A Min Tjoa,interactive_interaction_mixedreality,6e-06
3,A. A. Kuandykov,software_componentbased_enterprise,2.7e-05
4,A. A. Maidabino,educational_elearning_education,4e-06


Merge with aut_kpis

In [41]:
aut_kpis = pd.merge(aut_kpis, df_authors_ranks, how='outer', on=[lbl_author, lbl_cluster])
aut_kpis.head()

Unnamed: 0,author,cluster,num_pubs,pub_rate,years_exp,cite_rank
0,(David) Jing Dai,budapest_conference_workshop,1.0,0.333333,1.0,
1,A Min Tjoa,analysis_methodology_modeling,1.0,0.333333,1.0,
2,A Min Tjoa,data_metadata_database,6.0,0.333333,21.0,3.1e-05
3,A Min Tjoa,educational_elearning_education,2.0,0.333333,11.0,1e-06
4,A-Long Jin,channel_multiantenna_mimo,1.0,0.333333,1.0,


In [42]:
aut_kpis.shape

(310144, 6)

Fill null values with zero as they have zero score.

In [43]:
aut_kpis.fillna(0, inplace=True)

Rescale ranks between 0 and 1

In [44]:
scaler = MinMaxScaler()
columns_to_scale = [col for col in aut_kpis.columns if col not in [lbl_author, lbl_cluster]]
aut_kpis[columns_to_scale] = scaler.fit_transform(aut_kpis[columns_to_scale])
aut_kpis.head()

Unnamed: 0,author,cluster,num_pubs,pub_rate,years_exp,cite_rank
0,(David) Jing Dai,budapest_conference_workshop,0.014706,0.047619,0.018519,0.0
1,A Min Tjoa,analysis_methodology_modeling,0.014706,0.047619,0.018519,0.0
2,A Min Tjoa,data_metadata_database,0.088235,0.047619,0.388889,0.01449
3,A Min Tjoa,educational_elearning_education,0.029412,0.047619,0.203704,0.000479
4,A-Long Jin,channel_multiantenna_mimo,0.014706,0.047619,0.018519,0.0


In [45]:
aut_kpis.describe()

Unnamed: 0,num_pubs,pub_rate,years_exp,cite_rank
count,310144.0,310144.0,310144.0,310144.0
mean,0.019449,0.045386,0.042286,0.001517
std,0.023385,0.029274,0.07498,0.013851
min,0.0,0.0,0.0,0.0
25%,0.014706,0.047619,0.018519,0.0
50%,0.014706,0.047619,0.018519,0.0
75%,0.014706,0.047619,0.018519,0.0
max,1.0,1.0,1.0,1.0


In [46]:
#Overview of the highest ten cited authors
aut_kpis[lbl_rank].sort_values(ascending=False).head(10)

273278    1.0
284529    1.0
306144    1.0
308445    1.0
273186    1.0
286723    1.0
270395    1.0
262474    1.0
285288    1.0
271750    1.0
Name: cite_rank, dtype: float64

In [47]:
#save the author ranks to a file
f_author_ranks = path + 'author_ranks.csv'
aut_kpis.to_csv(f_author_ranks, sep=',', index=False)