# 1. Initialize and import libraries

In [1]:
import pandas as pd
import numpy as np
import sys
import pickle

import warnings
warnings.filterwarnings('ignore') 

In [2]:
sys.path.insert(0, 'pmi_utils')

In [3]:
import shared_variables
from shared_variables import *

In [4]:
# `reload` is a builtin on Python 2 but lives in `importlib` on Python 3;
# fall back to the builtin when the import is unavailable.
try:
    from importlib import reload
except ImportError:
    pass

# Pick up edits made to the shared_variables module without restarting the kernel.
shared_variables = reload(shared_variables)
# Names bound earlier through `from shared_variables import *` still point at
# the pre-reload objects, so re-import them to refresh the notebook namespace.
from shared_variables import *

# 2. Split authors between mentors and mentees based on their experience
- The dataset has to be split into authors who are mentors and authors who are mentees.
- An author is considered a mentee if
 - their first publication appeared in 2017 or later, OR
 - they have at most 1 publication.

In [5]:
# Load the pickled per-author DBLP frame. It is used below to apply the rule:
# first publication >= 2017 or number of publications < 2 -> mentee, else mentor.
# The file is opened in binary mode (pickles are binary data) and inside a
# `with` block so the handle is closed as soon as loading finishes.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(f_dblp_by_author, 'rb') as dblp_file:
    df_json_dblp_by_author = pickle.load(dblp_file)

In [6]:
# Preview the first rows of the loaded per-author frame.
df_json_dblp_by_author.head()

Unnamed: 0,paper_id,papertype,title,authors,journal,booktitle,year,ee,url,author
0,journals/acta/Saxena96,article,Parallel Integer Sorting and Simulation Amongs...,[Sanjeev Saxena],Acta Inf.,,1996.0,https://doi.org/10.1007/BF03036466,db/journals/acta/acta33.html#Saxena96,Sanjeev Saxena
1,journals/acta/Simon83,article,Pattern Matching in Trees and Nets.,[Hans Ulrich Simon],Acta Inf.,,1983.0,https://doi.org/10.1007/BF01257084,db/journals/acta/acta20.html#Simon83,Hans Ulrich Simon
2,journals/acta/GoodmanS83,article,NP-complete Problems Simplified on Tree Schemas.,"[Nathan Goodman, Oded Shmueli]",Acta Inf.,,1983.0,https://doi.org/10.1007/BF00289414,db/journals/acta/acta20.html#GoodmanS83,Nathan Goodman
3,journals/acta/GoodmanS83,article,NP-complete Problems Simplified on Tree Schemas.,"[Nathan Goodman, Oded Shmueli]",Acta Inf.,,1983.0,https://doi.org/10.1007/BF00289414,db/journals/acta/acta20.html#GoodmanS83,Oded Shmueli
4,journals/acta/Blum82,article,On the Power of Chain Rules in Context Free Gr...,[Norbert Blum],Acta Inf.,,1982.0,https://doi.org/10.1007/BF00264161,db/journals/acta/acta17.html#Blum82,Norbert Blum


For each author, calculate the year of their first publication and their number of publications.

In [7]:
# One row per author: earliest publication year and count of paper ids.
records_by_author = df_json_dblp_by_author.groupby(lbl_author, as_index=False)
aut_pubs = records_by_author.agg({lbl_year: 'min', lbl_paper_id: 'count'})

In [8]:
# Preview the per-author aggregates (first publication year, paper count).
aut_pubs.head()

Unnamed: 0,author,paper_id,year
0,'Maseka Lesaoana,3,2001.0
1,(David) Jing Dai,2,2017.0
2,(Max) Zong-Ming Cheng,3,2009.0
3,(Zhou) Bryan Bai,3,2011.0
4,0018 Jien Kato,2,2017.0


Add a *role* column, which splits between mentor and mentee according to the above mentioned rule.

In [9]:
# Role rule: an author is a mentee when their first publication is from
# `first_year_cutoff` or later, OR when they have fewer than `min_mentor_pubs`
# publications; every other author is a mentor.
# Vectorised boolean masks replace the two `apply` passes (one of them
# row-wise), which are very slow on a frame with millions of authors.
# The resulting labels are identical: a missing year compares as False and
# therefore still falls through to the publication-count check.
first_year_cutoff = 2017
min_mentor_pubs = 2

started_recently = aut_pubs[lbl_year] >= first_year_cutoff
has_few_papers = aut_pubs[lbl_paper_id] < min_mentor_pubs

aut_pubs[lbl_role] = 'mentor'
aut_pubs.loc[started_recently | has_few_papers, lbl_role] = 'mentee'
aut_pubs.head()

Unnamed: 0,author,paper_id,year,role
0,'Maseka Lesaoana,3,2001.0,mentor
1,(David) Jing Dai,2,2017.0,mentee
2,(Max) Zong-Ming Cheng,3,2009.0,mentor
3,(Zhou) Bryan Bai,3,2011.0,mentor
4,0018 Jien Kato,2,2017.0,mentee


In [10]:
# Summary: number of authors assigned to each role (mentee vs. mentor).
aut_pubs.groupby(lbl_role).size()

role
mentee     147784
mentor    1945932
dtype: int64

In [11]:
# Keep only the author -> role mapping; the aggregate columns are not needed below.
role_columns = [lbl_author, lbl_role]
df_author_role = aut_pubs[role_columns]

# Read the author rank table from its comma-separated file.
df_author_ranks = pd.read_csv(f_author_ranks, sep=',')
df_author_ranks.head()

Unnamed: 0,author,cluster,num_pubs,pub_rate,years_exp,cite_rank
0,(David) Jing Dai,budapest_conference_workshop,0.014706,0.047619,0.018519,0.0
1,A Min Tjoa,analysis_methodology_modeling,0.014706,0.047619,0.018519,0.0
2,A Min Tjoa,data_metadata_database,0.088235,0.047619,0.388889,0.01449
3,A Min Tjoa,educational_elearning_education,0.029412,0.047619,0.203704,0.000479
4,A-Long Jin,channel_multiantenna_mimo,0.014706,0.047619,0.018519,0.0


In [12]:
# (rows, columns) of the author rank table.
df_author_ranks.shape

(310144, 6)

In [13]:
# Names of all authors labelled as mentors.
mentor_names = df_author_role[df_author_role[lbl_role] == 'mentor'][lbl_author]

# Keep only the rank rows of those authors and relabel the author column.
is_mentor_row = df_author_ranks[lbl_author].isin(mentor_names)
df_mentor_ranks = df_author_ranks[is_mentor_row].rename(columns={lbl_author: lbl_mentor})
df_mentor_ranks.head()

Unnamed: 0,mentor,cluster,num_pubs,pub_rate,years_exp,cite_rank
1,A Min Tjoa,analysis_methodology_modeling,0.014706,0.047619,0.018519,0.0
2,A Min Tjoa,data_metadata_database,0.088235,0.047619,0.388889,0.01449
3,A Min Tjoa,educational_elearning_education,0.029412,0.047619,0.203704,0.000479
4,A-Long Jin,channel_multiantenna_mimo,0.014706,0.047619,0.018519,0.0
5,A-Long Jin,services_infrastructure_middleware,0.014706,0.047619,0.018519,0.0


In [14]:
# Persist the mentor rank rows as a UTF-8, comma-separated file without the index column.
df_mentor_ranks.to_csv(f_mentor_ranks, sep=',', index=False, encoding='utf-8')

In [15]:
# Names of all authors labelled as mentees.
mentee_names = df_author_role[df_author_role[lbl_role] == 'mentee'][lbl_author]

# Keep only the rank rows of those authors and relabel the author column.
is_mentee_row = df_author_ranks[lbl_author].isin(mentee_names)
df_mentee_ranks = df_author_ranks[is_mentee_row].rename(columns={lbl_author: lbl_mentee})
df_mentee_ranks.head()

Unnamed: 0,mentee,cluster,num_pubs,pub_rate,years_exp,cite_rank
0,(David) Jing Dai,budapest_conference_workshop,0.014706,0.047619,0.018519,0.0
20,A. A. Louis Beex,whitening_prewhitening_subband,0.014706,0.047619,0.018519,0.0
25,A. A. Shpiganovich,circuit_lcvco_switchedcapacitor,0.014706,0.047619,0.018519,0.0
34,A. Abdul Khadar,scheduling_qos_routing,0.014706,0.047619,0.018519,0.0
35,A. Abdul Rahim,educational_elearning_education,0.014706,0.047619,0.018519,0.0


Mentees should share preferences over topics and over the type of mentors they'd like to have.

In [16]:
# For convenience, each row's cluster_pref score is assumed to be the plain
# average of that row's four KPI columns.
kpi_columns = [lbl_num_pubs, lbl_pub_rate, lbl_years_exp, lbl_rank]
df_mentee_ranks[lbl_cluster_pref] = df_mentee_ranks[kpi_columns].mean(axis=1)
df_mentee_ranks.head()

Unnamed: 0,mentee,cluster,num_pubs,pub_rate,years_exp,cite_rank,cluster_pref
0,(David) Jing Dai,budapest_conference_workshop,0.014706,0.047619,0.018519,0.0,0.020211
20,A. A. Louis Beex,whitening_prewhitening_subband,0.014706,0.047619,0.018519,0.0,0.020211
25,A. A. Shpiganovich,circuit_lcvco_switchedcapacitor,0.014706,0.047619,0.018519,0.0,0.020211
34,A. Abdul Khadar,scheduling_qos_routing,0.014706,0.047619,0.018519,0.0,0.020211
35,A. Abdul Rahim,educational_elearning_education,0.014706,0.047619,0.018519,0.0,0.020211


Delete the now-unused KPI columns (which measure the expertise of mentees); the columns that capture the mentees' preferences are assigned afterwards.

In [17]:
# Remove the KPI columns now that cluster_pref has been derived from them.
# The result is rebound instead of dropped in place, and labels that are
# already gone are ignored, so re-running this cell does not raise a KeyError.
df_mentee_ranks = df_mentee_ranks.drop([lbl_num_pubs,
                                        lbl_pub_rate,
                                        lbl_years_exp,
                                        lbl_rank], axis=1, errors='ignore')
df_mentee_ranks.head()

Unnamed: 0,mentee,cluster,cluster_pref
0,(David) Jing Dai,budapest_conference_workshop,0.020211
20,A. A. Louis Beex,whitening_prewhitening_subband,0.020211
25,A. A. Shpiganovich,circuit_lcvco_switchedcapacitor,0.020211
34,A. Abdul Khadar,scheduling_qos_routing,0.020211
35,A. Abdul Rahim,educational_elearning_education,0.020211


As a convention, for the simulation we generate random preferences.

In [18]:
# Dedicated, seeded generator: the simulated preferences are reproducible
# across kernel restarts and the global NumPy random state is left untouched.
pref_seed = 42  # arbitrary fixed value; change it to draw a different simulation
pref_rng = np.random.RandomState(pref_seed)

# One row per mentee. The explicit copy makes the frame independent of the
# selection it was derived from before new columns are added to it.
df_mentee_prefs = df_mentee_ranks[[lbl_mentee]].drop_duplicates().copy()

# Draw one uniform [0, 1) preference weight per mentee for each mentor KPI,
# in the same column order as before.
num_mentees = len(df_mentee_prefs)
for pref_column in (lbl_num_pubs_pref,
                    lbl_pub_rate_pref,
                    lbl_years_exp_pref,
                    lbl_rank_pref):
    df_mentee_prefs[pref_column] = pref_rng.rand(num_mentees)
df_mentee_prefs.head()

Unnamed: 0,mentee,num_pubs_pref,pub_rate_pref,years_exp_pref,cite_rank_pref
0,(David) Jing Dai,0.117496,0.777833,0.529473,0.711975
20,A. A. Louis Beex,0.428107,0.493248,0.332515,0.586327
25,A. A. Shpiganovich,0.182939,0.31404,0.985632,0.088795
34,A. Abdul Khadar,0.898308,0.397429,0.322616,0.109706
35,A. Abdul Rahim,0.814334,0.07057,0.109393,0.110909


Write the dataframes to CSV files so that they can be read by the next notebook.

In [19]:
# Both exports share the same CSV settings: comma-separated, no index column, UTF-8.
csv_settings = dict(sep=',', index=False, encoding='utf-8')

# Per-mentee KPI preferences, then per-(mentee, cluster) topic preferences.
df_mentee_prefs.to_csv(f_mentee_prefs_dblp_data, **csv_settings)
df_mentee_ranks.to_csv(f_mentee_topic_prefs_dblp_data, **csv_settings)