# 1. Initialize and import libraries

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import sys
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore') 

In [2]:
sys.path.insert(0, 'pmi_utils')

In [3]:
import shared_variables
from shared_variables import *

In [4]:
shared_variables = reload(shared_variables)

# 2. Load mentor and mentee data

In [5]:
# Pick a subset of mentees to analyze from the dblp authors
def load_from_author_data():
    df_mentee_prefs = pd.read_csv(f_mentee_prefs_dblp_data, sep=',')
    df_mentee_topic_prefs = pd.read_csv(f_mentee_topic_prefs_dblp_data)
    return df_mentee_prefs, df_mentee_topic_prefs

In [6]:
def load_input_mentees(from_file=True):
    if from_file:
        try:
            # Load mentees preferences over mentors' kpis
            df_mentee_prefs = pd.read_excel(f_mentee_prefs_excel, sheetname=[mentee_prefs_page,
                                                                             mentee_topic_prefs_page])
            df_mentee_topic_prefs = df_mentee_prefs[mentee_topic_prefs_page]
            df_mentee_prefs = df_mentee_prefs[mentee_prefs_page]
            
            return df_mentee_prefs, df_mentee_topic_prefs
        except:
            return load_from_author_data()
    return load_from_author_data()

In [7]:
df_mentee_prefs, df_mentee_topic_prefs = load_input_mentees()

In [8]:
df_mentee_prefs.head()

Unnamed: 0,mentee,num_pubs_pref,pub_rate_pref,years_exp_pref,cite_rank_pref
0,A. Einstein,0.011097,0.619665,0.398425,0.496162
1,J. Smith,0.866261,0.726914,0.877906,0.553403
2,E. Priesley,0.13649,0.148647,0.594188,0.465706
3,B. Gates,0.359991,0.036693,0.868773,0.928226
4,J. Caesar,0.510581,0.761034,0.125807,0.059358


In [9]:
df_mentee_topic_prefs.head()

Unnamed: 0,mentee,cluster,cluster_pref
0,A. Einstein,learning_learner_metalearning,0.740571
1,A. Einstein,secure_encryption_authentication,0.290575
2,A. Einstein,robot_robotic_endeffectors,0.119066
3,J. Smith,heuristic_algorithm_algorithms,0.703593
4,J. Smith,specification_declarative_abstractions,0.89168


In [10]:
def as_probability_distribution(values):
    values_sum = sum(values)
    return [1. * v / values_sum for v in values]

In [11]:
# Rescale values on the 4 columns as probability distribution
involved_cols = [lbl_num_pubs_pref,
                 lbl_pub_rate_pref,
                 lbl_years_exp_pref,
                 lbl_rank_pref]
df_mentee_prefs[involved_cols] = df_mentee_prefs[involved_cols] \
                                    .apply(as_probability_distribution, axis=1)

In [12]:
df_mentee_prefs.head()

Unnamed: 0,mentee,num_pubs_pref,pub_rate_pref,years_exp_pref,cite_rank_pref
0,A. Einstein,0.007275,0.406245,0.261203,0.325278
1,J. Smith,0.286416,0.240343,0.290266,0.182974
2,E. Priesley,0.101477,0.110515,0.441765,0.346242
3,B. Gates,0.164103,0.016727,0.396034,0.423136
4,J. Caesar,0.350486,0.522409,0.086359,0.040746


In [13]:
df_mentee_topic_prefs.head()

Unnamed: 0,mentee,cluster,cluster_pref
0,A. Einstein,learning_learner_metalearning,0.740571
1,A. Einstein,secure_encryption_authentication,0.290575
2,A. Einstein,robot_robotic_endeffectors,0.119066
3,J. Smith,heuristic_algorithm_algorithms,0.703593
4,J. Smith,specification_declarative_abstractions,0.89168


Now, construct a graph that connects one mentee to his favorite topics and to mentors, in order to find the one that best matches his preferences.

# 3. Mentor-mentee recommendation through network analysis

In [14]:
# To test the algorithm, fix a mentee with at least 2 topics of interest
grouped = df_mentee_topic_prefs.groupby(lbl_mentee).size()
mentee_name = grouped[grouped >= 2].index.tolist()[0]
mentee_name

u'A. Einstein'

In [15]:
# Fix a mentee, in general we should loop over all of them
df_mentee_topic_prefs_example = df_mentee_topic_prefs[df_mentee_topic_prefs[lbl_mentee] == mentee_name]
df_mentee_prefs_example = df_mentee_prefs[df_mentee_prefs[lbl_mentee] == mentee_name]

In [16]:
df_mentee_topic_prefs_example

Unnamed: 0,mentee,cluster,cluster_pref
0,A. Einstein,learning_learner_metalearning,0.740571
1,A. Einstein,secure_encryption_authentication,0.290575
2,A. Einstein,robot_robotic_endeffectors,0.119066


In [17]:
df_mentee_prefs_example

Unnamed: 0,mentee,num_pubs_pref,pub_rate_pref,years_exp_pref,cite_rank_pref
0,A. Einstein,0.007275,0.406245,0.261203,0.325278


Load mentors and their expertise level ranks.

In [18]:
df_mentor_ranks = pd.read_csv(f_mentor_ranks, sep=',')

In [19]:
mentee_clusters = set(df_mentee_topic_prefs_example[lbl_cluster])
df_mentee_mentor_ranks = df_mentor_ranks[df_mentor_ranks[lbl_cluster].isin(mentee_clusters)]
df_mentee_mentor_ranks.head()

Unnamed: 0,mentor,cluster,num_pubs,pub_rate,years_exp,cite_rank
34,A. Affanni,robot_robotic_endeffectors,0.014706,0.047619,0.018519,0.0
37,A. Agung Julius,robot_robotic_endeffectors,0.029412,0.047619,0.037037,0.0
57,A. Ashok Kumar,robot_robotic_endeffectors,0.014706,0.047619,0.018519,0.0
67,A. B. M. Alim Al Islam,secure_encryption_authentication,0.014706,0.047619,0.018519,0.0
111,A. D. Amar,learning_learner_metalearning,0.014706,0.047619,0.018519,0.0


In [20]:
mentee_clusters

{u'learning_learner_metalearning',
 u'robot_robotic_endeffectors',
 u'secure_encryption_authentication'}

Now calculate mentors' scores based on mentee's prefs.

In [21]:
def weighted_avg(mentor_row):
    mentee_row = df_mentee_prefs_example.iloc[0]
    avg = mentee_row[lbl_num_pubs_pref] * mentor_row[lbl_num_pubs]
    avg += mentee_row[lbl_pub_rate_pref] * mentor_row[lbl_pub_rate]
    avg += mentee_row[lbl_years_exp_pref] * mentor_row[lbl_years_exp]
    avg += mentee_row[lbl_rank_pref] * mentor_row[lbl_rank]
    return avg

df_mentee_mentor_ranks[lbl_cluster_mentor_weight] = df_mentee_mentor_ranks.apply(weighted_avg, axis=1)
df_mentee_mentor_ranks.head()

Unnamed: 0,mentor,cluster,num_pubs,pub_rate,years_exp,cite_rank,cluster_mentor_weight
34,A. Affanni,robot_robotic_endeffectors,0.014706,0.047619,0.018519,0.0,0.024289
37,A. Agung Julius,robot_robotic_endeffectors,0.029412,0.047619,0.037037,0.0,0.029233
57,A. Ashok Kumar,robot_robotic_endeffectors,0.014706,0.047619,0.018519,0.0,0.024289
67,A. B. M. Alim Al Islam,secure_encryption_authentication,0.014706,0.047619,0.018519,0.0,0.024289
111,A. D. Amar,learning_learner_metalearning,0.014706,0.047619,0.018519,0.0,0.024289


Now remove the four KPI columns as we have obtained an aggregate score.

In [22]:
df_mentee_mentor_ranks.drop([lbl_num_pubs,
                             lbl_pub_rate,
                             lbl_years_exp,
                             lbl_rank], axis=1, inplace=True)
df_mentee_mentor_ranks.head()

Unnamed: 0,mentor,cluster,cluster_mentor_weight
34,A. Affanni,robot_robotic_endeffectors,0.024289
37,A. Agung Julius,robot_robotic_endeffectors,0.029233
57,A. Ashok Kumar,robot_robotic_endeffectors,0.024289
67,A. B. M. Alim Al Islam,secure_encryption_authentication,0.024289
111,A. D. Amar,learning_learner_metalearning,0.024289


Normalize and reverse the score values since we are going to find shortest paths from mentee to mentors

In [23]:
scaler = MinMaxScaler()
df_mentee_mentor_ranks[lbl_cluster_mentor_weight] = 1 - scaler.fit_transform(
                                                            df_mentee_mentor_ranks[lbl_cluster_mentor_weight])
df_mentee_topic_prefs_example[lbl_cluster_pref] = 1 - scaler.fit_transform(
                                                            df_mentee_topic_prefs_example[lbl_cluster_pref])

## Construct the mentee-topics-mentors graph

In [24]:
# Initialize an empty directed graph
G = nx.DiGraph()

Link the mentee to the topics based on his preferences

In [25]:
# Array of triplets containing (source, destination, weight)
mentee_topics = df_mentee_topic_prefs_example[[lbl_mentee, lbl_cluster, lbl_cluster_pref]].values
G.add_weighted_edges_from(mentee_topics)

In [26]:
G.nodes()

[u'robot_robotic_endeffectors',
 u'learning_learner_metalearning',
 u'secure_encryption_authentication',
 u'A. Einstein']

In [27]:
G.edge

{u'A. Einstein': {u'learning_learner_metalearning': {'weight': 0.0},
  u'robot_robotic_endeffectors': {'weight': 1.0},
  u'secure_encryption_authentication': {'weight': 0.7240430403540213}},
 u'learning_learner_metalearning': {},
 u'robot_robotic_endeffectors': {},
 u'secure_encryption_authentication': {}}

Link topics (clusters) to mentors and assign weight

In [28]:
topics_mentors = df_mentee_mentor_ranks[[lbl_cluster,
                                         lbl_mentor,
                                         lbl_cluster_mentor_weight]].values
G.add_weighted_edges_from(topics_mentors)

In [29]:
topics_mentors[:5]

array([['robot_robotic_endeffectors', 'A. Affanni', 0.9275838665326375],
       ['robot_robotic_endeffectors', 'A. Agung Julius',
        0.9128245332841624],
       ['robot_robotic_endeffectors', 'A. Ashok Kumar',
        0.9275838665326375],
       ['secure_encryption_authentication', 'A. B. M. Alim Al Islam',
        0.9275838665326375],
       ['learning_learner_metalearning', 'A. D. Amar',
        0.9275838665326375]], dtype=object)

In [30]:
len(G.edges())

21701

Now add a super node linking all mentors, that will act as a sink for the paths of the mentees

In [31]:
lbl_sink = 'sink'

In [32]:
G.add_weighted_edges_from(
    [(mentor, lbl_sink, 0) for mentor in set(df_mentee_mentor_ranks[lbl_mentor])])

In [33]:
len(G.edges())

42854

In [34]:
shortest_paths = [p for p in nx.all_shortest_paths(G, source=mentee_name, target=lbl_sink, weight='weight')]

In [35]:
len(shortest_paths)

1

This is one of the shortest paths from mentee to topic to mentor

In [36]:
shortest_paths[0]

[u'A. Einstein', u'learning_learner_metalearning', 'Ronald R. Yager', 'sink']

In [37]:
df_mentee_mentor_ranks[
    df_mentee_mentor_ranks[lbl_cluster] == shortest_paths[0][1]] \
    .sort_values(by=lbl_cluster_mentor_weight).head()

Unnamed: 0,mentor,cluster,cluster_mentor_weight
177410,Ronald R. Yager,learning_learner_metalearning,0.420118
117790,Lawrence Carin,learning_learner_metalearning,0.47744
88252,James C. Lester,learning_learner_metalearning,0.486434
66663,Gerhard Widmer,learning_learner_metalearning,0.490444
186158,Sergey Levine,learning_learner_metalearning,0.49222


In [38]:
df_mentee_mentor_ranks[df_mentee_mentor_ranks[lbl_cluster_mentor_weight] == 0].shape

(1, 3)

In [39]:
set(df_mentee_mentor_ranks[df_mentee_mentor_ranks[lbl_cluster_mentor_weight] == 0][lbl_cluster])

{'secure_encryption_authentication'}

In case there are multiple choices of "best" mentors, we can select 10 random ones among them.

In [40]:
n_mentor_choices = min(10, len(shortest_paths))
mentors_indices = np.random.randint(len(shortest_paths), size=n_mentor_choices)
mentors_chosen = np.array(shortest_paths, dtype=str)[mentors_indices, 2]
mentors_chosen

array(['Ronald R. Yager'], dtype='|S29')

In [41]:
mentee_mentors_match = pd.DataFrame([[mentee_name, mentor] for mentor in mentors_chosen],
                                    columns=[lbl_mentee, lbl_mentor])
mentee_mentors_match.head()

Unnamed: 0,mentee,mentor
0,A. Einstein,Ronald R. Yager


In [42]:
f_mentee_mentors_match = path + 'mentee_mentors_match.csv'
mentee_mentors_match.to_csv(f_mentee_mentors_match, sep=',', index=False, encoding='utf-8')

## Simulation over a set of mentees from the dblp dataset

In [43]:
df_mentee_prefs, df_mentee_topic_prefs = load_input_mentees(False)

In [44]:
df_mentee_prefs.head()

Unnamed: 0,mentee,num_pubs_pref,pub_rate_pref,years_exp_pref,cite_rank_pref
0,(David) Jing Dai,0.117496,0.777833,0.529473,0.711975
1,A. A. Louis Beex,0.428107,0.493248,0.332515,0.586327
2,A. A. Shpiganovich,0.182939,0.31404,0.985632,0.088795
3,A. Abdul Khadar,0.898308,0.397429,0.322616,0.109706
4,A. Abdul Rahim,0.814334,0.07057,0.109393,0.110909


In [45]:
df_mentee_topic_prefs.head()

Unnamed: 0,mentee,cluster,cluster_pref
0,(David) Jing Dai,budapest_conference_workshop,0.020211
1,A. A. Louis Beex,whitening_prewhitening_subband,0.020211
2,A. A. Shpiganovich,circuit_lcvco_switchedcapacitor,0.020211
3,A. Abdul Khadar,scheduling_qos_routing,0.020211
4,A. Abdul Rahim,educational_elearning_education,0.020211


For the simulation, select the top 300 mentees by number of topics preferred.

In [46]:
n_mentees = 300
mentee_names = df_mentee_topic_prefs.groupby(lbl_mentee).size()\
                    .sort_values(ascending=False) \
                    .index.tolist()[:n_mentees]
mentee_names[:10]

['Alwin Zulehner',
 'Yong Zhang 0012',
 'Richard A. Farneth',
 'Yoshio Rubio',
 'Philipp Terh\xc3\xb6rst',
 'Siyu Liao',
 'Rajesh Kumar M',
 'Miralda Cuka',
 'Yuying Zhu',
 'Klayton Castro']

Run the mentor-mentee recommendation algorithm on the selected mentees and write the result to a CSV file

In [47]:
mentee_mentors_match = pd.DataFrame(columns=[lbl_mentee, lbl_mentor])

for mentee_name in mentee_names:
    # Filter by mentee_name
    df_mentee_topic_prefs_example = df_mentee_topic_prefs[df_mentee_topic_prefs[lbl_mentee] == mentee_name]
    df_mentee_prefs_example = df_mentee_prefs[df_mentee_prefs[lbl_mentee] == mentee_name]   
    mentee_clusters = set(df_mentee_topic_prefs_example[lbl_cluster])
    
    df_mentee_mentor_ranks = df_mentor_ranks[df_mentor_ranks[lbl_cluster].isin(mentee_clusters)]
    df_mentee_mentor_ranks[lbl_cluster_mentor_weight] = df_mentee_mentor_ranks.apply(weighted_avg, axis=1)
    df_mentee_mentor_ranks.drop([lbl_num_pubs,
                                 lbl_pub_rate,
                                 lbl_years_exp,
                                 lbl_rank], axis=1, inplace=True)
    df_mentee_mentor_ranks.head()

    # Normalize and reverse the score values since we are going to find shortest paths from mentee to mentors
    scaler = MinMaxScaler()
    df_mentee_mentor_ranks[lbl_cluster_mentor_weight] = 1 - scaler.fit_transform(
                                                                df_mentee_mentor_ranks[lbl_cluster_mentor_weight])
    df_mentee_topic_prefs_example[lbl_cluster_pref] = 1 - scaler.fit_transform(
                                                                df_mentee_topic_prefs_example[lbl_cluster_pref])
    # Construct the mentee-topics-mentors graph
    # Initialize an empty directed graph
    G = nx.DiGraph()
    # Link the mentee to the topics based on his preferences
    # Array of triplets containing (source, destination, weight)
    mentee_topics = df_mentee_topic_prefs_example[[lbl_mentee, lbl_cluster, lbl_cluster_pref]].values
    G.add_weighted_edges_from(mentee_topics)
    # Link topics (clusters) to mentors and assign weight
    topics_mentors = df_mentee_mentor_ranks[[lbl_cluster,
                                             lbl_mentor,
                                             lbl_cluster_mentor_weight]].values
    G.add_weighted_edges_from(topics_mentors)
    G.add_weighted_edges_from([(mentor, lbl_sink, 0) for mentor in set(df_mentee_mentor_ranks[lbl_mentor])])

    # Calculate all shortest paths from mentee to super sink
    shortest_paths = [p for p in nx.all_shortest_paths(G, source=mentee_name, target=lbl_sink, weight='weight')]
    
    # In case there are multiple choices of "best" mentors, we can select 10 random ones among them.
    n_mentor_choices = min(10, len(shortest_paths))
    mentors_indices = np.random.randint(len(shortest_paths), size=n_mentor_choices)
    mentors_chosen = np.array(shortest_paths, dtype=str)[mentors_indices, 2]

    mentee_mentors_match = pd.concat([mentee_mentors_match,
                                      pd.DataFrame([[mentee_name, mentor] for mentor in mentors_chosen],
                                        columns=[lbl_mentee, lbl_mentor])], ignore_index=True)
                      
# Write result to file
f_mentee_mentors_match = path + 'mentee_mentors_match.csv'
mentee_mentors_match.to_csv(f_mentee_mentors_match, sep=',', index=False, encoding='utf-8')