# 1. Initialize and import libraries

In [1]:
import pandas as pd
import numpy as np
import os
import sys
import string
import nltk
import pickle
from time import time
import networkx as nx
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings(action='ignore')

from gensim.models import Word2Vec

In [2]:
sys.path.insert(0, 'pmi_utils')

In [3]:
import shared_variables
from shared_variables import *

In [4]:
shared_variables = reload(shared_variables)

In [5]:
# read the serialized sample
df_sample_dblp = pd.read_json(f_sample_dblp)
# read the whole dblp file
df_json_dblp = pd.read_json(f_json_dblp)
df_json_dblp.columns = columns
# read the whole dblp file unpacked by author
df_json_by_author = pd.read_csv(f_json_by_author)

In [6]:
df_json_dblp.columns = columns

In [7]:
df_sample_dblp.shape

(614193, 9)

In [8]:
df_json_dblp.shape

(6141934, 9)

In [9]:
df_json_by_author.shape

(13855925, 2)

In [10]:
# Remove paper ids used for training the models
paper_id_train = set(df_sample_dblp[lbl_paper_id])
df_json_dblp = df_json_dblp[~df_json_dblp[lbl_paper_id].isin(paper_id_train)]
df_json_by_author = df_json_by_author[~df_json_by_author[lbl_paper_id].isin(paper_id_train)]

In [11]:
df_json_dblp.shape

(5527741, 9)

In [12]:
df_json_by_author.shape

(12469567, 2)

In [13]:
df_dblp_by_author = pd.merge(df_json_dblp, df_json_by_author, on=lbl_paper_id)

# 2. Generate test set

Now identify the top publishing authors on a random 10% sample of the dataset (sampling keeps the running time manageable).

In [15]:
df_sample_json_dblp_by_author = df_dblp_by_author.sample(frac=0.1)
df_sample_json_dblp_by_author.shape

(1246957, 10)

In [16]:
pubs_per_author = df_sample_json_dblp_by_author.groupby(lbl_author)[lbl_paper_id].count()
pubs_per_author = pubs_per_author[pubs_per_author >= 10]
pubs_per_author.describe()

count    12817.000000
mean        16.238979
std          8.715565
min         10.000000
25%         11.000000
50%         13.000000
75%         18.000000
max        137.000000
Name: paper_id, dtype: float64

In [17]:
# Only get their names
top_publishing_authors = pubs_per_author.sample(n=100).index.tolist()
# Filter dataset on them
df_test = df_sample_json_dblp_by_author[df_sample_json_dblp_by_author[lbl_author].isin(top_publishing_authors)]
df_test.shape

(1636, 10)

For each author, take a sample of up to 20 publications and predict their clusters with the pre-trained k-means model.

In [18]:
# Collect, for every author in the test set, the ids of at most 20 randomly
# chosen papers. Iterating the groupby object directly yields (author, sub-frame)
# pairs and works on both Python 2 and Python 3 (dict.iteritems() is Python 2 only).
selected_pubs = set()
for author, df_test_author in df_test.groupby(lbl_author):
    # An author may have fewer than 20 papers in the sample: take what is there.
    sample_size = min(20, len(df_test_author))
    # update() extends the set in place instead of rebuilding it on every iteration.
    selected_pubs.update(df_test_author[lbl_paper_id].sample(n=sample_size))
len(selected_pubs)

1376

In [19]:
# Filter df_test on selected pubs
df_test = df_test[df_test[lbl_paper_id].isin(selected_pubs)]
df_test.shape

(1376, 10)

In [20]:
df_test[lbl_title].head()

12397553    Conceptual Development of Mental Health Ontolo...
2962586     A multi-objective memetic algorithm based on d...
10313131    Privacy intrusion detection using dynamic Baye...
7507111     A fast IP classification algorithm applying to...
5201197     A framework for quality-based biometric classi...
Name: title, dtype: object

# 3. Testing assigned topics

Clean and preprocess text for word2vec

In [21]:
# Clean text function
def clean_text(text, stopwords=None):
    """Normalise a Series of raw titles before tokenisation.

    Each value goes through the following steps, in this order:
    ASCII punctuation is deleted, digits are deleted, words of length <= 1
    are dropped, the text is lower-cased and stopwords are removed.
    Missing values (None / NaN) become the empty string.

    Parameters
    ----------
    text : pandas.Series
        Raw strings (missing values allowed).
    stopwords : collection of str, optional
        Lower-case words to remove. Defaults to the notebook-level ``stopw``
        collection, which keeps existing ``clean_text(series)`` calls unchanged.

    Returns
    -------
    pandas.Series
        Cleaned strings, with words separated by single spaces.
    """
    if stopwords is None:
        stopwords = stopw

    def clean_sentence(sentence):
        if pd.isnull(sentence):
            return ""
        # Punctuation is deleted rather than replaced by a space, so
        # "multi-objective" becomes the single token "multiobjective".
        for p in string.punctuation:
            sentence = sentence.replace(p, '')
        sentence = ''.join(c for c in sentence if not c.isdigit())
        # The length filter runs after digit removal so leftovers such as
        # the "D" of "3D" are discarded as well.
        words = [word.lower() for word in sentence.split() if len(word) > 1]
        return ' '.join(word for word in words if word not in stopwords)

    # One pass over the Series instead of one pass per cleaning step.
    return text.apply(clean_sentence)

In [22]:
words_test = clean_text(df_test[lbl_title]) \
                .apply(lambda line: line.split() if pd.notnull(line) else [])

In [23]:
def load_most_recent_model(dir_word2vec_models, model_prefix='model_'):
    """Load the most recent serialized Word2Vec model found in a directory.

    Model files are expected to be named ``<model_prefix><timestamp>``, where
    the timestamp is an integer (e.g. ``model_1520587779``). Passing another
    directory and prefix loads models saved by someone else.

    Parameters
    ----------
    dir_word2vec_models : str
        Directory that contains the serialized models.
    model_prefix : str
        File-name prefix that identifies the models of interest.

    Returns
    -------
    The model with the highest timestamp, or None when no matching file
    exists or the newest file cannot be loaded.
    """
    candidates = []
    for filename in os.listdir(dir_word2vec_models):
        # '.npy' files are skipped (presumably array side-files stored next to
        # the model itself -- TODO confirm).
        if not filename.startswith(model_prefix) or filename.endswith('.npy'):
            continue
        # Slice the prefix off. str.lstrip() would strip a *set of characters*,
        # not a prefix, and could eat the beginning of the remaining name.
        suffix = filename[len(model_prefix):]
        # Ignore files that share the prefix but do not end in a pure timestamp
        # instead of crashing in int().
        if suffix.isdigit():
            candidates.append((int(suffix), filename))

    if not candidates:
        return None

    # Reuse the real file name of the newest model (keeps leading zeros intact)
    # and let os.path.join take care of the directory separator.
    newest_filename = max(candidates)[1]
    model_path = os.path.join(dir_word2vec_models, newest_filename)
    try:
        return Word2Vec.load(model_path)
    except Exception:
        # Best effort, as before: an unreadable model yields None. Unlike a
        # bare except, KeyboardInterrupt / SystemExit still propagate.
        return None

In [24]:
model = load_most_recent_model(dir_word2vec_models, model_prefix='abstract_model_')

In [25]:
model

<gensim.models.word2vec.Word2Vec at 0x1fc4d50b8>

In [26]:
vec_len = model.vector_size
vec_len

200

In [27]:
model.wv.most_similar('network')

[(u'networks', 0.8393702507019043),
 (u'subnetwork', 0.5993532538414001),
 (u'subnet', 0.5914520025253296),
 (u'internetwork', 0.5871062874794006),
 (u'network\u2019s', 0.5801593065261841),
 (u'endhosts', 0.5662047863006592),
 (u'subnets', 0.5600781440734863),
 (u'network\u201d', 0.5502232313156128),
 (u'networking', 0.5386593341827393),
 (u'backbone', 0.5377504825592041)]

In [28]:
def avg_vector_by_words(words, w2v_model=None, vector_size=None):
    """Represent a tokenised title as the mean of its word vectors.

    Parameters
    ----------
    words : iterable of str
        Words of one title. A non-iterable value (e.g. None or NaN for a
        missing title) is treated like an empty title.
    w2v_model : optional
        Object exposing word vectors through ``w2v_model.wv[word]``.
        Defaults to the notebook-level ``model``.
    vector_size : int, optional
        Length of the zero vector returned when no word can be looked up.
        Defaults to the notebook-level ``vec_len``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the known words, or a zero
        vector of length ``vector_size`` when there is none.
    """
    if w2v_model is None:
        w2v_model = model
    if vector_size is None:
        vector_size = vec_len

    try:
        tokens = list(words)
    except TypeError:
        # Not iterable: nothing to average.
        return np.zeros(vector_size)

    vec_words = []
    for word in tokens:
        try:
            vec_words.append(w2v_model.wv[word])
        except KeyError:
            # Out-of-vocabulary word: skip it. Any other error (for example a
            # model that failed to load) is deliberately NOT swallowed, so it
            # cannot silently turn every title into a zero vector.
            continue

    if not vec_words:
        return np.zeros(vector_size)
    return np.mean(vec_words, axis=0)

In [29]:
# One averaged word vector per test title.
titles_test = words_test.apply(avg_vector_by_words)
# Stack the per-title vectors into a plain 2-D ndarray (one row per title).
# np.matrix (what np.asmatrix returns) is discouraged by NumPy and is not
# accepted by recent scikit-learn estimators.
X_test = np.array(titles_test.tolist())

In [30]:
# Load the pre-trained k-means model.
# Pickles are binary data: open in 'rb' mode and close the handle deterministically.
# NOTE(review): pickle.load can execute arbitrary code -- only load files from a trusted source.
with open(f_kmeans, 'rb') as kmeans_file:
    kmeans = pickle.load(kmeans_file)
kmeans

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=30, n_init=10, n_jobs=-1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [31]:
k = kmeans.n_clusters

In [32]:
y_test = kmeans.predict(X_test)
y_test

array([16, 14, 29, ..., 21, 29,  1])

In [33]:
# Retrieve cluster names
cluster_id_name = pd.read_csv(f_cluster_id_name, sep=',', index_col=0)

In [34]:
cluster_id_name.head()

Unnamed: 0,cluster
0,robot_robotic_endeffectors
1,specification_declarative_abstractions
2,software_componentbased_enterprise
3,recognition_classification_segmentation
4,learning_learner_metalearning


In [35]:
# Assign cluster names to test papers.
# The first (only) data column of cluster_id_name holds the names, indexed by cluster id.
cluster_names = cluster_id_name.iloc[:, 0]
# df_test was obtained by boolean filtering; work on an explicit copy so the new
# column is not written into a possible view of the parent frame.
df_test = df_test.copy()
# Single label-based lookup for all predictions instead of a chained
# .loc[x][0] access per row; a missing cluster id still raises KeyError.
df_test[lbl_cluster] = cluster_names.loc[y_test].values
df_test[[lbl_author, lbl_title, lbl_cluster]].head()

Unnamed: 0,author,title,cluster
12397553,Jitender S. Deogun,Conceptual Development of Mental Health Ontolo...,educational_elearning_education
2962586,Jing Liu,A multi-objective memetic algorithm based on d...,heuristic_algorithm_algorithms
10313131,Nick Cercone,Privacy intrusion detection using dynamic Baye...,clustering_classifier_classification
7507111,Jianping Wu,A fast IP classification algorithm applying to...,clustering_classifier_classification
5201197,Richa Singh,A framework for quality-based biometric classi...,recognition_classification_segmentation


In [36]:
# Assign clusters to authors and compare with the train dataset ones
df_author_test = df_test[[lbl_author, lbl_cluster]].drop_duplicates()
df_author_test.head()

Unnamed: 0,author,cluster
12397553,Jitender S. Deogun,educational_elearning_education
2962586,Jing Liu,heuristic_algorithm_algorithms
10313131,Nick Cercone,clustering_classifier_classification
7507111,Jianping Wu,clustering_classifier_classification
5201197,Richa Singh,recognition_classification_segmentation


In [37]:
df_author_test.shape

(714, 2)

In [38]:
df_author_train = pd.read_csv(f_author_ranks, sep=',', usecols=[lbl_author, lbl_cluster])
df_author_train.head()

Unnamed: 0,author,cluster
0,(David) Jing Dai,budapest_conference_workshop
1,A Min Tjoa,analysis_methodology_modeling
2,A Min Tjoa,data_metadata_database
3,A Min Tjoa,educational_elearning_education
4,A-Long Jin,channel_multiantenna_mimo


In [39]:
df_author_train.shape

(310144, 2)

In [40]:
# Filter training set by authors in test set
df_author_train = df_author_train[df_author_train[lbl_author].isin(df_author_test[lbl_author])]
df_author_train.shape

(321, 2)

In [41]:
df_author_train.head(10)

Unnamed: 0,author,cluster
6646,Alessandro E. P. Villa,control_closedloop_controller
9093,Ali Jadbabaie,control_closedloop_controller
9094,Ali Jadbabaie,heuristic_algorithm_algorithms
9095,Ali Jadbabaie,learning_learner_metalearning
9096,Ali Jadbabaie,network_multihop_wireless
9097,Ali Jadbabaie,scheduling_qos_routing
9098,Ali Jadbabaie,services_infrastructure_middleware
10411,Amar Mitiche,data_metadata_database
10412,Amar Mitiche,microsphere_transmural_peristalsis
18038,Antonis A. Argyros,image_texture_images


Check if test set (author, cluster) pairs are contained in the training set

In [42]:
df_author_train_test = pd.merge(df_author_train, df_author_test, on=[lbl_author, lbl_cluster])
df_author_train_test.shape

(199, 2)

In [43]:
df_author_train_test.head()

Unnamed: 0,author,cluster
0,Alessandro E. P. Villa,control_closedloop_controller
1,Ali Jadbabaie,control_closedloop_controller
2,Ali Jadbabaie,scheduling_qos_routing
3,Antonis A. Argyros,image_texture_images
4,Antonis A. Argyros,interactive_interaction_mixedreality


Performance could be measured as: *(size of intersection between train and test) / (minimum between train and test sizes)*

In [44]:
topic_inference_performance = 1. * len(df_author_train_test) / \
                                min(len(df_author_train), len(df_author_test))
topic_inference_performance

0.6199376947040498

# 4. Testing recommendation algorithm

Now pick an author from the test set, treat them as a mentee, and see which mentor they are matched with.

In [45]:
mentee_test = df_author_train_test.sample(n=1)[lbl_author].iloc[0]
mentee_test

'Maurice Herlihy'

In [46]:
df_author_train_test[df_author_train_test[lbl_author] == mentee_test]

Unnamed: 0,author,cluster
89,Maurice Herlihy,hardware_multiprocessor_multicore
90,Maurice Herlihy,infinitary_finitary_equational
179,Maurice Herlihy,network_multihop_wireless
180,Maurice Herlihy,scheduling_qos_routing


Load the mentor ranks

In [47]:
df_mentor_ranks = pd.read_csv(f_mentor_ranks, sep=',')
df_mentor_ranks.head()

Unnamed: 0,mentor,cluster,num_pubs,pub_rate,years_exp,cite_rank
0,A Min Tjoa,analysis_methodology_modeling,0.014706,0.047619,0.018519,0.0
1,A Min Tjoa,data_metadata_database,0.088235,0.047619,0.388889,0.01449
2,A Min Tjoa,educational_elearning_education,0.029412,0.047619,0.203704,0.000479
3,A-Long Jin,channel_multiantenna_mimo,0.014706,0.047619,0.018519,0.0
4,A-Long Jin,services_infrastructure_middleware,0.014706,0.047619,0.018519,0.0


In [48]:
df_mentor_ranks[df_mentor_ranks[lbl_mentor] == mentee_test]

Unnamed: 0,mentor,cluster,num_pubs,pub_rate,years_exp,cite_rank
137208,Maurice Herlihy,hardware_multiprocessor_multicore,0.147059,0.047619,0.240741,0.0
137209,Maurice Herlihy,infinitary_finitary_equational,0.044118,0.047619,0.481481,0.000287
137210,Maurice Herlihy,llamada_restent_keinen,0.044118,0.095238,0.351852,0.0
273049,Maurice Herlihy,network_multihop_wireless,0.0,0.0,0.0,0.008829
273050,Maurice Herlihy,scheduling_qos_routing,0.0,0.0,0.0,0.012771


Let's construct a mentee preference profile that is mostly interested in the topic *infinitary_finitary_equational* and assigns the highest weight to the KPI *years_exp*.

In [49]:
# Slightly change the name of mentee_test in order to distinguish it from the mentor
mentee_test += '_mentee'

In [50]:
df_mentee_test_prefs = pd.DataFrame({lbl_mentee: [mentee_test],
                                     lbl_num_pubs_pref: [0.1],
                                     lbl_pub_rate_pref: [0.1],
                                     lbl_years_exp_pref: [0.8],
                                     lbl_rank_pref: [0]})
df_mentee_test_prefs.head()

Unnamed: 0,cite_rank_pref,mentee,num_pubs_pref,pub_rate_pref,years_exp_pref
0,0,Maurice Herlihy_mentee,0.1,0.1,0.8


In [51]:
df_mentee_test_topic_prefs = pd.DataFrame(
                                [[mentee_test, 'infinitary_finitary_equational', 7],
                                 [mentee_test, 'network_multihop_wireless', 2],
                                 [mentee_test, 'llamada_restent_keinen', 1]],
                                columns=[lbl_mentee, lbl_cluster, lbl_cluster_pref])
df_mentee_test_topic_prefs.head()

Unnamed: 0,mentee,cluster,cluster_pref
0,Maurice Herlihy_mentee,infinitary_finitary_equational,7
1,Maurice Herlihy_mentee,network_multihop_wireless,2
2,Maurice Herlihy_mentee,llamada_restent_keinen,1


Calculate the topic-mentor weight based on mentee_test prefs

In [52]:
df_mentee_test_prefs

Unnamed: 0,cite_rank_pref,mentee,num_pubs_pref,pub_rate_pref,years_exp_pref
0,0,Maurice Herlihy_mentee,0.1,0.1,0.8


In [53]:
def weighted_avg(mentor_row):
    """Score one mentor row with the test mentee's KPI preferences.

    Each KPI value of ``mentor_row`` (number of publications, publication
    rate, years of experience, citation rank) is multiplied by the matching
    preference weight taken from the first row of ``df_mentee_test_prefs``;
    the four products are summed and returned.
    """
    prefs = df_mentee_test_prefs.iloc[0]
    num_pubs_term = prefs[lbl_num_pubs_pref] * mentor_row[lbl_num_pubs]
    pub_rate_term = prefs[lbl_pub_rate_pref] * mentor_row[lbl_pub_rate]
    years_exp_term = prefs[lbl_years_exp_pref] * mentor_row[lbl_years_exp]
    rank_term = prefs[lbl_rank_pref] * mentor_row[lbl_rank]
    return num_pubs_term + pub_rate_term + years_exp_term + rank_term

In [54]:
# Keep only the mentor rankings for the topics (clusters) the mentee listed.
# .copy() yields an independent frame, so the column changes below never write
# into a slice of df_mentor_ranks.
mentee_clusters = set(df_mentee_test_topic_prefs[lbl_cluster])
df_mentee_mentor_ranks = df_mentor_ranks[df_mentor_ranks[lbl_cluster].isin(mentee_clusters)].copy()

# Collapse the KPI columns into one preference-weighted score per (mentor, cluster) row.
df_mentee_mentor_ranks[lbl_cluster_mentor_weight] = df_mentee_mentor_ranks.apply(weighted_avg, axis=1)
df_mentee_mentor_ranks = df_mentee_mentor_ranks.drop([lbl_num_pubs,
                                                      lbl_pub_rate,
                                                      lbl_years_exp,
                                                      lbl_rank], axis=1)

# Normalize and reverse the score values since we are going to find shortest paths
# from mentee to mentors (best score -> smallest edge weight).
# The scaler expects 2-D input, hence the [[column]] selection and the ravel().
scaler = MinMaxScaler()
df_mentee_mentor_ranks[lbl_cluster_mentor_weight] = 1 - scaler.fit_transform(
    df_mentee_mentor_ranks[[lbl_cluster_mentor_weight]]).ravel()

# The reversed topic preferences go into a separate frame: overwriting
# df_mentee_test_topic_prefs in place would flip the preferences again every
# time this cell is re-run.
df_mentee_topic_edges = df_mentee_test_topic_prefs[[lbl_mentee, lbl_cluster, lbl_cluster_pref]].copy()
df_mentee_topic_edges[lbl_cluster_pref] = 1 - scaler.fit_transform(
    df_mentee_topic_edges[[lbl_cluster_pref]]).ravel()

# Construct the mentee-topics-mentors graph
# Initialize an empty directed graph
G = nx.DiGraph()
# Link the mentee to the topics based on his preferences
# Array of triplets containing (source, destination, weight)
mentee_topics = df_mentee_topic_edges[[lbl_mentee, lbl_cluster, lbl_cluster_pref]].values
G.add_weighted_edges_from(mentee_topics)
# Link topics (clusters) to mentors and assign weight
topics_mentors = df_mentee_mentor_ranks[[lbl_cluster,
                                         lbl_mentor,
                                         lbl_cluster_mentor_weight]].values
G.add_weighted_edges_from(topics_mentors)
# Zero-weight edges from every mentor to a single super sink, so that one
# shortest-path query covers all mentors at once.
G.add_weighted_edges_from([(mentor, lbl_sink, 0) for mentor in set(df_mentee_mentor_ranks[lbl_mentor])])

# Calculate all shortest paths from mentee to super sink
shortest_paths = list(nx.all_shortest_paths(G, source=mentee_test, target=lbl_sink, weight='weight'))

# In case there are multiple choices of "best" mentors, select up to 10 of them at random.
# replace=False guarantees that the same path is never picked twice
# (np.random.randint samples with replacement and could return duplicates).
n_mentor_choices = min(10, len(shortest_paths))
mentors_indices = np.random.choice(len(shortest_paths), size=n_mentor_choices, replace=False)
# Every path is [mentee, topic, mentor, sink]; column 2 is the mentor.
mentors_chosen = np.array(shortest_paths, dtype=str)[mentors_indices, 2]

In [55]:
shortest_paths

[['Maurice Herlihy_mentee',
  'infinitary_finitary_equational',
  'Janusz A. Brzozowski',
  'sink']]

The mentor, who was turned into a mentee for testing purposes, has been assigned to the best expert in *infinitary_finitary_equational*, as expected, since that topic was given the highest weight. He has not been assigned to himself because, in the end, he ranks lower in this topic than many other mentors.

In [56]:
df_mentee_mentor_ranks[
    df_mentee_mentor_ranks[lbl_cluster] == shortest_paths[0][1]] \
    .sort_values(by=lbl_cluster_mentor_weight).head()

Unnamed: 0,mentor,cluster,cluster_mentor_weight
89655,Janusz A. Brzozowski,infinitary_finitary_equational,0.0
218248,Werner Kuich,infinitary_finitary_equational,0.03065
18436,Arto Salomaa,infinitary_finitary_equational,0.031076
157441,Oscar H. Ibarra,infinitary_finitary_equational,0.08152
103432,Jozef Gruska,infinitary_finitary_equational,0.176781


In [57]:
df_mentee_mentor_ranks[df_mentee_mentor_ranks[lbl_mentor] == 'Maurice Herlihy']

Unnamed: 0,mentor,cluster,cluster_mentor_weight
137209,Maurice Herlihy,infinitary_finitary_equational,0.520687
137210,Maurice Herlihy,llamada_restent_keinen,0.640943
273049,Maurice Herlihy,network_multihop_wireless,1.0
