In [1]:
import networkx as nx
import nltk
import matplotlib as plt
import os
import pandas as pd
import regex as re
import numpy as np
import itertools
from difflib import SequenceMatcher

In [2]:
# Reading the file. "DiGraph" is telling to reading the data with node-node. "nodetype" will identify whether the node is number or string or any other type.

g = nx.read_edgelist("cit-HepTh.txt",create_using=nx.DiGraph(), nodetype = int)

# check if the data has been read properly or not.

nx.info(g)

# count the number of nodes

g.number_of_nodes()

# number of self-nodes

#g.selfloop_edges()

27770

In [3]:
# nx.write_edgelist(g, "cit_hepth.edgelist")
# nx.write_gexf(g, "cit-hepth.gexf")

In [4]:
# Get all the authors, abstracts etc
rootdir = '/Users/juliavanoosten/Documents/Studie/Msc. Applied Data Science/ADS Thesis/cit-HepTh/cit-HepTh-abstracts'
abstracts = []

for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        #print(os.path.join(subdir, file))
        path = os.path.join(subdir, file)
        with open(path) as f:
            try:
                lines = f.readlines()
                lines = ' '.join(lines)
                paper = re.search(r"(?<=Paper:)(((.|\n)*))(?=From:)",lines).group(0)
                #date = re.search(r"(?<=Date:)(((.|\n)*))(?=Title:)",lines).group(0)
                title = re.search(r"(?<=Title:)(((.|\n)*))(?=Authors:)",lines).group(0)
                authors = re.search(r"(?<=Authors:)(((.|\n)*))(?=Comments:)",lines).group(0)
                #comments = re.search(r"(?<=Comments:)(((.|\n)*))(?=\\{3,})",lines).group(0)
#                 article = {"paper":paper, "date":date,"title":title,"authors":authors,"comments":comments}
                article = {"paper":paper,"title":title,"authors":authors}
                abstracts.append(article.copy())
            except:
                continue


In [5]:
# Create dataframe from abstracts
df =  pd.DataFrame(abstracts)

In [6]:
# Some Cleaning
# Remove newlines
df = df.applymap(lambda x: str(x).replace("\n",""))
df["paper"] = df["paper"].apply(lambda x: str(x).replace("hep-th/",""))
df["authors"] = df["authors"].apply(lambda x: re.sub('\d', '', x))

# Split multiple authors and create multiple columnns
df["authors"] = df["authors"].apply(lambda x: re.split(", | and ",x))
author_df = pd.DataFrame(df["authors"].values.tolist()).add_prefix('author_')

In [7]:
# find the cause of 21 authors
df["num_authors"] = df["authors"].apply(lambda x: len(x)).sort_values()
df.sort_values(by="num_authors",ascending = False).head()

Unnamed: 0,paper,title,authors,num_authors
23911,208183,Generalized Bogoliubov Transformation for Con...,"[ J.C. da Silva (,), F.C. Khanna (,), A. Matos...",21
13942,9706081,Geometry of dynamics and phase transitions in...,"[ Lando Caiani (), Lapo Casetti (, ), Cecilia ...",20
21087,207082,Relativistic invariant Lie algebras for kinem...,"[ V. V. Khruschev (), A. N. Leznov (, , ) (() ...",18
22321,207152,"Boundary One-Point Functions, Scattering, and...","[ V.A. Fateev (, ), E. Onofri (, ) (() Laborat...",17
20809,9603139,Lie Algebras of Differential Operators and Pa...,"[ Federico Finkel(), Artemio Gonzalez-Lopez(),...",16


In [8]:
# Merge the author_df and the other one back together
combined_df = pd.concat([df, author_df], axis=1)
combined_df.head()

Unnamed: 0,paper,title,authors,num_authors,author_0,author_1,author_2,author_3,author_4,author_5,...,author_11,author_12,author_13,author_14,author_15,author_16,author_17,author_18,author_19,author_20
0,9301112,On Integrable c<1 Open--Closed String Theory,[ Clifford V. Johnson ],1,Clifford V. Johnson,,,,,,...,,,,,,,,,,
1,9303063,Schwinger Effect in String Theory,[ C.Bachas ],1,C.Bachas,,,,,,...,,,,,,,,,,
2,9308136,Proof of Jacobi identity in generalized quant...,"[ S.L. Adler, G.V. Bhanot, J.D. Weckel ]",3,S.L. Adler,G.V. Bhanot,J.D. Weckel,,,,...,,,,,,,,,,
3,9308122,"Mirror Symmetry, Mirror Map and Applications ...","[ S. Hosono, A. Klemm, S. Theisen ]",3,S. Hosono,A. Klemm,S. Theisen,,,,...,,,,,,,,,,
4,9303077,Abelian Anomalies in Nonlocal Regularization,"[ M. A. Clayton, L. Demopoulos, J. W. Moffat ]",3,M. A. Clayton,L. Demopoulos,J. W. Moffat,,,,...,,,,,,,,,,


In [9]:
# Check how many other outlier author formats are there
# Leave them out?
df["num_authors"].value_counts()

2     9441
1     7312
3     5298
4     1989
5      481
6      238
7       68
8       33
9       19
11       8
10       6
12       3
13       2
16       2
20       1
18       1
17       1
21       1
Name: num_authors, dtype: int64

In [10]:
# Find all unique authors and put them in a list
unique_authors = []
for column,data in author_df.iteritems():
    unique_authors.append(author_df[column].unique())

In [11]:
# TO DO: add cleaning + STRIP
unique_authors = list(set(np.concatenate(unique_authors).ravel().tolist()))

print("We have",sum(df["num_authors"]),"authors in the network of which",len(unique_authors),"are unique at first glance")
unique_authors[:20]

We have 55106 authors in the network of which 25466 are unique at first glance


['',
 'Kiyokazu Nagatomo (Osaka   University) ',
 'C. Ungarelli',
 ' A. Wipf',
 'and Masaki Shigemori ',
 ' V. Fateev',
 'Thomas Mohaupt',
 ' A. Giveon',
 ' Dietrich B\\"odeker',
 ' J.M. Isidro ',
 ' S. Sethi ',
 'C. D. Fosco',
 ' B. Kleihaus',
 'Kumar Rao ',
 'F. Rodenas ',
 ' RJ Cova ',
 'Seok-Jin Kang',
 'A. Jevicki',
 'Anton   Rebhan ',
 ' Masao Jinzenji']

In [12]:
# Remove nones, "and", university details and empty strings
def clean_authors(unique_authors):

    unique_authors = list(filter(None, unique_authors))
    unique_authors = [elem.replace("and ","") for elem in unique_authors]
    unique_authors = [elem.lower().strip() for elem in unique_authors]
    unique_authors = [re.sub(r'\(([^\)]+)\)', '', elem) for elem in unique_authors]
    return unique_authors

unique_authors = clean_authors(unique_authors)
# Create pairs
# author_pairs = list(itertools.combinations(unique_authors, 2))

In [13]:
def name(s):
  
    # split the string into a list 
    l = s.split()
    new = ""
  
    try:
        # traverse in the list 
        for i in range(len(l)-1):
            s = l[i]

            # adds the capital first character 
            new += (s[0].upper()+'. ')

        # l[-1] gives last item of list l. We
        # use title to print first character in
        # capital.
        new += l[-1].title()

        return new
    except:
        return s

In [14]:
#Get authors in universal format
unique_authors_df = pd.DataFrame(unique_authors)
unique_authors_df["initial_lastname"] = unique_authors_df.applymap(lambda x: name(x))

In [15]:
unique_authors_df.rename(columns={0: "full_name"},inplace=True)

In [16]:
len(unique_authors_df["full_name"])

25464

In [17]:
unique_authors_df.head()

Unnamed: 0,full_name,initial_lastname
0,kiyokazu nagatomo,K. Nagatomo
1,c. ungarelli,C. Ungarelli
2,a. wipf,A. Wipf
3,masaki shigemori,M. Shigemori
4,v. fateev,V. Fateev


In [18]:
combined_df.columns

Index(['paper', 'title', 'authors', 'num_authors', 'author_0', 'author_1',
       'author_2', 'author_3', 'author_4', 'author_5', 'author_6', 'author_7',
       'author_8', 'author_9', 'author_10', 'author_11', 'author_12',
       'author_13', 'author_14', 'author_15', 'author_16', 'author_17',
       'author_18', 'author_19', 'author_20'],
      dtype='object')

In [19]:
unique_authors_df['index_nr'] = range(0, len(unique_authors_df))

In [20]:
%%time

# Get the paper_ids to find out on which papers the unique authors have worked
paper_ids = []
combined_df_clean = combined_df.copy()

combined_df_clean["authors"] =combined_df_clean["authors"].apply(lambda x: clean_authors(x))
combined_df_clean["authors"]= combined_df_clean["authors"].apply(lambda x: " ".join(x))
combined_df_clean.head()
 
    
for author in unique_authors_df["full_name"]:
    try:
        paper_ids.append(list(combined_df_clean[combined_df_clean["authors"].str.contains(author)]["paper"]))
    except:
        paper_ids.append(np.nan)

  if sys.path[0] == '':


CPU times: user 6min 19s, sys: 2.88 s, total: 6min 22s
Wall time: 6min 31s


In [21]:
print(len(unique_authors_df), len(paper_ids))

25464 25464


In [22]:
unique_authors_df["affiliated_papers"] = paper_ids

In [23]:
unique_authors_df.head()

Unnamed: 0,full_name,initial_lastname,index_nr,affiliated_papers
0,kiyokazu nagatomo,K. Nagatomo,0,"[ 9704060 , 9706118 ]"
1,c. ungarelli,C. Ungarelli,1,"[ 9710188 , 9701146 , 9707053 , 9706221 ]"
2,a. wipf,A. Wipf,2,"[ 9308130 , 9310085 , 9306161 , 9308067 , ..."
3,masaki shigemori,M. Shigemori,3,"[ 0110035 , 0206080 , 0304138 , 0303104 ]"
4,v. fateev,V. Fateev,4,"[ 0001012 , 9709034 , 9702190 ]"


In [24]:
unique_authors_df.reset_index(inplace=True)

### LSH

In [25]:
words = unique_authors_df["full_name"]
#print(words)
words = [re.sub('[^\-0-9a-zA-Z]+', ' ', word).lower() for word in words]
words = [word.strip() for word in words]
unique_authors_df["cleaned"] = words
unique_authors_df = unique_authors_df.loc[unique_authors_df["cleaned"].str.len()>1]
unique_authors_df.drop_duplicates(subset=['cleaned'], keep='last',inplace=True)
words = list(unique_authors_df["cleaned"])
words = [word for word in words if len(word)>1]


In [26]:
def build_shingles(sentence: str, k: int): # had to adjust for shorter names
    shingles = []
    if len(sentence) <= k:
        shingles.append(sentence)
    else:
        for i in range(len(sentence) - k):
            shingles.append(sentence[i:i+k])
        
    return set(shingles)

def build_vocab(shingle_sets: list):
    # convert list of shingle sets into single set
    full_set = {item for set_ in shingle_sets for item in set_}
    vocab = {}
    for i, shingle in enumerate(list(full_set)):
        vocab[shingle] = i
    return vocab

def one_hot(shingles: set, vocab: dict):
    vec = np.zeros(len(vocab))
    for shingle in shingles:
        idx = vocab[shingle]
        vec[idx] = 1
    return vec

In [27]:
k = 3 # shingle size
# build shingles
shingles = []
for word in words:
    shingles.append(build_shingles(word,k))

# build vocab
vocab = build_vocab(shingles)

# one-hot encode our shingles
shingles_1hot = []
for shingle_set in shingles:
    shingles_1hot.append(one_hot(shingle_set,vocab))

shingles_1hot = np.stack(shingles_1hot)
shingles_1hot.shape

(14250, 7310)

In [28]:
shingles_1hot[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [29]:
sum(shingles_1hot[1])

14.0

In [30]:
def minhash_arr(vocab: dict, resolution: int):
    length = len(vocab.keys())
    arr = np.zeros((resolution, length))
    for i in range(resolution):
        permutation = np.random.permutation(len(vocab)) + 1
        arr[i, :] = permutation.copy()
    return arr.astype(int)

def get_signature(minhash, vector):
    # get index locations of every 1 value in vector
    idx = list(np.nonzero(vector))
    idx = idx[0]
    # use index locations to pull only +ve positions in minhash
    shingles = minhash[:, idx]
    # find minimum value in each hash vector
    signature = np.min(shingles, axis=1)
    return signature

In [31]:
print(words[844], shingles[844])

s khokhlachev {'kho', 'okh', 'khl', 'che', 'lac', ' kh', 'hla', 'ach', 'hok', 's k'}


In [32]:
print(words[1], shingles[1])

dietrich b odeker {'ch ', 'die', 'iet', 'b o', 'tri', 'etr', ' b ', 'dek', 'eke', 'ric', 'ich', ' od', 'ode', 'h b'}


In [33]:
arr = minhash_arr(vocab, 40)

signatures = []
index = 0

for vector in shingles_1hot:
    try:
        signatures.append(get_signature(arr, vector))
    except:
        print(vector, index) # put here to find the issue of empty vectors --> was caused by names >= than k-len
    index += 1


# merge signatures into single array
signatures = np.stack(signatures)
signatures.shape

(14250, 40)

In [34]:
signatures[0]

array([  12, 1551,   72,   72,  637, 1172,  320,  871,  646,  149,  884,
        173,  378,  310,  610,  563,   18,    7,   52,  212,   71,  247,
        102,   64,  696,   46,  298,  646,  178, 1508,  291,  105,   56,
        440,  382,  213, 1837,  438,  405,  264])

In [35]:
from itertools import combinations

class LSH:
    buckets = []
    counter = 0
    def __init__(self, b):
        self.b = b
        for i in range(b):
            self.buckets.append({})

    def make_subvecs(self, signature):
        l = len(signature)
        assert l % self.b == 0
        r = int(l / self.b)
        # break signature into subvectors
        subvecs = []
        for i in range(0, l, r):
            subvecs.append(signature[i:i+r])
        return np.stack(subvecs)

    def add_hash(self, signature):
        subvecs = self.make_subvecs(signature).astype(str)
        for i, subvec in enumerate(subvecs):
            subvec = ','.join(subvec)
            if subvec not in self.buckets[i].keys():
                self.buckets[i][subvec] = []
            self.buckets[i][subvec].append(self.counter)
        self.counter += 1

    def check_candidates(self):
        candidates = []
        for bucket_band in self.buckets:
            keys = bucket_band.keys()
            for bucket in keys:
                hits = bucket_band[bucket]
                if len(hits) > 1:
                    candidates.extend(combinations(hits, 2))
        return set(candidates)

In [36]:
b = 20

lsh = LSH(b)

for signature in signatures:
    lsh.add_hash(signature)

In [37]:
candidate_pairs = lsh.check_candidates()
len(candidate_pairs)

443751

In [38]:
list(candidate_pairs)[:5]

[(2538, 2974), (11168, 13088), (9723, 12853), (6228, 12350), (3358, 10461)]

In [39]:
len(candidate_pairs)

443751

### Calculating Scores

In [40]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [41]:
%%time
# TO DO KEEP INDEXES

# create pairs list converting hashed to names
candidate_pairs_lsh = candidate_pairs
candidate_pairs_names = []

for index, tuple in enumerate(candidate_pairs_lsh):
    word_index_1 = int(tuple[0])
    word_index_2 = int(tuple[1])
    candidate_pairs_names.append((words[word_index_1],words[word_index_2]))

CPU times: user 622 ms, sys: 22.3 ms, total: 644 ms
Wall time: 651 ms


In [42]:
%%time
edit_distances = []


# get edit distances score
for index, tuple in enumerate(candidate_pairs_names):
    edit_distances.append(nltk.edit_distance(tuple[0], tuple[1]))

CPU times: user 3min 4s, sys: 1.28 s, total: 3min 6s
Wall time: 3min 11s


In [43]:
# merge with candidate pairs dataframe
cpn_df = pd.concat((pd.DataFrame(candidate_pairs_names,columns=["candidate_1","candidate_2"]),pd.DataFrame(edit_distances, columns=["edit_dist"])),axis=1)
cpn_df

Unnamed: 0,candidate_1,candidate_2,edit_dist
0,khazret nirov,margaret e wessling,15
1,queen mary westfield,n dorey washington,15
2,k imilkowska,jan sladkowski,8
3,j d laenge,a d odintsov,8
4,stefan antusch,brian j h,10
...,...,...,...
443746,mikhail i dobroliubov,m k gaillard,16
443747,shihao chen,takashi mishima,12
443748,andrei marshakov,v c de andrade,13
443749,r garavuso,alberto garcia,11


In [44]:
# some checks
cpn_df= cpn_df[cpn_df["edit_dist"]>0]
cpn_df.sort_values(by=["edit_dist"],ascending=True)[:30]

Unnamed: 0,candidate_1,candidate_2,edit_dist
151750,y kimura,t kimura,1
360100,r h rietdijk,rh rietdijk,1
160577,p kosi nski,p kosinski,1
100885,matthias braendle,matthias brandle,1
252761,s deger,s deser,1
249994,s j rey,s -j rey,1
221584,g furlan,p furlan,1
93390,h kr o ger,h kr oger,1
43292,c lee,w lee,1
19920,t h ubsch,t hubsch,1


In [45]:
np.where(cpn_df["candidate_1"].str.strip() == cpn_df["candidate_2"].str.strip())

(array([], dtype=int64),)

In [46]:
cpn_df[cpn_df["edit_dist"]==1]

Unnamed: 0,candidate_1,candidate_2,edit_dist
390,a p veselov,a i veselov,1
2791,s-y pi,s -y pi,1
3595,h c lee,h w lee,1
4091,j meyer,h meyer,1
4272,e hernandez,r hernandez,1
...,...,...,...
441031,g nagao,t nagao,1
441241,a takahashi,h takahashi,1
441268,ulf lindstr om,ulf lindstr o m,1
441906,m z iofa nuclear physics institute,m z iofa nuclear physics intitute,1


In [47]:
fuzz.token_set_ratio("tom tom howard","tom howard")

100

In [48]:
fuzz.token_sort_ratio("tom tom howard","tom howard")

83

In [49]:
fuzz.token_set_ratio("o lebedev","d lebedev")

89

In [50]:
%%time
fuzzy_score = []

# calculate fuzzy scores, facilitating different order in sequence
for index, tuple in enumerate(candidate_pairs_names):
    fuzzy_score.append((fuzz.token_sort_ratio(tuple[0], tuple[1])+fuzz.token_set_ratio(tuple[0], tuple[1]))/2)

CPU times: user 19.7 s, sys: 182 ms, total: 19.9 s
Wall time: 20.7 s


In [51]:
cpn_df = pd.concat((cpn_df,pd.DataFrame(fuzzy_score, columns=["fuzzy_score"])),axis=1)

In [52]:
cpn_df.sort_values(by=["fuzzy_score"], ascending=False)[cpn_df["fuzzy_score"]!=100]

  """Entry point for launching an IPython kernel.


Unnamed: 0,candidate_1,candidate_2,edit_dist,fuzzy_score
441906,m z iofa nuclear physics institute,m z iofa nuclear physics intitute,1,99.0
61177,st a ephane ouvry division de physique th eorique,st ephane ouvry division de physique th eorique,2,99.0
359242,sergei m kuzenko institut fuer theoretische ph...,sergei kuzenko institut fuer theoretische physik,2,99.0
422225,norma manko v c bor v s tnik,norma manko v c bor v stnik,1,98.0
397049,m semenov-tian-shansky,m a semenov-tian-shansky,2,98.0
...,...,...,...,...
388590,m khlopov,adam krawiec,9,10.0
375064,t kopf,albrecht klemm,11,10.0
269746,d a owen,yutaka ookouchi,12,9.0
76918,l h ryder,subhash rajpoot,12,8.0


In [53]:
# get indexes
cpn = cpn_df.merge(unique_authors_df, left_on='candidate_1',right_on='cleaned')[["candidate_1","candidate_2","edit_dist","fuzzy_score","index_nr"]]
cpn.rename(columns={"index_nr":"index_nr_c1"},inplace=True)
cpn = cpn.merge(unique_authors_df, left_on='candidate_2',right_on='cleaned')[["candidate_1","candidate_2","edit_dist","index_nr_c1","fuzzy_score","index_nr"]]
cpn.rename(columns={"index_nr":"index_nr_c2"},inplace=True)

In [54]:
cpn['first_letter'] = np.where(cpn["candidate_1"].astype(str).str[0] == cpn["candidate_2"].astype(str).str[0], True, False)

In [55]:
%%time
# Function to find out if first letters of comparable names are within same sounding character range
first_letter_jy = []
first_letter_ck = []

for index, row in cpn.iterrows():
    if row["candidate_1"][0] in ["j","y"] and row["candidate_2"][0] in ["j","y"]:
        first_letter_jy.append(True)
        first_letter_ck.append(False)
    
    elif row["candidate_1"][0] in ["c","k"] and row["candidate_2"][0] in ["c","k"]:
        first_letter_jy.append(False)
        first_letter_ck.append(True)
        
    else:
        first_letter_jy.append(False)
        first_letter_ck.append(False)

cpn['first_letter_jy'] = first_letter_jy
cpn['first_letter_ck'] = first_letter_ck

CPU times: user 29.7 s, sys: 290 ms, total: 30 s
Wall time: 30.6 s


In [56]:
cpn.drop_duplicates(subset=None, keep='first', inplace=True, ignore_index=False)

In [57]:
cpn.sort_values(by="fuzzy_score",ascending=False)[cpn["fuzzy_score"]==97]

  """Entry point for launching an IPython kernel.


Unnamed: 0,candidate_1,candidate_2,edit_dist,index_nr_c1,fuzzy_score,index_nr_c2,first_letter,first_letter_jy,first_letter_ck
441734,jerome gauntlett,jerome p gauntlett,2,8047,97.0,15711,True,True,False
351480,a aghamohammdi,a aghamohammadi,1,3382,97.0,24880,True,False,False
144319,thomas schuecker,thomas schucker,1,5700,97.0,21538,True,False,False
232494,max-planck-institut,max-planck-institute,1,16592,97.0,25364,True,False,False
379594,paul steinhardt,paul j steinhardt,2,6148,97.0,25148,True,False,False
...,...,...,...,...,...,...,...,...,...
393054,pierre bin etruy,pierre binetruy,1,21677,97.0,22058,True,False,False
396927,yoichro matsumura,yoichiro matsumura,1,6469,97.0,18272,True,True,False
419951,a polychronakos,a p polychronakos,2,10959,97.0,23294,True,False,False
46818,daniel freedman,daniel z freedman,2,1306,97.0,21618,True,False,False


In [58]:
cpn[cpn["first_letter_jy"] == True]

Unnamed: 0,candidate_1,candidate_2,edit_dist,index_nr_c1,fuzzy_score,index_nr_c2,first_letter,first_letter_jy,first_letter_ck
124,john w barrett,j audretsch,11,6938,48.0,14304,True,True,False
126,j arafune,j audretsch,7,2603,50.0,14304,True,True,False
127,j a e carrillo,j audretsch,10,3797,40.0,14304,True,True,False
130,j ambjorn nbi,j audretsch,9,7372,33.0,14304,True,True,False
131,j ambj o rn,j audretsch,8,8722,31.5,14304,True,True,False
...,...,...,...,...,...,...,...,...,...
443555,j froehlich,juerg froehlich,4,5764,87.5,14492,True,True,False
443559,yoav lavi,yoav lederer,6,12386,59.5,16615,True,True,False
443654,jae-kwan kim,joe kiskis,7,1346,55.0,4761,True,True,False
443695,yi-yen wu lbnl,yi-yen wu uc berkeley,9,3995,73.5,4484,True,True,False


In [59]:
scores_75 = cpn[(cpn["fuzzy_score"]>=75) & (cpn["first_letter"] == True)]
scores_85 = cpn[(cpn["fuzzy_score"]>=85) & (cpn["first_letter"] == True)]
scores_90 = cpn[(cpn["fuzzy_score"]>=90) & (cpn["first_letter"] == True)]
scores_100 = cpn[(cpn["fuzzy_score"]==100) & (cpn["first_letter"] == True)]

In [60]:
# TRY WITH EDIT DISTANCES
scores_1 = cpn[(cpn["edit_dist"]==1) & (cpn["first_letter"] == True)]
scores_2 = cpn[(cpn["edit_dist"]<=2) & (cpn["first_letter"] == True)]
scores_3 = cpn[(cpn["edit_dist"]<=3) & (cpn["first_letter"] == True)]

### Creating the Network

In [64]:
def flatten_list(_2d_list):
    flat_list = []
    # Iterate through the outer list
    for element in _2d_list:
        if type(element) is list:
            # If the element is of type list, iterate through the sublist
            for item in element:
                flat_list.append(item)
        else:
            flat_list.append(element)
    return flat_list

In [61]:
def co_author_sim(unique_authors_df, scores_df, combined):
    # combine dataframes
    df = pd.DataFrame()
    df["candidate_1"] = scores_df["candidate_1"]
    df["candidate_2"] = scores_df["candidate_2"]
    df["index_nr_c1"] = scores_df["index_nr_c1"]
    df["index_nr_c2"] = scores_df["index_nr_c2"]
    df["edit_dist"] = scores_df["edit_dist"]
    df["fuzzy_score"] = scores_df["fuzzy_score"]
    
    df = df.merge(unique_authors_df[["affiliated_papers","index_nr"]], left_on = "index_nr_c1",right_on = "index_nr")
    df = df.merge(unique_authors_df[["affiliated_papers","index_nr"]], left_on = "index_nr_c2",right_on = "index_nr")
    df.rename(columns={"affiliated_papers_x": "affiliated_papers_c1", "affiliated_papers_y": "affiliated_papers_c2"},inplace=True)
    df = df[["candidate_2","candidate_1","index_nr_c1","index_nr_c2","affiliated_papers_c1","affiliated_papers_c2","edit_dist","fuzzy_score"]]
    
    # find intersecting papers for each candidate pairs
    df.dropna(inplace=True)
    intersect_papers = []
    
    for index, row in df.iterrows():
        intersect_papers.append(list(set(row["affiliated_papers_c1"]) & set(row["affiliated_papers_c2"])))
    
    df["intersect_papers"] = intersect_papers
    
    # calculate intersection ratio
    df['union_papers'] = df[['affiliated_papers_c1', 'affiliated_papers_c2']].values.tolist()
    df["union_papers"] = df["union_papers"].apply(lambda x: list(set(flatten_list(x))))
    paper_ratio = []
    for index, row in df.iterrows():
        try:
            paper_ratio.append(len(row["intersect_papers"])/len(row["union_papers"]))
        except:
            paper_ratio.append(0)
        
    df["paper_ratio"] = paper_ratio
    
    # get co-authors
    co_authors_c1 = []
    co_authors_c2 = []
    
    for index, row in df.iterrows():
        co_authors_c1.append(combined[lambda df: df["paper"].isin(row["affiliated_papers_c1"])]["authors"].tolist())
        co_authors_c2.append(combined[lambda df: df["paper"].isin(row["affiliated_papers_c2"])]["authors"].tolist())
        
    # flatten co_authors and add, get them in universal format first
    df["co_authors_c1"] = co_authors_c1
    df["co_authors_c2"] = co_authors_c2
    df["co_authors_c1"] = df["co_authors_c1"].apply(lambda x: set(flatten_list(x)))
    df["co_authors_c2"] = df["co_authors_c2"].apply(lambda x: set(flatten_list(x)))
    df["co_authors_c1"] = df["co_authors_c1"].apply(lambda x: list(name(elem) for elem in x))
    df["co_authors_c2"] = df["co_authors_c2"].apply(lambda x: list(name(elem) for elem in x))
    
    
    # find intersecting co_authors for each candidate pair
    df.dropna(inplace=True)
    intersect_authors = []
    
    for index, row in df.iterrows():
        intersect_authors.append(list(set(row["co_authors_c1"]) & (set(row["co_authors_c2"]))))
    
    df["intersect_authors"] = intersect_authors
    
    # calculate intersection ratio
    df['union_authors'] = df[['co_authors_c1', 'co_authors_c2']].values.tolist()
    df["union_authors"] = df["union_authors"].apply(lambda x: list(set(flatten_list(x))))
    
    author_ratio = []
    for index, row in df.iterrows():
        try:
            author_ratio.append(len(row["intersect_authors"])/len(row["union_authors"]))
        except:
            author_ratio.append(0)
        
    df["author_ratio"] = author_ratio
    
    # get correlations
    corr_authors = df[["author_ratio","fuzzy_score"]].corr()
    corr_papers = df[["paper_ratio","fuzzy_score"]].corr()
    
    return(df.sort_values(by="author_ratio",ascending=False),corr_authors,corr_papers)

In [62]:
scores_100

Unnamed: 0,candidate_1,candidate_2,edit_dist,index_nr_c1,fuzzy_score,index_nr_c2,first_letter,first_letter_jy,first_letter_ck
8961,h j w mueller--kirsten,h j w mueller-kirsten,1,8454,100.0,15764,True,False,False
11917,juan perez--mercader laeff,juan perez-mercader laeff,1,7898,100.0,18205,True,True,False
36991,m b silva-neto,m b silva neto,1,15161,100.0,19563,True,False,False
66192,dieter lust humboldt university,dieter lust humboldt-university,1,2262,100.0,25405,True,False,False
68509,st-petersburg,st petersburg,1,13456,100.0,22019,True,False,False
...,...,...,...,...,...,...,...,...,...
442121,a gonz alez--ruiz,a gonz alez-ruiz,1,5686,100.0,14130,True,False,False
442279,y h quano,y -h quano,1,9304,100.0,18827,True,True,False
443072,c-m viallet,c -m viallet,1,6367,100.0,8825,True,False,True
443204,j c yera,j -c yera,1,3927,100.0,19078,True,True,False


In [65]:
# Apply checks
scores_100 = co_author_sim(unique_authors_df, scores_100, combined_df)[0]
scores_100 = scores_100[(scores_100["paper_ratio"]!=1)]

scores_90 = co_author_sim(unique_authors_df, scores_90, combined_df)[0]
scores_90 = scores_90[(scores_90["paper_ratio"]!=1)]

scores_85 = co_author_sim(unique_authors_df, scores_85, combined_df)[0]
scores_85 = scores_85[(scores_85["paper_ratio"]!=1)]

scores_75 = co_author_sim(unique_authors_df, scores_75, combined_df)[0]
scores_75 = scores_75[(scores_75["paper_ratio"]!=1)]

In [118]:
# Using FuzzyWuzzy Scores
scores_75_network = nx.Graph()
scores_90_network = nx.Graph()
scores_85_network = nx.Graph()
scores_100_network = nx.Graph()

# scores_75_network.add_nodes_from(list(zip(list(unique_authors_df["index_nr"]),list(unique_authors_df["cleaned"]))))
# scores_90_network.add_nodes_from(list(zip(list(unique_authors_df["index_nr"]),list(unique_authors_df["cleaned"]))))
# scores_85_network.add_nodes_from(list(zip(list(unique_authors_df["index_nr"]),list(unique_authors_df["cleaned"]))))
# scores_100_network.add_nodes_from(list(zip(list(unique_authors_df["index_nr"]),list(unique_authors_df["cleaned"]))))

scores_75_network.add_nodes_from(list(unique_authors_df["index_nr"]))
scores_90_network.add_nodes_from(list(unique_authors_df["index_nr"]))
scores_85_network.add_nodes_from(list(unique_authors_df["index_nr"]))
scores_100_network.add_nodes_from(list(unique_authors_df["index_nr"]))


edges_75 = list(zip(scores_75["index_nr_c1"], scores_75["index_nr_c2"]))
edges_90 = list(zip(scores_90["index_nr_c1"], scores_90["index_nr_c2"]))
edges_85 = list(zip(scores_85["index_nr_c1"], scores_85["index_nr_c2"]))
edges_100 = list(zip(scores_100["index_nr_c1"], scores_100["index_nr_c2"]))


scores_75_network.add_edges_from(edges_75)
scores_90_network.add_edges_from(edges_90)
scores_85_network.add_edges_from(edges_85)
scores_100_network.add_edges_from(edges_100)

In [119]:
# Use clean_authors function on this dataset, should have done that way before defining unique_authors but here we are
combined_df["authors"] = combined_df["authors"].apply(lambda x: clean_authors(x))

In [120]:
print(nx.transitivity(scores_100_network),nx.transitivity(scores_90_network), nx.transitivity(scores_85_network), nx.transitivity(scores_75_network))

1.0 0.5512048192771084 0.5201834862385321 0.5279144121987212


In [121]:
print(scores_100_network.number_of_edges(), scores_90_network.number_of_edges(), scores_85_network.number_of_edges(), scores_75_network.number_of_edges())

66 1188 2324 5582


In [122]:
nx.write_edgelist(scores_100_network,"step1_100.csv", delimiter=',')
nx.write_edgelist(scores_90_network,"step1_90.csv", delimiter=',')
nx.write_edgelist(scores_85_network,"step1_85.csv", delimiter=',')
nx.write_edgelist(scores_75_network,"step1_75.csv", delimiter=',')

In [71]:
print(scores_100_network.number_of_nodes(), scores_90_network.number_of_nodes(), scores_85_network.number_of_nodes(), scores_75_network.number_of_nodes())

14250 14250 14250 14250


### Analysis of co-authors and shared papers

In [72]:
# Function to find pairs that are unclosed!! 

from itertools import combinations

# Define node_in_open_triangle()
def node_in_open_triangle(G, n):
    """
    Checks whether pairs of neighbors of node `n` in graph `G` are in an 'open triangle' relationship with node `n`.
    """
    in_open_triangle = False
    open_triangle = []

    # Iterate over all possible triangle relationship combinations
    for n1, n2 in combinations(G.neighbors(n), 2):

        # Check if n1 and n2 do NOT have an edge between them
        if not G.has_edge(n1, n2):

            in_open_triangle = True
            open_triangle.append((n1,n2))
            
            break

    return in_open_triangle,open_triangle

In [93]:
 def sim_unclosed_triangles(unique_authors_df, unclosed_triangles, combined):
    df = unclosed_triangles
    df = df.merge(unique_authors_df[["affiliated_papers","index_nr","cleaned"]], left_on = "index_nr_c1",right_on = "index_nr")
    df = df.merge(unique_authors_df[["affiliated_papers","index_nr","cleaned"]], left_on = "index_nr_c2",right_on = "index_nr")
    df.rename(columns={"affiliated_papers_x": "affiliated_papers_c1", "affiliated_papers_y": "affiliated_papers_c2", "cleaned_x":"name_c1","cleaned_y":"name_c2"},inplace=True)
    df = df[["index_nr_c1","index_nr_c2","affiliated_papers_c1","affiliated_papers_c2","name_c1","name_c2"]]
    
    # find intersecting papers for each candidate pairs
    df.dropna(inplace=True)
    intersect_papers = []
    
    for index, row in df.iterrows():
        intersect_papers.append(list(set(row["affiliated_papers_c1"]) & set(row["affiliated_papers_c2"])))
    
    df["intersect_papers"] = intersect_papers
    
    # calculate intersection ratio
    df['union_papers'] = df[['affiliated_papers_c1', 'affiliated_papers_c2']].values.tolist()
    df["union_papers"] = df["union_papers"].apply(lambda x: list(set(flatten_list(x))))
    paper_ratio = []
    for index, row in df.iterrows():
        try:
            paper_ratio.append(len(row["intersect_papers"])/len(row["union_papers"]))
        except:
            paper_ratio.append(0)
        
    df["paper_ratio"] = paper_ratio
    
    # get co-authors
    co_authors_c1 = []
    co_authors_c2 = []
    
    for index, row in df.iterrows():
        co_authors_c1.append(combined[lambda df: df["paper"].isin(row["affiliated_papers_c1"])]["authors"].tolist())
        co_authors_c2.append(combined[lambda df: df["paper"].isin(row["affiliated_papers_c2"])]["authors"].tolist())
    
    
    # flatten co_authors and add, get them in universal format first
    df["co_authors_c1"] = co_authors_c1
    df["co_authors_c2"] = co_authors_c2
    df["co_authors_c1"] = df["co_authors_c1"].apply(lambda x: set(flatten_list(x)))
    df["co_authors_c2"] = df["co_authors_c2"].apply(lambda x: set(flatten_list(x)))
    df["co_authors_c1"] = df["co_authors_c1"].apply(lambda x: list(name(elem) for elem in x))
    df["co_authors_c2"] = df["co_authors_c2"].apply(lambda x: list(name(elem) for elem in x))
    
    
    # find intersecting co_authors for each candidate pair
    df.dropna(inplace=True)
    intersect_authors = []
    
    for index, row in df.iterrows():
        intersect_authors.append(list(set(row["co_authors_c1"]) & (set(row["co_authors_c2"]))))
    
    df["intersect_authors"] = intersect_authors
    
    # calculate intersection ratio
    df['union_authors'] = df[['co_authors_c1', 'co_authors_c2']].values.tolist()
    df["union_authors"] = df["union_authors"].apply(lambda x: list(set(flatten_list(x))))
    
    author_ratio = []
    for index, row in df.iterrows():
        try:
            author_ratio.append(len(row["intersect_authors"])/len(row["union_authors"]))
        except:
            author_ratio.append(0)
        
    df["author_ratio"] = author_ratio

    return df

In [75]:
# Compute the number of open triangles in scores_100
num_open_triangles = 0

# Iterate over all the nodes in T
for n in scores_100_network.nodes():

    # Check if the current node is in an open triangle
    if node_in_open_triangle(scores_100_network, n)[0]:

        # Increment num_open_triangles
        num_open_triangles += 1

print(num_open_triangles)

0


In [88]:
# Find all the unclosed triangles
def find_unclosed(scores_network):
    unclosed_triangles = {}

    for n in scores_network.nodes():
        if node_in_open_triangle(scores_network, n)[1]:
            unclosed_triangles[n] = node_in_open_triangle(scores_network, n)[1]
    return unclosed_triangles

def create_unclosed_df(unclosed_triangles):
    unclosed_triangles = pd.DataFrame(unclosed_triangles).T
    unclosed_triangles.rename(columns={0:"pair"},inplace=True)
    unclosed_triangles[['index_nr_c1', 'index_nr_c2']] = pd.DataFrame(unclosed_triangles['pair'].tolist(), index=unclosed_triangles.index)
    unclosed_triangles.reset_index(inplace=True)
    unclosed_triangles.rename(columns={"index":"bridge_node"},inplace=True)
    return unclosed_triangles

In [77]:
# define new "sameas" edges based on paper and author ratio
def get_new_edges(unclosed_triangles_df):
    new = unclosed_triangles_df[(unclosed_triangles_df["paper_ratio"]== 0.0) & (unclosed_triangles_df["author_ratio"]>= 0.20)]
    new_edges = set(list(zip(new["index_nr_c1"], new["index_nr_c2"])))
    return new_edges

In [89]:
%%time
#unclosed_triangles_100 = create_unclosed_df(find_unclosed(scores_100_network))
unclosed_triangles_90 = create_unclosed_df(find_unclosed(scores_90_network))
unclosed_triangles_85 =  create_unclosed_df(find_unclosed(scores_85_network))
unclosed_triangles_75 = create_unclosed_df(find_unclosed(scores_75_network))

CPU times: user 224 ms, sys: 8.26 ms, total: 232 ms
Wall time: 276 ms


In [80]:
#0.9375 0.5111111111111111 0.467674661105318 0.5888904166093771

In [91]:
# %%time
# # for the 100 threshold
# edges = get_new_edges(sim_unclosed_triangles(unique_authors_df, unclosed_triangles_100, combined_df))
# scores_100_network.add_edges_from(edges)
# nx.transitivity(scores_100_network)

In [94]:
%%time
# for the 90 threshold
edges = get_new_edges(sim_unclosed_triangles(unique_authors_df, unclosed_triangles_90, combined_df))
scores_90_network.add_edges_from(edges)
nx.transitivity(scores_90_network)

CPU times: user 668 ms, sys: 43.6 ms, total: 712 ms
Wall time: 804 ms


0.6147540983606558

In [95]:
%%time
# for the 85 threshold
edges = get_new_edges(sim_unclosed_triangles(unique_authors_df, unclosed_triangles_85, combined_df))
scores_85_network.add_edges_from(edges)
nx.transitivity(scores_85_network)

CPU times: user 1.91 s, sys: 29.1 ms, total: 1.94 s
Wall time: 2.01 s


0.6320830007980845

In [96]:
%%time
# for the 75 threshold
edges = get_new_edges(sim_unclosed_triangles(unique_authors_df, unclosed_triangles_75, combined_df))
scores_75_network.add_edges_from(edges)
nx.transitivity(scores_75_network)

CPU times: user 5.72 s, sys: 80 ms, total: 5.8 s
Wall time: 6.15 s


0.5421513370907782

In [97]:
print(scores_100_network.number_of_edges(), scores_90_network.number_of_edges(), scores_85_network.number_of_edges(), scores_75_network.number_of_edges())

66 1201 2385 5639


In [98]:
print(scores_100_network.number_of_nodes(), scores_90_network.number_of_nodes(), scores_85_network.number_of_nodes(), scores_75_network.number_of_nodes())

14250 14250 14250 14250


In [101]:
print(nx.number_connected_components(scores_100_network), nx.number_connected_components(scores_90_network), nx.number_connected_components(scores_85_network),nx.number_connected_components(scores_75_network),)

14190 13117 12109 9723


In [109]:
# create mapping from ID to author name
mapping = unique_authors_df[["cleaned","index_nr"]].to_dict()

In [113]:
mapping = mapping["cleaned"]

In [115]:
scores_100_names_network = nx.relabel_nodes(scores_100_network, mapping, copy=True)
scores_90_names_network = nx.relabel_nodes(scores_90_network, mapping, copy=True)
scores_85_names_network = nx.relabel_nodes(scores_85_network, mapping, copy=True)
scores_75_names_network = nx.relabel_nodes(scores_75_network, mapping, copy=True)

In [117]:
# nx.write_edgelist(scores_100_names_network,"final_100_names.csv", delimiter=',')
# nx.write_edgelist(scores_90_names_network,"final_90_names.csv", delimiter=',')
# nx.write_edgelist(scores_85_names_network,"final_85_names.csv", delimiter=',')
# nx.write_edgelist(scores_75_names_network,"final_75_names.csv", delimiter=',')

In [102]:
# nx.write_edgelist(scores_100_network,"final_100.csv")
# nx.write_edgelist(scores_90_network,"final_90.csv")
# nx.write_edgelist(scores_85_network,"final_85.csv")
# nx.write_edgelist(scores_75_network,"final_75.csv")

### QUESTIONS W1:
- Include co-authorship? How?
- Include year?
- Use NLP?
- Creating all possible pairs is a big list, problem? --> Efficiency?
  - Remove pairs without same first letter (What if someone is stated by last name only?)
- How to solve weird author format/filter out University names e.g.:
  Authors: J.C. da Silva (1,2), F.C. Khanna (3,4), A. Matos Neto (1), and A.E.
  Santana (1,3) ((1) Instituto de Fisica, Universidade Federal da Bahia, Campus
  de Ondina, Salvador, Bahia, Brasil; (2) Centro Federal de Educacao
  Tecnologica da Bahia, Salvador, Bahia, Brasil; (3) Physics Department,
  Theoretical Physics Institute, University of Alberta, Edmonton, Alberta,
  Canada; (4) TRIUMF, Westbrook mall, Vancouver, British Columbia, Canada)
- Found multiple ER methods:
    - Dedupe (requires some user-labeling)
    - Should I use simple string similarity measures? 

### NOTES/TO DO W1:
- Osiris case: Second examiner is Hakim Qahtan, Vahid is day supervisor
- Create networks using authors on the same publications
- Pair-wise comparison: Levensteihn/edit distance, Hard code with initials/first or second name etc --> (Jon Snow and Jay Snow) = Code my own rule using domain knowledge.
- Soundex (if two names sound alike)
- Possibly try extra libraries and compare with Levensteihn scores. Error analysis on libraries and what can we do to improve? 

==> COMPUTE OVERALL SCORE AND SCORE PAIRS

- Network of similarity of the names (check also transitivity of this network). 
- Clustering/community analysis to determine if triangle of 3 authors are the same in similarity network (A is similar to B and B is similar to C, is A = B = C the same? Check if in same cluster or community)
- If they are NOT triangles, but similar based on scores --> further research

Later on:
- Year matters? Someone married and name is changed/affiliation. You expect that co-author similarity is closer in the same year than in two years that are five years apart. Relevance of co-authorship probably fades over time (take into account).
- Include year (compare to earlier publications, how to keep track of these entities over time, map new occurences to earlier resolved entities): only when the data is very clean

### QUESTIONS W2:
- All possible pairs = impossible to work with! --> make more logical pairs: 

### NOTES/TO DO W2:
- Scalability issue
- Make alphabetic groups and compare pair-wise the authors (a.text, b.text, c.text)
- Look at hashing options? 
- Reduce scalability issue: n-gram indexing (Lucene), 2 or 3 gram, index the authors then query the most similar, then compare those.  

### NOTES/TO DO MEETING 3:
- K shingles on authors and SKIP(on paper abstracts) —> First stick to resolving authors
- Naieve function takes too long —> use LSH to compare big amount of documents
- LSH and then compare using other methods
- Decrease more if fine-tuning parameters/ shingles .. (I want to decrease false negatives)
- Keep the indexes to see if duplicates or within the code
- Find false positive but by similarity of abstracts they are dissimilar --> show this 
- NO LONGER USE LUCENE


- If you weaken the assumption about what SameAs means, the more likely you will get unclosed triangles
- Make sure that it is author names/listed author bc. we have not resolved it into a person yet
- Summary: first do the scoring, quantify inconsistencies (how often unclosed triangles)
- Find sweet spot between weakening enough to find misspellings but not so much that you are bringing too much noise
- We are missing a ground truth, no accuracy is possible
- Function of similarity is now edit distance --> try to extend this/use different matters, then decide on a threshold, moving the threshold creates a denser or less dense graph --> count the unclosed triangles (inverse of transitivity). If you increase threshold, transitivy will go down.
- Transitivity approximation measures or just transitivity
- The lower the transitivity the more inconsistencies you have
- Amount of SameAs edges are controlled with similarities
- Explore solutions of entity resolution based on name similarity --> investigate how many inconsistencies we have using only string similarity --> then you know how hard the problem is/scale of the problem --> find further solutions (e.g. similarity between co-authors).
- First we have similarity based on function (e.g. edit distances), the inconsistencies that still remain (transitivity can get better) can be solved with community similarity/similarity of co-author. (Can also be future work)
- Threshold is also determined by hardware limitations (as soon as the graph is too dense to manage = also in bad threshold territory) + ALSO an optimization problem but we are not iterating to optimize --> just pick a few thresholds, optimization problem in future work
- Make graphs with different thresholds

Literature:
- Entity Resolution
    - General
    - In this context
- Local Sensitive Hashing —> with example of names (output)
- Scores

### NOTES/TO DO MEETING 4:
- How many compononents are there in the network? (subgraphs that you can take out without breaking edges with > 2 nodes). Gives an impression of how big the problem is. How big are these components? If the components covers a lot of entities, threshold is not enough. LOOK AT THE EXAMPLES, EXTRACT EXAMPLES, SEARCH FOR CASES
- Mention authors are a set, they are not occurences --> is it a good thing? Try to detect individuals that have the same name = corner case --> limitation because that is not dealt with. Brings an error but is alright. --> If you take more fields instead of just physics this has to be dealt with. 
- Report for each threshold the transitivity and component and show an image. 
- In the ideal situation, the transitivity is 1
- Make a new scoring system for the similarity of co-authors and add edges if they are similar --> goal to increase transitivity
    - For each of the authors add a set of co-authors (all the papers where the author occurs), you then compare the co-authors for the authors you want to compare and decide upon a threshold for that --> add an edge if they are similar. Explain that they are not as strong as the string name. 
    - For each pair of nodes we put the string similarity and author similarity and get then get the correlation to determine the reliability of co-author similarity --> this dataset does not have affiliations therefore this is a way aka no 100% solution possible
    - I am taking the step to find the solution and to determine the scale of the problem
- Clean the names before I put them in a set --> mention that cleaning removes noise and setting assumes that they are the same person without the noise. + USE STRIP()
- describe REALLY well what my measure is doing 
- Check my indexes! 
- Output a lot of things, components and ANALYZE them

### DONE
- Added a check whether first letter is the same, as the chance that someone misspells the first letter seems negligable (a jansen and c jansen are probably not the same person) --> Discuss in limitations, however Yuliya en Julia are common misspellings including the first letter.. --> discuss
- strip in the beginning
- Strings that are the same after cleaning are assumed to be pointing towards the same entity

### NOTES
- Make graphs for pair candidates and compare co-authors