In [103]:
import pandas as pd
import numpy
import re
import spacy

# Loading the datasets:  combined publications, category definitions, area definitions, individual publications

In [104]:
df1 = pd.read_csv("loadable_profiles_final_all_delta.csv", header=0,
                    delimiter=",",encoding = "ISO-8859-1")                             # dataframe of the combined publications                              
df1 = df1.rename(columns={'Biography':'Description_Title'})



df2 = pd.read_csv("CEC_category_definitions_explode.csv", header=0,
                    delimiter=",",encoding = "ISO-8859-1")                                                     # dataframe of the categories                   

df2 = df2.drop_duplicates()
df2  =df2.reset_index()
df2= df2.reset_index()
df2.rename(columns = {'level_0':'ID'}, inplace = True)
df2 = df2[['ID','Category','def']]


df3 = pd.read_csv("CEC_area_definitions.csv", header=0,
                    delimiter=",",encoding = "ISO-8859-1")                                     # dataframe of the areas           


df3 = df3.reset_index()
df3.rename(columns = {'index':'ID'}, inplace = True)


df4 = pd.read_csv("loadable_profiles_final_all_delta.csv", header=0,
                    delimiter=",",encoding = "ISO-8859-1")                                              
df4 = df4[['ID','Name','Biography']].rename(columns={'Biography':"descriptions"})                                           # dataframe of the individual publications


# View first 5 rows of dataframes

In [105]:
# df1=df1[:][['ID','Name','Description_Title']][~df1['Description_Title'].isna()]
df1[:]

Unnamed: 0.1,Unnamed: 0,ID,Name,Title,Phone Number,Description_Title,Research Interest,Degrees,Profile Link,Email,Personal Link
0,241,0,giacomo francisci,postdoctoral research fellow,,,,,,,
1,242,1,david kepplinger,assistant professor,,david is an assistant professor of statistics...,"robust statistics for high-dimensional data, r...","phd, statistics, university of british columbi...",https://statistics.gmu.edu/node/301,dkepplin@gmu.edu,https://www.dkepplinger.org\n
2,243,2,ilhan izmirli,associate professor\nÂ,703-993-5168,ilhan izmirli grew up surrounded by books and...,,"phd, history of mathematics, american univers...",https://statistics.gmu.edu/profiles/iizmirl2,iizmirl2@gmu.edu,http://mason.gmu.edu/~iizmirl2/\n
3,244,3,jason goldberg,postdoctoral research fellow,,,,,,,
4,245,4,kenneth pasiah,assistant professor,,kenneth pasiahâs research interests include...,,generation of random numbers\napplied statisti...,https://statistics.gmu.edu/node/711,kpasiah@gmu.edu,
...,...,...,...,...,...,...,...,...,...,...,...
93,334,93,stone bond,sr research program manager,,,,,,,
94,335,94,susan schleigh,senior research engineer,,,,,,,
95,336,95,traci johnson,research manager,,,,,,,
96,337,96,joseph kobsar,sr research engineer,,,,,,,


In [106]:
df2[:5]  #29 Nan values in categories
# df2[:][~df2['def'].isna()]

Unnamed: 0,ID,Category,def
0,0,cognitive systems,relating involving cognition instrumentality c...
1,1,intelligent agents,capacity thought reason especially high degree...
2,2,knowledge representation reasoning,psychological result perception learning reaso...
3,3,machine learning,mechanical electrical device transmits modifie...
4,4,natural language processing,someone regarded certain succeed notation canc...


In [107]:
df3[:5]  #3 Nan values in areas

Unnamed: 0,ID,area,def
0,0,artificial intelligence,contrived art rather nature artificially forma...
1,1,autonomous systems,of political bodies existing independent entit...
2,2,computer engineering,machine performing calculations automatically ...
3,3,computer networks communications,machine performing calculations automatically ...
4,4,cybersecurity,


In [108]:
# df4=df4[:][~df4['descriptions'].isna()] #with bio
df4[:5]

Unnamed: 0,ID,Name,descriptions
0,0,giacomo francisci,
1,1,david kepplinger,david is an assistant professor of statistics...
2,2,ilhan izmirli,ilhan izmirli grew up surrounded by books and...
3,3,jason goldberg,
4,4,kenneth pasiah,kenneth pasiahâs research interests include...


In [109]:
nlp = spacy.load('en_core_web_sm')      #loading the english dictionary from spacy package 

# Defining a helper function for stemming and lemmatizing the text 

In [110]:
def stem(text):                                                                        
	x = []
	doc = nlp(str(text))

	for word in doc:
		x.append(word.lemma_)
	return " ".join(x)

In [111]:
df1['Description_Title']= df1['Description_Title'].apply(stem)          # applying helper function to the dataframes

df2['def']= df2['def'].apply(stem)

df3['def']= df3['def'].apply(stem)

df4['descriptions'] = df4['descriptions'].apply(stem)

In [112]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()                                                 # initializing the count vectorizer 

# Vectorizing using combined publications dataframe

In [113]:
fit = cv.fit(df1['Description_Title'])                                                  


author_vectors=fit.transform(df1['Description_Title']).toarray()

category_vectors = fit.transform(df2['def']).toarray()

area_Vectors = fit.transform(df3['def']).toarray()

# Vectorizing using individual publications dataframe


In [114]:
pub_fit = cv.fit(df4['descriptions'])                                                         
pub_author_vectors=pub_fit.transform(df4['descriptions']).toarray()

pub_category_vectors = pub_fit.transform(df2['def']).toarray()

pub_area_Vectors = pub_fit.transform(df3['def']).toarray()


# Helper function to calculate the cosine similarity

In [115]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim_vectors(vec1, vec2):                                                   
    vec1 = vec1.reshape(1,-1)
    vec2 = vec2.reshape(1,-1)

    return cosine_similarity(vec1,vec2)[0][0]

# Looping through the combined authors vectors and calculating cosine similarity with each category and creating a list of scores

In [116]:
cosine_sim_list_cat = []                    
 
for i in range(len(author_vectors)):
    dummy = []
    for j in range(len(category_vectors)):
        similarity = cosine_sim_vectors(author_vectors[i],category_vectors[j])
        dummy.append(similarity)
    cosine_sim_list_cat.append(dummy)
    

# Looping through the combined authors vectors and calculating cosine similarity with each area and creating a list of scores

In [117]:
cosine_sim_list_area = []

for i in range(len(author_vectors)):                                        
    dummy = []
    for j in range(len(area_Vectors)):
        similarity = cosine_sim_vectors(author_vectors[i],area_Vectors[j])
        dummy.append(similarity)
    cosine_sim_list_area.append(dummy)
    

# Looping through the pub_authors vectors and calculating cosine similarity with each category and creating a list of scores

In [118]:
pub_cosine_sim_list_cat = []

for i in range(len(pub_author_vectors)):                       
    dummy = []
    for j in range(len(pub_category_vectors)):
        similarity = cosine_sim_vectors(pub_author_vectors[i],pub_category_vectors[j])
        dummy.append(similarity)
    pub_cosine_sim_list_cat.append(dummy)
    

# Looping through the pub_authors vectors and calculating cosine similarity with each area and creating a list of scores

In [119]:
pub_cosine_sim_list_area = []

for i in range(len(pub_author_vectors)):
    dummy = []
    for j in range(len(pub_area_Vectors)):
        similarity = cosine_sim_vectors(pub_author_vectors[i],pub_area_Vectors[j])
        dummy.append(similarity)
    pub_cosine_sim_list_area.append(dummy)
    


# Looping through the category vectors and calculating cosine similarity with each author and creating a list of scores

In [120]:
person_cosine_sim_list = []

for i in range(len(category_vectors )):                       
    dummy = []
    for j in range(len(author_vectors)):
        similarity = cosine_sim_vectors(category_vectors [i],author_vectors[j])
        dummy.append(similarity)
    person_cosine_sim_list.append(dummy)


# Looping through the category vectors and calculating cosine similarity with each author and creating a list of scores

In [121]:
area_person_cosine_sim_list = []

for i in range(len(area_Vectors)):                       
    dummy = []
    for j in range(len(author_vectors)):
        similarity = cosine_sim_vectors(area_Vectors [i],author_vectors[j])
        dummy.append(similarity)
    area_person_cosine_sim_list.append(dummy)

# Looping through the cosine sim list and taking the top 10 category names and scores

In [122]:
top10_cat = []                                   
score_cat = []
for i in range(0,len(cosine_sim_list_cat)):                             
    top10 = sorted(list(enumerate(cosine_sim_list_cat[i])),reverse = True, key = lambda x:x[1])[0:10]
    dummy = []
    score_dummy = []
    for j in top10:
        a = j[0]
        b = j[1]
        category_name = df2['Category'][a]
        dummy.append(category_name)
        score_dummy.append(b)
    top10_cat.append(dummy)
    score_cat.append(score_dummy)

# Looping through the cosine sim list and taking the top 10 area names and scores

In [123]:
top10_area = []            
score_area = []
for i in range(0,len(cosine_sim_list_area)):                                  
    top10 = sorted(list(enumerate(cosine_sim_list_area[i])),reverse = True, key = lambda x:x[1])[0:10]
    dummy = []
    score_dummy = []
    for j in top10:
        a = j[0]
        b = j[1]
        area_name = df3['area'][a]
        dummy.append(area_name)
        score_dummy.append(b)
    top10_area.append(dummy)
    score_area.append(score_dummy)

# Looping through the cosine sim list and taking the top 10 category names for each individual publication

In [124]:
pub_top10_cat = []
pub_score_cat = []
for i in range(0,len(pub_cosine_sim_list_cat)): 
    top10 = sorted(list(enumerate(pub_cosine_sim_list_cat[i])),reverse = True, key = lambda x:x[1])[0:10]
    dummy = []
    score_dummy = []
    for j in top10:
        a = j[0]
        b = j[1]
        category_name = df2['Category'][a]
        dummy.append(category_name)
        score_dummy.append(b)
    pub_top10_cat.append(dummy)
    pub_score_cat.append(score_dummy)

# Looping through the cosine sim list and taking the top 10 area names for each individual publication

In [125]:
pub_top10_area = []
pub_score_area = []
for i in range(0,len(pub_cosine_sim_list_area)):
    top10 = sorted(list(enumerate(pub_cosine_sim_list_area[i])),reverse = True, key = lambda x:x[1])[0:10]
    dummy = []
    score_dummy = []
    for j in top10:
        a = j[0]
        b = j[1]
        area_name = df3['area'][a]
        dummy.append(area_name)
        score_dummy.append(b)
    pub_top10_area.append(dummy)
    pub_score_area.append(score_dummy)

# Looping through the cosine sim list and taking the top 10 author names for each category

In [126]:
pub_top10_person = []
pub_score_person = []
for i in range(0,len(person_cosine_sim_list)):
    top10 = sorted(list(enumerate(person_cosine_sim_list[i])),reverse = True, key = lambda x:x[1])[0:10]
    dummy = []
    score_dummy = []
    for j in top10:
        a = j[0]
        b = j[1]
        name = df1['Name'][a]
        dummy.append(name)
        score_dummy.append(b)
    pub_top10_person.append(dummy)
    pub_score_person.append(score_dummy)

# Looping through the cosine sim list and taking the top 10 author names for each area

In [127]:
top10_person_area = []
score_person_area = []
for i in range(0,len(area_person_cosine_sim_list)):
    top10 = sorted(list(enumerate(area_person_cosine_sim_list[i])),reverse = True, key = lambda x:x[1])[0:10]
    dummy = []
    score_dummy = []
    for j in top10:
        a = j[0]
        b = j[1]
        name = df1['Name'][a]
        dummy.append(name)
        score_dummy.append(b)
    top10_person_area.append(dummy)
    score_person_area.append(score_dummy)

In [128]:
df1['similar_categories'] = top10_cat                 #adding the list to the dataframe
df1['category_score'] = score_cat
df1['similar_area'] = top10_area
df1['area_score'] = score_area

df4['similar_categories'] = pub_top10_cat
df4['category_score'] = pub_score_cat
df4['similar_area'] = pub_top10_area
df4['area_score'] = pub_score_area

df2['top10'] = pub_top10_person
df2['top10_scores']= pub_score_person

df3['top10'] = top10_person_area
df3['top10_scores']= score_person_area

In [129]:
cat_df= df1[['Name','Description_Title','similar_categories','category_score']]       #creating  new dataframes with the required columns
area_df = df1[['Name','Description_Title','similar_area','area_score']]
pub_cat_df= df4[['Name','descriptions','similar_categories','category_score']]
pub_area_df = df4[['Name','descriptions','similar_area','area_score']]

top10_authors_cat = df2[['Category','top10','top10_scores']]
top10_authors_area = df3[['area','top10','top10_scores']]

# View first 5 rows of the output dataframes

In [130]:
cat_df[:5]

Unnamed: 0,Name,Description_Title,similar_categories,category_score
0,giacomo francisci,,"[5g, virtualization, geosynthetics, electrocat...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,david kepplinger,david be an assistant professor of statistic...,"[digital value chain, puerto rico, native amer...","[0.6413945636668352, 0.6144660227796854, 0.548..."
2,ilhan izmirli,ilhan izmirli grow up surround by book and a...,"[puerto rico, native american, infrastructure,...","[0.6190585860273095, 0.5526818861547884, 0.545..."
3,jason goldberg,,"[5g, virtualization, geosynthetics, electrocat...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,kenneth pasiah,kenneth pasiahâs research interest include...,"[puerto rico, infrastructure, digital value ch...","[0.5343528749093224, 0.483698258770496, 0.4702..."


In [131]:
area_df[:5]

Unnamed: 0,Name,Description_Title,similar_area,area_score
0,giacomo francisci,,"[cybersecurity, biomaterials nanomedicine, neu...","[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,david kepplinger,david be an assistant professor of statistic...,"[autonomous systems, biomedical imaging device...","[0.1658902332014283, 0.16067560208171103, 0.15..."
2,ilhan izmirli,ilhan izmirli grow up surround by book and a...,"[autonomous systems, biomedical imaging device...","[0.18279855642941356, 0.15175922164473118, 0.1..."
3,jason goldberg,,"[cybersecurity, biomaterials nanomedicine, neu...","[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,kenneth pasiah,kenneth pasiahâs research interest include...,"[data analytics, applied probability statistic...","[0.20151708874395136, 0.17185435071243021, 0.1..."


In [132]:
pub_cat_df[:5]

Unnamed: 0,Name,descriptions,similar_categories,category_score
0,giacomo francisci,,"[5g, virtualization, geosynthetics, electrocat...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,david kepplinger,david be an assistant professor of statistic...,"[digital value chain, puerto rico, native amer...","[0.6413945636668352, 0.6144660227796854, 0.548..."
2,ilhan izmirli,ilhan izmirli grow up surround by book and a...,"[puerto rico, native american, infrastructure,...","[0.6190585860273095, 0.5526818861547884, 0.545..."
3,jason goldberg,,"[5g, virtualization, geosynthetics, electrocat...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,kenneth pasiah,kenneth pasiahâs research interest include...,"[puerto rico, infrastructure, digital value ch...","[0.5343528749093224, 0.483698258770496, 0.4702..."


In [133]:
pub_area_df[:5]

Unnamed: 0,Name,descriptions,similar_area,area_score
0,giacomo francisci,,"[cybersecurity, biomaterials nanomedicine, neu...","[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,david kepplinger,david be an assistant professor of statistic...,"[autonomous systems, biomedical imaging device...","[0.1658902332014283, 0.16067560208171103, 0.15..."
2,ilhan izmirli,ilhan izmirli grow up surround by book and a...,"[autonomous systems, biomedical imaging device...","[0.18279855642941356, 0.15175922164473118, 0.1..."
3,jason goldberg,,"[cybersecurity, biomaterials nanomedicine, neu...","[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,kenneth pasiah,kenneth pasiahâs research interest include...,"[data analytics, applied probability statistic...","[0.20151708874395136, 0.17185435071243021, 0.1..."


In [134]:
top10_authors_cat[:5]

Unnamed: 0,Category,top10,top10_scores
0,cognitive systems,"[md tanvir arafin, david lattanzi, pilgyu kang...","[0.09643959372630745, 0.07082408745875918, 0.0..."
1,intelligent agents,"[gregory stein, sean luke, wenying ji, paulo c...","[0.048837737566270426, 0.04330127018922194, 0...."
2,knowledge representation reasoning,"[paulo costa, filipe veiga, david lattanzi, ke...","[0.06539514552561412, 0.050599141255528085, 0...."
3,machine learning,"[ilhan izmirli, zoran duric, jyh ming lien, ke...","[0.10322383575366637, 0.09513029883089882, 0.0..."
4,natural language processing,"[david binning, david lattanzi, shima mohebbi,...","[0.19097885354754401, 0.18028869304837586, 0.1..."


In [135]:
top10_authors_area[:5]

Unnamed: 0,area,top10,top10_scores
0,artificial intelligence,"[nora mcdonald, omoche cheche agada, sai manoj...","[0.12388592261650216, 0.11899282346174592, 0.1..."
1,autonomous systems,"[david lattanzi, zoran duric, david binning, i...","[0.24080189735978125, 0.1853123291652753, 0.18..."
2,computer engineering,"[pilgyu kang, md tanvir arafin, zhi tian, omoc...","[0.07436365992237534, 0.07332355751067665, 0.0..."
3,computer networks communications,"[shima mohebbi, peggy brouse, pilgyu kang, zhi...","[0.12015693891575661, 0.10596258856520352, 0.1..."
4,cybersecurity,"[giacomo francisci, jason goldberg, saad karam...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


# Converting dataframes to csv

In [137]:
cat_df.to_csv("C:/Users/Simon/Desktop/DAEN 690/Sprint4/NLP cosine simirality/output/cosine_similarity_category.csv")
area_df.to_csv("C:/Users/Simon/Desktop/DAEN 690/Sprint4/NLP cosine simirality/output/cosine_similarity_area.csv")
pub_cat_df.to_csv("C:/Users/Simon/Desktop/DAEN 690/Sprint4/NLP cosine simirality/output/pub_cosine_similarity_category.csv")
pub_area_df.to_csv("C:/Users/Simon/Desktop/DAEN 690/Sprint4/NLP cosine simirality/output/pub_cosine_similarity_area.csv")
top10_authors_cat.to_csv("C:/Users/Simon/Desktop/DAEN 690/Sprint4/NLP cosine simirality/output/top10_authors_cat.csv")
top10_authors_area.to_csv("C:/Users/Simon/Desktop/DAEN 690/Sprint4/NLP cosine simirality/output/top10_authors_area.csv")