### 1. Author embedding from abstract embedding
We'll consider the author representation to be the average of its paper's representations.

In [1]:
import ast
import numpy as np
from nltk.corpus import stopwords
import pandas as pd

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from collections import defaultdict

In [2]:
# load the paper embeddings
dict_paper_embeddings = {}
id_author = 0
vect_str = ""


with open("../Data/paper_embeddings_64.txt","r") as f_paper:
    
    for line in f_paper:
        if(':' in line):
            id_author, vect = line.split(':',1)
            vect_str = vect_str + vect.replace('\n','')
        else:
            vect_str += line.replace('\n','')
        
        if(']' in line):
            dict_paper_embeddings[id_author] = ast.literal_eval(vect_str.replace(' ',','))
            vect_str = ""

            
# load the list of authors in the training set and the h-index associated
f_train = open("../Data/train.csv","r")
list_lines = f_train.readlines()
dict_h_index = {}

for line in list_lines[1:]:
    line = line.replace("\n","").split(",",1)
    dict_h_index[int(line[0])] = float(line[1])

In [5]:
# print some exemples of paper embeddings
count = 0
for key,value in dict_paper_embeddings.items():
    if(count > 2):
        break
    print('(key, value)=({0},{1})'.format(key,value))
    count += 1

(key, value)=(3603,[1.95502567, 0.88846236, -2.28757286, -2.7875936, -1.93649995, -4.76704264, 4.0885787, -1.5846678, 2.7917676, -1.71815956, -1.72307014, -1.24436772, -2.01424241, 1.24009359, 0.72393638, 1.12063658, 2.70689917, 0.73714691, 1.57630599, -0.08534257, -2.05471015, 3.23041511, 0.47726882, 1.1646086, -2.74360585, 1.02150488, 0.41885796, 1.93923664, -0.77844548, -0.70080948, 1.13125944, -0.91544026, 0.34437126, 0.28214422, -0.99386036, 2.38796687, 1.85526562, 1.32367218, 1.39867067, 0.08051338, 0.68632078, -0.87475097, 2.62502384, -2.13532448, 0.91333014, 2.98717117, -1.29266083, 1.3762778, 0.30916947, 1.07678699, -1.50235331, -2.31246638, 1.69867396, -1.21003318, 0.24496719, -1.94066477, -1.09462011, -2.24982524, -0.98247027, -3.76954675, -0.61786431, -1.90634644, 3.11926484, -1.81494617])
(key, value)=(7383,[0.12814756, 0.63164562, -0.98055059, 2.10271621, 0.1266593, 0.53838509, 4.39952612, -5.95139265, -0.15614305, -0.5671345, -0.22147386, -0.07161764, 0.13969365, -0.8216

In [6]:
f_ = open('../Data/abstracts.txt','r',encoding="utf8") 
list_lines_ = f_.readlines()
print("number of abstracts: ",len(list_lines_))
f_.close()

f_ = open('../Data/abstracts_processed.txt','r',encoding="utf8") 
list_lines_ = f_.readlines()
print("number of preprocessed abstracts: ",len(list_lines_))
f_.close()

print('number of paper embeddings: ',len(dict_paper_embeddings))

number of abstracts:  624181
number of preprocessed abstracts:  624181
number of paper embeddings:  624181


In the *graph_features.ipynb* file, we have seen that there are however 683606 papers in the *author_papers.txt*. So there are papers without abstracts and potentially authors without any abstract. In this case, we'll consider the *null vector* as the embedding of that author. This decision is based on the fact that a null vector would not affect the calculations in our general models.

In [7]:
with open("../Data/author_papers.txt") as f_author_papers:
    for i in range(5):
        print(f_author_papers.readline())

1036332:1510273386-1827736641-1588673897-2252711322-2123653597

1101850:133459021-179719743-2111787673-2126488676-31838995

1336878:2122092249-2132109814-2100271871-2065672539-2036413831

1515524:2141827797-2127085795-2013547785-2138529788-1994863898

1606427:1907724546



In this part, we'll create the list of embeddings and save it later as .csv file using a pandas Data Frame. We also  calculate the average h-index of the paper authors. This measure will be used to analize a possible clustering based on the papers embeddings.

In [None]:
list_authors_id = []
list_embeddings = []
dict_author_embeddings = {}
dict_paper_hindex = {}
dict_list_paper_hindex = {}
list_aux_emb = []
list_aux_hindex = []

list_paper_not_embedding = []
list_authors_null_embeddig = []

dim_author_text_embedding = 64

with open("../Data/author_papers.txt") as f_author_papers:
    for line in f_author_papers:
        author, papers = line.split(':',1)
        papers = papers.replace('\n','').split('-')
        papers = list(map(int,papers))

        list_authors_id.append(author)
        for paper in papers:
            if(str(paper) in dict_paper_embeddings):
                list_aux_emb.append(dict_paper_embeddings[str(paper)])
                dict_list_paper_hindex[paper].append(dict_h_index[int(author)])
            else:
                list_paper_not_embedding.append(paper)
        
        author_emb = np.zeros(dim_author_text_embedding)
        aver_paper_hindex = 0
        
        if(len(list_aux) > 0):
            author_emb = np.array(list_aux_emb).mean(axis=0)
            aver_paper_hindex = np.array(list_aux_hindex).mean(axis=0)
        else:
            list_authors_null_embeddig.append(author)
        
        list_embeddings.append(author_emb)
        dict_author_embeddings[author] = author_emb
        dict_average_hindex[author] = aver_paper_hindex
        
        list_aux_emb = []
        list_aux_hindex = []

In [9]:
print("Two first author's id:\n",list_authors_id[:2])
print("\nEmbeddings of these two authors:\n", list_embeddings[:2])
print("\nFirst two papers without embedding (without an abstract):\n",list_paper_not_embedding[:2])

Two first author's id:
 ['1036332', '1101850']

Embeddings of these two authors:
 [array([-0.34260672, -0.96811582,  0.54728579, -0.07709934, -1.39273606,
        0.81066697,  1.9965124 , -0.76954713,  0.53001419, -2.3792789 ,
       -0.38602469, -1.85247579, -1.27895225, -1.48566678,  0.49702405,
        0.54610739, -0.36804133, -1.88826811, -2.1111477 , -1.22022029,
       -0.56621911,  0.44606569, -0.45144585, -1.2231652 ,  1.52057779,
       -0.64532068, -1.88196129,  1.71291772,  0.99850131,  1.88037358,
       -0.01732703, -1.57421604, -0.15491208,  0.2035263 ,  0.11807496,
        1.45447964, -0.86877872, -0.3217984 , -0.16963881,  1.03490481,
       -0.99414181,  1.98070169,  0.6859818 ,  0.48787495,  0.37763525,
        1.30986829,  0.91033524, -1.5677906 , -1.01534811, -2.30998271,
       -0.7721648 , -0.51481947, -1.10202755,  0.58657267,  0.93586812,
        1.73760075, -0.31252533,  1.25387316, -1.0862413 , -0.28046444,
        0.90778377,  1.34419771, -1.15006275, -1.2913

In [10]:
print("size of list_authors_id: ",len(list_authors_id))
print("size of list_embeddings: ", len(list_embeddings))
print("size of list_paper_not_embedding", len(list_paper_not_embedding))
print("size of list_authors_null_embeddig", len(list_authors_null_embeddig))

size of list_authors_id:  217801
size of list_embeddings:  217801
size of list_paper_not_embedding 75277
size of list_authors_null_embeddig 1880


In [11]:
'''
with open("../Data/author_embeddings_64.txt","w", encoding="utf8") as f_author_embeddings_text:
    for i in range(len(list_authors_id)):
        f_author_embeddings_text.write(str(list_authors_id[i])+":"+str(list_embeddings[i])+"\n")
'''
dim_author_text_embeddings = 64
cols_at_emb = ["at_embedding_"+str(i) for i in range(dim_author_text_embeddings)]

df_author_embeddings = pd.DataFrame()
df_author_embeddings['author_id'] = list_authors_id
df_author_embeddings[cols_at_emb] = np.array(list_embeddings)
df_author_embeddings.to_csv("author_embeddings_64.csv", index=False)

In [12]:
df_author_embeddings[:2]

Unnamed: 0,author_id,at_embedding_0,at_embedding_1,at_embedding_2,at_embedding_3,at_embedding_4,at_embedding_5,at_embedding_6,at_embedding_7,at_embedding_8,...,at_embedding_54,at_embedding_55,at_embedding_56,at_embedding_57,at_embedding_58,at_embedding_59,at_embedding_60,at_embedding_61,at_embedding_62,at_embedding_63
0,1036332,-0.342607,-0.968116,0.547286,-0.077099,-1.392736,0.810667,1.996512,-0.769547,0.530014,...,0.935868,1.737601,-0.312525,1.253873,-1.086241,-0.280464,0.907784,1.344198,-1.150063,-1.291351
1,1101850,0.40739,-0.316797,-1.194943,0.511457,-1.831023,1.071724,2.120155,-3.054826,-0.500682,...,-0.309047,2.726938,-0.941054,-1.271713,-1.979927,0.328079,0.502256,0.339255,0.726187,-0.077225


### 2.1 Bags Of Words (BOW) for the most frequent words
The representation of an abstract as a vector couting the number of occurrences of each word in all the abstracts.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import pandas as pd

def abstract_count_words():
    '''
    Count the number of non_stop words in each of the abstracts
    '''
    file =open("../Data/abstracts_processed.txt","r",encoding="utf8")
    stop_words = set(stopwords.words('english'))

    abstract_text=[]
    for abstract in file:      
        if(abstract=="\n"):
            continue
        paper_id,abstract = abstract.split("----",1)
        abstract_text.append(abstract.replace(","," "))
    file.close()        
    vectorizer = CountVectorizer(stop_words=stop_words, max_features=100)
 
    X = vectorizer.fit_transform(abstract_text)
    count_vect_df = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names())
    #print(count_vect_df)
    count_vect_df.to_json("../Data/bag_of_words.json",orient="records")
    
abstract_count_words()

In [None]:
import json

file_bow = open("../Data/bag_of_words.json","r",encoding="utf8")
data = json.load(file_bow)

In [None]:
print(data[:1])

### 2.2. BOW for all the words

The execution of the Bag Of Words (BOW) algorithm for all the abstracts is too computational expensive. We tried but it never finished.

In [35]:
f = open("../Data/abstracts_processed.txt")
for i in range(2):
    print(f.readline())

3603----in,this,paper,we,describe,a,new,bitmap,indexing,technique,to,cluster,xml,xml,is,a,new,standard,for,exchanging,and,representing,information,on,the,documents,can,be,hierarchically,represented,by,xml,documents,are,represented,and,indexed,using,a,bitmap,indexing,we,define,the,similarity,and,popularity,operations,available,in,bitmap,indexes,and,propose,a,method,for,partitioning,a,xml,document,furthermore,a,bitmap,index,is,extended,to,a,bitmap,index,called,we,define,statistical,measurements,in,the,mean,mode,standard,derivation,and,correlation,based,on,these,measurements,we,also,define,the,slice,project,and,dice,operations,on,a,bitcube,can,be,manipulated,efficiently,and,improves,the,performance,of,document,

7383----the,paper,starts,from,the,observation,that,in,the,approach,to,geometry,there,are,serious,difficulties,in,defining,these,difficulties,disappear,once,we,reformulate,this,approach,in,the,framework,of,continuous,multivalued,so,a,theory,of,is,proposed,as,a,counterpart,of,the,us

In [38]:
def abstract_distinct_words_count(filename="../Data/abstracts_processed.txt"):
    '''
    Count the number of non_stop words in each of the abstracts
    '''
    file =open(filename,"r")
#     fw = open("all_abstracts.txt","w")
    stop_words = set(stopwords.words('english')) 
    
     
  
    unique_abstract_words=[]
    for abstract in file:       
        if(abstract=="\n"):
            continue
        paper_id,abstract = abstract.split("----",1)
        words_in_abstract=abstract.split(",")
        for word in words_in_abstract:
            if word not in stop_words:
                if word not in unique_abstract_words:
                    unique_abstract_words.append(word)
       
    file.close()  
    print(unique_abstract_words)
    return len(unique_abstract_words)

abstract_distinct_words_count(filename="../Data/abstracts_processed_test.txt")

['paper', 'describe', 'new', 'bitmap', 'indexing', 'technique', 'cluster', 'xml', 'standard', 'exchanging', 'representing', 'information', 'documents', 'hierarchically', 'represented', 'indexed', 'using', 'define', 'similarity', 'popularity', 'operations', 'available', 'indexes', 'propose', 'method', 'partitioning', 'document', 'furthermore', 'index', 'extended', 'called', 'statistical', 'measurements', 'mean', 'mode', 'derivation', 'correlation', 'based', 'also', 'slice', 'project', 'dice', 'bitcube', 'manipulated', 'efficiently', 'improves', 'performance', '\n', 'starts', 'observation', 'approach', 'geometry', 'serious', 'difficulties', 'defining', 'disappear', 'reformulate', 'framework', 'continuous', 'multivalued', 'theory', 'proposed', 'counterpart', 'usual', 'second', 'considered', 'graded', 'predicates', 'assumed', 'cases', 'suitable', 'notion', 'abstractive', 'sequence', 'equivalence', 'sequences', 'enables', 'us', 'resulting', 'set', 'points', 'distance', 'defined', 'natural',

109