# Part1 - Prepare Dataset

## 1. Import libraries

In [1]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import tqdm
from sklearn import metrics
import re
import ast
from sknetwork.utils import edgelist2adjacency, edgelist2biadjacency
from sknetwork.data import convert_edge_list
from sknetwork.clustering import Louvain
from sknetwork.ranking import PageRank, Katz,Betweenness,Closeness,Harmonic,HITS
from collections import Counter

## 2. Load the Datasets

In [None]:
df_train = pd.read_csv('train.csv', dtype={'author': np.int64, 'hindex': np.float32})
df_test = pd.read_csv('test.csv', dtype={'author': np.int64})
G = nx.read_edgelist('coauthorship.edgelist', delimiter=' ', nodetype=int)

df_train.shape, df_test.shape

((174241, 2), (43560, 3))

In [None]:
lines = []
with open('author_papers.txt') as f:
    lines = f.readlines()
    
# Clean up the format of the file and save the information as the dictionary
Author_papers = dict()
for l in lines:
    s = l[:-1].split(':')
    author = s[0]
    papers = s[1].split("-")
    Author_papers[author] = papers

# Print the first 4 elements of the dictionary
count = 0   
for key, value in Author_papers.items():
    count +=1
    print(key, ' : ', value)
    if count > 3:
        break
print(len(Author_papers))

1036332  :  ['1510273386', '1827736641', '1588673897', '2252711322', '2123653597']
1101850  :  ['133459021', '179719743', '2111787673', '2126488676', '31838995']
1336878  :  ['2122092249', '2132109814', '2100271871', '2065672539', '2036413831']
1515524  :  ['2141827797', '2127085795', '2013547785', '2138529788', '1994863898']
217801


In [None]:
abstract_lines = []
with open('abstracts.txt') as f:
    abstract_lines = f.readlines()
    
# Clean up the format of the file and save the information as the dictionary
count = 0
abstracts_dic = dict()
for line in abstract_lines:
    index, text = line.split("----")[0], "".join(line.split("----")[1:])
    dic = ast.literal_eval(text)
    abstract = ["" for i in range(dic["IndexLength"])]
    inv_indx=  dic["InvertedIndex"]
    for word in inv_indx:
      for idx in inv_indx[word]:
          abstract[idx]=word
    abstracts_dic[index] = " ".join(abstract)
    count+=1
    if count%100000 ==0:
      print(count)
print(len(abstracts_dic))

100000
200000
300000
400000
500000
600000
624181


In [None]:
# Print the first 4 elements of the dictionary
count = 0
for i, v in abstracts_dic.items():
  count+=1
  print(i)
  print(v)
  if count>3:
    break

3603
In this paper, we describe a new bitmap indexing technique to cluster XML documents. XML is a new standard for exchanging and representing information on the Internet. Documents can be hierarchically represented by XML-elements. XML documents are represented and indexed using a bitmap indexing technique. We define the similarity and popularity operations available in bitmap indexes and propose a method for partitioning a XML document set. Furthermore, a 2-dimensional bitmap index is extended to a 3dimensional bitmap index, called BitCube. We define statistical measurements in the BitCube: mean, mode, standard derivation, and correlation coefficient. Based on these measurements, we also define the slice, project, and dice operations on a BitCube. BitCube can be manipulated efficiently and improves the performance of document retrieval.
7383
The paper starts from the observation that in the inclusion-based approach to point-free geometry there are serious difficulties in defining po

## 3. Data Engineering

### 3.1 Author neughbours h-indexes

In [None]:
all_df = pd.concat([df_train, df_test[['author','hindex']]], ignore_index=True)
all_df = all_df.fillna(0)

all_df_dict = pd.Series(all_df.hindex.values,index=all_df.author).to_dict()

# Look at the neighbours of each node and save the information about their hindexes
# The minimum, maximum, mean, and std hindexes based on the neighbours
node_neighbour_h_index = dict()
for node in list(G.nodes()):
  h_index_list = []
  for neighbour in G.neighbors(node):
    h_index = all_df_dict[neighbour]
    if h_index !=0:
      h_index_list.append(h_index)  
  if h_index_list:
    node_neighbour_h_index[node] = [min(h_index_list) , max(h_index_list), round(np.mean(h_index_list),2), round(np.std(h_index_list),2) ]
  else:
    node_neighbour_h_index[node] = [0,0,0,0]

sf_node_neighbour_h_index  = pd.DataFrame(node_neighbour_h_index.items(),columns=["author","stats"])
sf_node_neighbour_h_index['min_neigh_index']=sf_node_neighbour_h_index['stats'].apply(lambda x: x[0])
sf_node_neighbour_h_index['max_neigh_index']=sf_node_neighbour_h_index['stats'].apply(lambda x: x[1])
sf_node_neighbour_h_index['mean_neigh_index']=sf_node_neighbour_h_index['stats'].apply(lambda x: x[2])
sf_node_neighbour_h_index['std_neigh_index']=sf_node_neighbour_h_index['stats'].apply(lambda x: x[3])
sf_node_neighbour_h_index = sf_node_neighbour_h_index.drop(columns=['stats'])
sf_node_neighbour_h_index.head()

Unnamed: 0,author,min_neigh_index,max_neigh_index,mean_neigh_index,std_neigh_index
0,2002218453,1.0,13.0,4.92,4.27
1,1999212242,1.0,13.0,5.4,5.43
2,2032640503,1.0,13.0,6.12,4.4
3,2475931411,1.0,13.0,6.5,5.55
4,2477743428,1.0,13.0,6.5,5.55


In [None]:
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges() 
print('Number of nodes:', n_nodes)
print('Number of edges:', n_edges)

Number of nodes: 217801
Number of edges: 1718164


### 3.2. Author cluster size

In [None]:
# clustering

df = pd.read_csv('coauthorship.edgelist', delimiter=' ',names=['character_1', 'character_2'])
df.head()

Unnamed: 0,character_1,character_2
0,2002218453,1999212242
1,2002218453,2032640503
2,2002218453,2475931411
3,2002218453,2477743428
4,2002218453,2504846374


In [None]:
df = df.astype(str)
edge_list = list(df.itertuples(index=False))
graph = convert_edge_list(edge_list)

In [None]:
# Use Louvain method to find clusters in our graph
louvain = Louvain()
clusters = louvain.fit_transform(graph.adjacency)

In [None]:
# For each cluster calculate their sizes
cluster_size = Counter(clusters)
cluster_size

Counter({0: 27667,
         1: 19073,
         2: 18448,
         3: 13691,
         4: 11929,
         5: 10395,
         6: 9769,
         7: 7339,
         8: 4557,
         9: 4264,
         10: 4223,
         11: 3273,
         12: 2790,
         13: 2779,
         14: 2679,
         15: 2661,
         16: 2452,
         17: 2365,
         18: 2120,
         19: 2006,
         20: 1835,
         21: 1775,
         22: 1673,
         23: 1590,
         24: 1431,
         25: 1417,
         26: 1380,
         27: 1331,
         28: 1169,
         29: 1146,
         30: 1138,
         31: 1108,
         32: 1106,
         33: 1078,
         34: 1031,
         35: 1006,
         36: 976,
         37: 911,
         38: 898,
         39: 896,
         40: 892,
         41: 827,
         42: 778,
         43: 751,
         44: 747,
         45: 742,
         46: 736,
         47: 721,
         48: 639,
         49: 634,
         50: 633,
         51: 600,
         52: 591,
         53: 5

In [None]:
# Save the found data in the list
autrhor_cluser_num = list(zip(graph.names, clusters))
autrhor_cluser_num_size= [(x[0],x[1],cluster_size[x[1]] ) for x in autrhor_cluser_num]
autrhor_cluser_num_size[0:10]

[('100004310', 1, 19073),
 ('100004360', 4, 11929),
 ('1000062049', 15, 2661),
 ('1000070095', 0, 27667),
 ('100017046', 43, 751),
 ('100018398', 11, 3273),
 ('100025731', 1, 19073),
 ('100026995', 3, 13691),
 ('100029472', 7, 7339),
 ('100043250', 0, 27667)]

In [None]:
# Save the found data in the dataframe
authhor_cluster  = pd.DataFrame(autrhor_cluser_num_size,columns=['author','cluster_num','cluster_size'])
authhor_cluster.head()

Unnamed: 0,author,cluster_num,cluster_size
0,100004310,1,19073
1,100004360,4,11929
2,1000062049,15,2661
3,1000070095,0,27667
4,100017046,43,751


In [None]:
authhor_cluster.shape

(217801, 3)

In [None]:
sf_node_neighbour_h_index.head()

Unnamed: 0,author,min_neigh_index,max_neigh_index,mean_neigh_index,std_neigh_index
0,2002218453,1.0,13.0,4.92,4.27
1,1999212242,1.0,13.0,5.4,5.43
2,2032640503,1.0,13.0,6.12,4.4
3,2475931411,1.0,13.0,6.5,5.55
4,2477743428,1.0,13.0,6.5,5.55


In [None]:
authhor_cluster['author'] = authhor_cluster['author'].astype(int)

#### Perform a merge

In [None]:
merged_1 = pd.merge(authhor_cluster,sf_node_neighbour_h_index,how="left",on=['author'])
merged_1.head()

Unnamed: 0,author,cluster_num,cluster_size,min_neigh_index,max_neigh_index,mean_neigh_index,std_neigh_index
0,100004310,1,19073,1.0,19.0,10.0,9.0
1,100004360,4,11929,4.0,19.0,9.67,5.06
2,1000062049,15,2661,7.0,37.0,22.0,15.0
3,1000070095,0,27667,12.0,12.0,12.0,0.0
4,100017046,43,751,1.0,83.0,47.45,17.42


### 3.3 Author PageRank, Katz, HITS

#### Pagerank

In [None]:
pagerank = PageRank()
seeds = {0: 1}
PageRank_scores = np.round(pagerank.fit_transform(graph.adjacency, seeds),2)
print(PageRank_scores)
len(PageRank_scores)

[0.23 0.   0.   ... 0.   0.   0.  ]


217801

#### Katz

In [None]:
katz = Katz()
Katz_scores = np.round(katz.fit_transform(graph.adjacency),2)
print(Katz_scores)
len(Katz_scores)

[  70.5  3770.     38.19 ...  167.56   73.81    5.56]


217801

#### HITS

In [None]:
hits = HITS()
HITS_scores = np.round(hits.fit_transform(graph.adjacency),2)
print(HITS_scores)
len(HITS_scores)

[0. 0. 0. ... 0. 0. 0.]


217801

#### Merge everything

In [None]:
df2 = pd.DataFrame(list(zip(graph.names, PageRank_scores, Katz_scores, HITS_scores)),columns=['author','pagerank','katz','hits'])
df2.head()

Unnamed: 0,author,pagerank,katz,hits
0,100004310,0.23,70.5,0.0
1,100004360,0.0,3770.0,0.0
2,1000062049,0.0,38.19,0.0
3,1000070095,0.0,32.0,0.0
4,100017046,0.0,12034330000.0,0.0


In [None]:
df2['author'] =df2['author'].astype(int) 

In [None]:
merged_2 = pd.merge(merged_1,df2,how="left",on=['author'])
merged_2.head()

Unnamed: 0,author,cluster_num,cluster_size,min_neigh_index,max_neigh_index,mean_neigh_index,std_neigh_index,pagerank,katz,hits
0,100004310,1,19073,1.0,19.0,10.0,9.0,0.23,70.5,0.0
1,100004360,4,11929,4.0,19.0,9.67,5.06,0.0,3770.0,0.0
2,1000062049,15,2661,7.0,37.0,22.0,15.0,0.0,38.19,0.0
3,1000070095,0,27667,12.0,12.0,12.0,0.0,0.0,32.0,0.0
4,100017046,43,751,1.0,83.0,47.45,17.42,0.0,12034330000.0,0.0


### 3.4 Author: core number, degree, clustering coefficient, betweenness centrality

In [None]:
core_number = nx.core_number(G)
avg_neighbor_degree = nx.average_neighbor_degree(G)
clustering_coef = nx.clustering(G)
degree_centrality = nx.degree_centrality(G)
betweeness_centrality = nx.betweenness_centrality(G, k=50)

In [None]:
Author_papers_num_paper = {int(key): len(value) for key, value in Author_papers.items()}

#### Merge everything

In [None]:
merged_2['degree'] = merged_2['author'].apply(lambda x: G.degree(x))
merged_2['core_number'] = merged_2['author'].apply(lambda x: core_number[x])
merged_2['author_total_papers'] = merged_2['author'].apply(lambda x: Author_papers_num_paper[x])
merged_2['average_neigbour_degree'] = merged_2['author'].apply(lambda x: avg_neighbor_degree[x] )
merged_2['cluster_of_node'] = merged_2['author'].apply(lambda x: clustering_coef[x])
merged_2['degree_centrality'] = merged_2['author'].apply(lambda x: degree_centrality[x])
merged_2['approx_betweeness_centrality'] = merged_2['author'].apply(lambda x: betweeness_centrality[x])

In [None]:
merged_2.head()

Unnamed: 0,author,cluster_num,cluster_size,min_neigh_index,max_neigh_index,mean_neigh_index,std_neigh_index,pagerank,katz,hits,degree,core_number,author_total_papers,average_neigbour_degree,cluster_of_node,degree_centrality,approx_betweeness_centrality
0,100004310,1,19073,1.0,19.0,10.0,9.0,0.23,70.5,0.0,3,3,2,5.333333,1.0,1.4e-05,0.0
1,100004360,4,11929,4.0,19.0,9.67,5.06,0.0,3770.0,0.0,8,6,5,25.375,0.714286,3.7e-05,3.227828e-07
2,1000062049,15,2661,7.0,37.0,22.0,15.0,0.0,38.19,0.0,2,2,5,5.0,1.0,9e-06,0.0
3,1000070095,0,27667,12.0,12.0,12.0,0.0,0.0,32.0,0.0,1,1,5,7.0,0.0,5e-06,0.0
4,100017046,43,751,1.0,83.0,47.45,17.42,0.0,12034330000.0,0.0,703,569,5,651.583215,0.887835,0.003228,9.935685e-08


### 3.4 Textual

In [None]:
# Create a dictionary: key: author; value: abstract text
count = 0 
author_all_text = dict()
for author, papers_list in Author_papers.items():
  text=""
  if  papers_list:
    for paper in  papers_list:
      try:
        text_i = abstracts_dic[paper]
      except KeyError:
        continue
      text +=" " + text_i
    author_all_text[author]=text
  else:
    author_all_text[author]=" "

In [None]:
author_text_df = pd.DataFrame.from_dict(author_all_text.items())
author_text_df.columns = ["author","all_concatined_abstract"]
author_text_df.head()

Unnamed: 0,author,all_concatined_abstract
0,1036332,An underground utility conveyance (10) may be...
1,1101850,"In recent years, following the rapid developm..."
2,1336878,Probabilistic finite-state machines are used ...
3,1515524,Background: Three different techniques of ant...
4,1606427,A method and system automatically creates and...


In [None]:
def clean_text(df, col_tex, cleaning=False):
  """
  Returns dataframe with cleaned text column by performing some text preprocessing techniques.
  """
  data_frame = df.copy()
  data_frame['text_cleaned'] = data_frame[col_tex]
  data_frame['text_cleaned'] = data_frame['text_cleaned'].str.lower() # lowercase all the characters
  data_frame['text_cleaned'] = data_frame['text_cleaned'].str.replace(r"http\S+", "") # remove links
  data_frame['text_cleaned'] = data_frame['text_cleaned'].str.encode('ascii', 'ignore').str.decode('ascii') #remove non-ascii
  data_frame['text_cleaned'] = data_frame['text_cleaned'].str.replace(r"[@#$-_]", "") # remove extra characters
  data_frame['text_cleaned'] = data_frame['text_cleaned'].apply(lambda x: re.sub(r' +', ' ', x)) # replacement

  if cleaning==True:
    data_frame['text_cleaned'] = data_frame['text_cleaned'].apply(lambda x: re.sub(r'\d+', '@', x)) # replacement
    data_frame['text_cleaned'] = data_frame['text_cleaned'].apply(lambda x: re.sub(r'\n', '', x)) # replacement
    
    col_tokenced = col_tex+"_tokens"
    data_frame[col_tokenced] = data_frame['text_cleaned'].apply(word_tokenize)
    data_frame["text_cleaned"] = data_frame[col_tokenced].str.join(" ")
    
  return data_frame

In [None]:
author_text_df = clean_text(author_text_df, "all_concatined_abstract")
author_text_df.head()

Unnamed: 0,author,all_concatined_abstract,text_cleaned
0,1036332,An underground utility conveyance (10) may be...,an underground utility conveyance may be prec...
1,1101850,"In recent years, following the rapid developm...",in recent years following the rapid developme...
2,1336878,Probabilistic finite-state machines are used ...,probabilistic finitestate machines are used t...
3,1515524,Background: Three different techniques of ant...,background three different techniques of ante...
4,1606427,A method and system automatically creates and...,a method and system automatically creates and...


In [None]:
for row in author_text_df.sample(2).itertuples():
  print("cleaned:", row[-1])
  print("original:", row[-2])
  print("**************************************************")

cleaned:  with the rise of web applications most people started consuming information and sharing opinions and ideas about most aspects of their lives on a variety of social media platforms creating massive and continuous streams of valuable data while this opened the door for information extraction and mining techniques that can help us understand different aspects of society extracting useful information from such streams of web data is far from trivial in this setting sentiment analysis techniques can be convenient as they are capable of summarizing general feeling about entities people care about such as products and companies therefore they can be quite applicable in scenarios like the stock market which also has tremendous impact on society this paper describes and evaluates two different techniques for sentiment analysis applied to the brazilian stock market data lexiconbased and machine learning based considering a wide range of text preprocessing and feature selection approach

#### Merge

In [None]:
author_text_df["author"] = author_text_df["author"] .astype(int)

final_all_df = pd.merge(merged_2,author_text_df,how="left",on=['author'])

In [None]:
final_all_df = final_all_df.drop(columns=['all_concatined_abstract'])
final_all_df.shape

(217801, 18)

## 4. Concatination

In [None]:
final_all_df.head(2)

Unnamed: 0,author,cluster_num,cluster_size,min_neigh_index,max_neigh_index,mean_neigh_index,std_neigh_index,pagerank,katz,hits,degree,core_number,author_total_papers,average_neigbour_degree,cluster_of_node,degree_centrality,approx_betweeness_centrality,text_cleaned
0,100004310,1,19073,1.0,19.0,10.0,9.0,0.23,70.5,0.0,3,3,2,5.333333,1.0,1.4e-05,0.0,developing plc software for modern machine to...
1,100004360,4,11929,4.0,19.0,9.67,5.06,0.0,3770.0,0.0,8,6,5,25.375,0.714286,3.7e-05,3.227828e-07,heterogeneous multiprocessor systemsonchip mp...


In [None]:
df_train.head(2)

Unnamed: 0,author,hindex
0,1964267543,4.0
1,2153592714,13.0


In [None]:
df_train_all = pd.merge(df_train, final_all_df,on=["author"],how="left")
df_test_all = pd.merge(df_test, final_all_df,on=["author"],how="left")
df_test_all = df_test_all.drop(columns=['Unnamed: 0'])
df_train_all.shape, df_test_all.shape

((174241, 19), (43560, 19))

## 5. Save as a csv

In [None]:
df_train_all.describe()

Unnamed: 0,author,hindex,cluster_num,cluster_size,min_neigh_index,max_neigh_index,mean_neigh_index,std_neigh_index,pagerank,katz,hits,degree,core_number,author_total_papers,average_neigbour_degree,cluster_of_node,approx_betweeness_centrality
count,174241.0,174241.0,174241.0,174241.0,174241.0,174241.0,174241.0,174241.0,174241.0,174241.0,174241.0,174241.0,174241.0,174241.0,174241.0,174241.0,174241.0
mean,2001807000.0,10.087608,22.883994,10172.009602,7.117739,29.077806,15.597002,7.553932,4e-06,114747600.0,0.000136,15.871345,12.273856,4.435764,26.493975,0.649452,3.3e-05
std,635005100.0,12.586828,39.235363,9306.839942,9.655978,24.540078,11.987866,7.600696,0.000516,1233072000.0,0.002332,68.7507,59.522235,1.192731,72.698752,0.387876,0.000431
min,1515524.0,1.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,1.75,0.0,1.0,1.0,1.0,1.111111,0.0,0.0
25%,2021314000.0,3.0,2.0,1417.0,2.0,12.0,7.5,1.0,0.0,122.19,0.0,2.0,2.0,5.0,6.75,0.311111,0.0
50%,2134751000.0,6.0,6.0,9769.0,4.0,22.0,12.67,5.72,0.0,585.25,0.0,4.0,3.0,5.0,11.5,0.818182,0.0
75%,2288318000.0,12.0,25.0,18448.0,9.0,39.0,20.33,11.5,0.0,3286.88,0.0,8.0,5.0,5.0,20.307692,1.0,5e-06
max,2908499000.0,187.0,293.0,27667.0,164.0,187.0,164.0,90.5,0.15,29839440000.0,0.04,1483.0,724.0,5.0,970.0,1.0,0.02996


In [None]:
df_train_all = df_train_all.drop(columns=['degree_centrality'])
df_test_all = df_test_all.drop(columns=['degree_centrality'])

In [None]:
df_train_all.to_csv("train_data_cleaned.csv", index=False)
df_test_all.to_csv("test_data_cleaned.csv", index=False)