Use this cell for all your imports

In [30]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import networkx as nx
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix
import plotly.graph_objs as go
from sklearn.preprocessing import normalize

You will be working with the file data.txt. Use this cell to load its content into the appropriate data structure.

In [31]:
# open the txt file and extract the source and the target nodes
f = open('data.txt', 'r')
list1, list2 = zip(*[x.split() for x in f.readlines()])
# Convert the lists into numpy arrays and remove the unnecessary elements
data1 = np.array(list1)
data2 = np.array(list2)

data1 = np.delete(data1 ,[0,1,2,3])
data2 = np.delete(data2 ,[0,1,2,3])

# Convert datatype to integers
source = data1.astype(int) # the source node
target = data2.astype(int) # the target node

print(source)
print(target)

[   1001    1001    1001 ... 9912286 9912286 9912286]
[9304045 9308122 9309097 ... 9808140 9810068 9901023]


In [32]:
n = len(source)
data = np.ones(n)

# Create the Adjacency matrix and store it in the coo format 
adjacency_matrix = coo_matrix((data, (source,target)))

Use this cell to normalize the matrix as you see fit.

In [33]:
#normalize matrix by rows
normalized_matrix = normalize(adjacency_matrix, norm='l1', axis=1)

Apply the PageRank algorithm to the matrix you created.

In [34]:
# Use the networkx library to implement the PageRank algorithm
graph = nx.from_scipy_sparse_array(normalized_matrix)
# PageRank scores of the nodes in the graph
pagerank_scores = nx.pagerank(graph)

Output a list of the 100 most important papers along with their importance

In [35]:
sorted_scores = sorted(pagerank_scores.items(), key=lambda x: x[1], reverse=True)

# Top 100 papers and their PageRank scores :
for i, id_ in enumerate(sorted_scores[:100]):
    print(f" {i+1}. ID : {id_[0]} : {np.round(id_[1], decimals = 10)}")

 1. ID : 9711200 : 7.9453e-06
 2. ID : 9407087 : 6.1548e-06
 3. ID : 9802150 : 5.4118e-06
 4. ID : 9906064 : 5.0485e-06
 5. ID : 9908142 : 4.6387e-06
 6. ID : 9802109 : 4.6189e-06
 7. ID : 9610043 : 4.1941e-06
 8. ID : 9408099 : 3.9831e-06
 9. ID : 9510017 : 3.3993e-06
 10. ID : 9503124 : 3.1405e-06
 11. ID : 9905111 : 2.7682e-06
 12. ID : 9711162 : 2.6246e-06
 13. ID : 9510209 : 2.6048e-06
 14. ID : 9204099 : 2.5951e-06
 15. ID : 9405029 : 2.2829e-06
 16. ID : 9611050 : 2.2142e-06
 17. ID : 9510135 : 2.1863e-06
 18. ID : 9410167 : 1.9416e-06
 19. ID : 9205068 : 1.9396e-06
 20. ID : 9210010 : 1.8714e-06
 21. ID : 9601029 : 1.8691e-06
 22. ID : 9409089 : 1.8632e-06
 23. ID : 9401139 : 1.7893e-06
 24. ID : 9204064 : 1.7667e-06
 25. ID : 9411149 : 1.7248e-06
 26. ID : 9412184 : 1.671e-06
 27. ID : 9603142 : 1.6403e-06
 28. ID : 9301042 : 1.6188e-06
 29. ID : 9401153 : 1.4752e-06
 30. ID : 9306002 : 1.4719e-06
 31. ID : 9602022 : 1.4221e-06
 32. ID : 9201056 : 1.4204e-06
 33. ID : 9204083 

Visualize your results

In [36]:
top_scores = [score for _, score in sorted_scores[:100]]

trace = go.Scatter(
    x=list(range(1, 101)), 
    y=top_scores, 
    mode="lines"
)

layout = go.Layout(
    title="PageRank scores",
    xaxis=dict(title="Node ID"),
    yaxis=dict(title="Score")
)

fig = go.Figure(data=[trace], layout=layout)
fig.show()