In [2]:
import os
import urllib.request
import shutil
import gzip
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import eigs

# Task:
 Setting restart probability r = 1 − 0.85, compute the pageranks of all vertices on this graph. Reuse the power method function you wrote in Exercise: Power method for large graphs.

In [8]:

# The URL where the file is located
url = "https://snap.stanford.edu/data/web-Google.txt.gz"

# The local path where the file will be saved
file_path = "web-Google.txt.gz"

# Function to download the file
def download_file(url, file_path):
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response, open(file_path, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
    else:
        print("File already exists.")



In [10]:
# Function to parse the graph and create a sparse matrix
def parse_graph(file_path):
    print("Parsing the graph...")
    with gzip.open(file_path, 'rt') as file:
        edges = []
        nodes = set()
        for line in file:
            if line.startswith('#'):
                continue
            from_node, to_node = map(int, line.split())
            edges.append((from_node, to_node))
            nodes.update([from_node, to_node])
        # Create a sparse matrix from the edges
        node_count = max(nodes) + 1
        row_indices, col_indices = zip(*edges)
        data = np.ones(len(edges))
        return csr_matrix((data, (row_indices, col_indices)), shape=(node_count, node_count))



In [11]:
# Function to apply the power method to a stochastic matrix
def power_method(P, r=0.85, max_iter=1000, tol=1e-10):
    N = P.shape[0]
    x = np.random.rand(N)
    x /= np.sum(x)  # Normalize to make it a probability vector

    # Convert the matrix to a stochastic matrix (column sums to 1)
    P = P.multiply(1 / P.sum(axis=0))

    for i in range(max_iter):
        x_next = r * P.dot(x) + (1 - r) / N
        # Check for convergence
        if np.linalg.norm(x_next - x, 1) < tol:
            break
        x = x_next

    return x / np.sum(x)  # Return the normalized eigenvector

# Download the file
download_file(url, file_path)

# Parse the graph and create a sparse matrix
graph = parse_graph(file_path)

# Compute the pagerank using the power method
pageranks = power_method(graph)
print(pageranks)


File already exists.
Parsing the graph...


  P = P.multiply(1 / P.sum(axis=0))


[7.84971203e-07 4.07969414e-07 5.04867544e-07 ... 1.17275881e-06
 2.59725250e-07 2.06222833e-06]
