<a href="https://colab.research.google.com/github/HoarfrostRaven/BigData/blob/main/BGProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare the dataset

In [1]:
!wget https://snap.stanford.edu/data/web-Google.txt.gz
!gzip -d web-Google.txt.gz

--2025-03-28 22:21:25--  https://snap.stanford.edu/data/web-Google.txt.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21168784 (20M) [application/x-gzip]
Saving to: ‘web-Google.txt.gz’


2025-03-28 22:21:26 (18.1 MB/s) - ‘web-Google.txt.gz’ saved [21168784/21168784]

gzip: web-Google.txt already exists; do you wish to overwrite (y or n)? n
	not overwritten


In [2]:
! pwd

/content


In [3]:
! ls

sample_data  web-Google.txt  web-Google.txt.gz


In [1]:
# !pip install pyspark
from pyspark import SparkConf
from pyspark.context import SparkContext
# Reuse a running SparkContext if one exists; otherwise start a local one
# that uses every available core.
spark_conf = SparkConf().setMaster("local[*]")
sc = SparkContext.getOrCreate(spark_conf)

In [5]:
# Read the edge list, drop blank lines and '#' header/comment lines, and parse
# every remaining line into a tuple of integers (source node, target node).
raw_lines = sc.textFile("/content/web-Google.txt")
edge_lines = raw_lines.filter(lambda line: line.strip() and not line.startswith('#'))
data = edge_lines.map(lambda line: tuple(map(int, line.split())))

In [6]:
# Sanity check: look at the first few parsed edges.
first_edges = data.take(5)
print(first_edges)

[(0, 11342), (0, 824020), (0, 867923), (0, 891835), (11342, 0)]


# CCF-Iterate

In [2]:
from pyspark import StorageLevel

def connected_components_ccf(data, num_partitions = 1):
    """
    Computes connected components using the iterative CCF algorithm.

    Every round maps each node that has a smaller neighbour to the smallest
    of its neighbours, then links the node's remaining neighbours directly to
    that minimum. Rounds repeat until a round produces no new link.

    Args:
        data: An RDD of edges represented as tuples (node1, node2). Edge
              direction and duplicate edges are ignored.
              Example: sc.parallelize([(1, 2), (2, 3), (2, 4), (4, 5), (6, 7), (7, 8)])
        num_partitions: Number of partitions used for the de-duplication,
              reduce and join shuffles. Must be at least 1.

    Returns:
        An RDD of (node, component_id) pairs, where component_id is the
        smallest node ID reached for that node. The node that serves as the
        component ID has no pair of its own.
        Example: sc.parallelize([(2, 1), (3, 1), (4, 1), (5, 1), (7, 6), (8, 6)])
        The returned RDD is left cached; call unpersist() on it when done.

    Raises:
        ValueError: If num_partitions is smaller than 1.
    """
    if num_partitions < 1:
        raise ValueError("num_partitions must be at least 1")

    def symmetrize(pairs):
        # Emit every pair in both directions and drop duplicates. Without the
        # de-duplication, identical pairs produced by different keys would
        # accumulate from one round to the next.
        return (pairs.flatMap(lambda x: [(x[0], x[1]), (x[1], x[0])])
                .distinct(numPartitions=num_partitions)
                .persist(StorageLevel.MEMORY_AND_DISK))  # Spill to disk if OOM

    edges = symmetrize(data)
    iteration = 0

    while True:
        iteration += 1
        print(f"--- Iteration {iteration} ---")

        # For every node with a smaller neighbour, keep only the smallest one.
        min_values = (edges.filter(lambda x: x[1] < x[0])
                      .reduceByKey(min, numPartitions=num_partitions)
                      .cache())

        # Link each remaining neighbour of a node to that node's minimum.
        new_edges = (min_values.join(edges, numPartitions=num_partitions)
                     .filter(lambda x: x[1][0] != x[1][1])  # Skip the minimum itself
                     .map(lambda x: (x[1][1], x[1][0]))     # (neighbour, minimum)
                     .cache())

        if new_edges.isEmpty():
            # Converged: release everything except the result handed back.
            new_edges.unpersist()
            edges.unpersist()
            return min_values

        next_edges = symmetrize(min_values.union(new_edges))

        # Materialise the next edge set while its inputs are still cached;
        # unpersisting them first would force them to be recomputed.
        next_edges.count()

        min_values.unpersist()
        new_edges.unpersist()
        edges.unpersist()
        edges = next_edges

In [3]:
# Toy graph with two components, {1, 2, 3, 4, 5} and {6, 7, 8}.
toy_edges = [(1, 2), (2, 3), (2, 4), (4, 5), (6, 7), (7, 8)]
test_data = sc.parallelize(toy_edges)

# Label every non-minimal node with the smallest node ID of its component.
result = connected_components_ccf(test_data)

# The toy result is tiny, so it is safe to bring it back to the driver.
component_pairs = result.collect()
print("Connected components:", component_pairs)

--- Iteration 1 ---
--- Iteration 2 ---
--- Iteration 3 ---
--- Iteration 4 ---
Connected components: [(4, 1), (2, 1), (3, 1), (5, 1), (7, 6), (8, 6)]


In [None]:
# Compute connected components of the full graph using the CCF algorithm
connected_components = connected_components_ccf(data, 200)

# Summarise on the cluster instead of collect()-ing every (node, component)
# pair: the full result is far too large to print in a notebook cell.
num_labelled_nodes = connected_components.count()
num_components = connected_components.values().distinct().count()
print(f"Nodes linked to a component ID: {num_labelled_nodes}")
print(f"Distinct component IDs: {num_components}")
print("Sample (node, component ID) pairs:", connected_components.take(10))