<a href="https://colab.research.google.com/github/HoarfrostRaven/BigData/blob/main/BGProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare the dataset

In [None]:
# !wget https://snap.stanford.edu/data/web-Google.txt.gz
# !gzip -d web-Google.txt.gz

In [None]:
! pwd

/content


In [None]:
! ls

sample_data  web-Google.txt  web-Google.txt.gz


In [1]:
# !pip install pyspark
from pyspark import SparkConf
from pyspark.context import SparkContext
# Reuse the already-running SparkContext if there is one, otherwise create it.
# The "local[*]" master runs Spark locally with as many worker threads as
# there are logical cores on the machine.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

In [None]:
# Load the edge list, skipping blank lines and '#' comment lines (the header
# of the SNAP file), then parse every remaining line into a tuple of ints.
data = (
    sc.textFile("/content/web-Google.txt")
    # Strip once up front so the blank-line test and the comment test look at
    # the same text; otherwise an indented '#' line would slip through the
    # filter and make int() fail.
    .map(lambda line: line.strip())
    .filter(lambda line: line and not line.startswith('#'))
    .map(lambda line: tuple(map(int, line.split())))
)

In [None]:
# Peek at the first five parsed edges to sanity-check the loader
print(data.take(5))

[(0, 11342), (0, 824020), (0, 867923), (0, 891835), (11342, 0)]


# CCF-Iterate

In [21]:
def connected_components_ccf1(data):
    """
    Find connected components with the iterative CCF scheme.

    Every round, each node looks up its smallest neighbour and hands that
    value to all of its other neighbours. The loop stops as soon as a round
    produces no new edge.

    Args:
        data: An RDD of edges given as (node1, node2) tuples.
              Example: sc.parallelize([(1, 2), (2, 3), (2, 4), (4, 5), (6, 7), (7, 8)])

    Returns:
        An RDD of (node, smallest_neighbour) pairs taken from the last round.
        Example: sc.parallelize([(2, 1), (3, 1), (4, 1), (5, 1), (7, 6), (8, 6)])
    """
    def both_directions(pair):
        # Emit the edge and its mirror so the graph is treated as undirected
        return [(pair[0], pair[1]), (pair[1], pair[0])]

    edges = data.flatMap(both_directions).distinct()

    iteration = 0
    while True:
        iteration += 1
        print(f"--- Iteration {iteration} ---")

        # Smallest neighbour per node, kept only when it is below the node itself
        smallest_neighbour = edges.reduceByKey(min)
        min_value = smallest_neighbour.filter(lambda kv: kv[1] < kv[0])

        # For each (node, minimum) and each neighbour of node: (neighbour, minimum)
        joined = min_value.join(edges)
        propagated = joined.map(lambda kv: (kv[1][1], kv[1][0]))
        new_created_edges = propagated.filter(lambda kv: kv[0] != kv[1])

        # Nothing left to propagate: this round's minima are the answer
        if new_created_edges.isEmpty():
            return min_value

        # Next round works on the minima plus the propagated edges, mirrored again
        merged = min_value.union(new_created_edges)
        edges = merged.flatMap(both_directions).distinct().cache()

In [16]:
def connected_components_ccf(data):
    """
    Computes connected components using the CCF algorithm.

    Each round, every node finds its smallest neighbor and that value is
    propagated to the node's other neighbors. The loop ends when a round
    creates no new edge. RDDs cached by a round are released as soon as the
    following round no longer needs them, so cached data does not pile up
    across iterations.

    Args:
        data: An RDD of edges represented as tuples (node1, node2).
              Example: sc.parallelize([(1, 2), (2, 3), (2, 4), (4, 5), (6, 7), (7, 8)])

    Returns:
        A cached RDD of (node, component_id) pairs, where component_id is the
        smallest node id reachable from node. The smallest node of each
        component has no entry of its own.
        Example: sc.parallelize([(2, 1), (3, 1), (4, 1), (5, 1), (7, 6), (8, 6)])
    """
    def both_directions(edge):
        # Emit the edge and its reverse so the graph is treated as undirected
        return [(edge[0], edge[1]), (edge[1], edge[0])]

    # Initialize with bidirectional edges and remove duplicates
    edges = data.flatMap(both_directions).distinct().cache()

    # RDDs cached during the previous round, to be released in the current one
    stale = []
    iteration = 0
    while True:
        iteration += 1
        print(f"--- Iteration {iteration} ---")

        # Minimum neighbor per node, kept only when it is smaller than the node
        min_values = (
            edges.reduceByKey(min)
            .filter(lambda x: x[1] < x[0])
            .cache()
        )

        # Generate new edges by propagating the minimum values through the graph
        new_created_edges = (
            min_values.join(edges)
            .map(lambda x: (x[1][1], x[1][0]))
            .filter(lambda x: x[0] != x[1])
            .cache()
        )

        # No new edges means convergence. This is the round's only action.
        converged = new_created_edges.isEmpty()

        # The action above has evaluated this round's RDDs, so the previous
        # round's cached data is no longer needed. unpersist() only drops
        # cached blocks (Spark can still recompute from lineage), so the
        # result is unaffected.
        for rdd in stale:
            rdd.unpersist()

        if converged:
            # Keep only the returned RDD cached
            new_created_edges.unpersist()
            edges.unpersist()
            return min_values

        stale = [edges, min_values, new_created_edges]

        # Update edges to include new edges and their reverse, maintaining bidirectionality
        edges = (
            min_values.union(new_created_edges)
            .flatMap(both_directions)
            .distinct()
            .cache()
        )

In [22]:
# Example data for testing: two components, {1, 2, 3, 4, 5} and {6, 7, 8}
test_data = sc.parallelize([(1, 2), (2, 3), (2, 4), (4, 5), (6, 7), (7, 8)])

# Run the connected_components_ccf1 variant of the algorithm
result = connected_components_ccf1(test_data)

# Collect the (node, component_id) pairs on the driver and print them
print("Connected components:", result.collect())

--- Iteration 1 ---
--- Iteration 2 ---
--- Iteration 3 ---
--- Iteration 4 ---
Connected components: [(2, 1), (3, 1), (4, 1), (5, 1), (7, 6), (8, 6)]


In [None]:
# Compute connected components of the loaded graph using the CCF algorithm.
# NOTE: `data` is created by the loading cell earlier in the notebook; that
# cell must have been run in the current session or this raises NameError.
connected_components = connected_components_ccf(data)

# Print the results (collect() brings every pair back to the driver)
print(connected_components.collect())

NameError: name 'data' is not defined

In [None]:
# Print each node's final connected-component ID (the smallest node ID in its
# component). `edges` only exists as a local variable inside the CCF
# functions, so the pairs are read from the RDD returned by
# connected_components_ccf instead. The smallest node of each component has
# no pair of its own and is therefore not printed.
result = connected_components.collect()

for node, comp_id in result:
    print(f"{node} --> {comp_id}")