In [None]:
!pip install pandas networkx -q
!pip install fireducks

In [None]:
# import pandas as pd
import fireducks.pandas as pd
import networkx as nx
from urllib.parse import urlparse
import os

# --- Google Colab Specific: Import and Mount Google Drive ---
# This cell is written for Google Colab: it mounts Google Drive at
# /content/drive, the location the input file path in the example usage at the
# bottom of this file points into. Run it before the analysis.
# NOTE(review): `google.colab` is presumably only importable inside Colab, so
# comment out the two statements below when running elsewhere.
from google.colab import drive

drive.mount("/content/drive", force_remount=True)
# -----------------------------------------------------------


# 1. Single Responsibility Principle (SRP) for URL processing
class URLProcessor:
    """
    Groups URL helpers; currently only the folder-depth calculation.
    """

    @staticmethod
    def get_folder_depth(url: str) -> int:
        """
        Return the number of non-empty segments in the path of ``url``.

        Only the path component is inspected; scheme, host, query string and
        fragment do not contribute. Empty segments produced by repeated or
        trailing slashes are not counted.

        Examples:
            https://kalicube.com/ -> 0
            https://kalicube.com/learning-spaces/faq-list/generative-ai/ -> 3
        """
        url_path = urlparse(url).path
        # A bare domain or a lone root slash has no folders at all.
        if url_path == "/" or not url_path:
            return 0
        # Splitting on '/' yields empty strings for leading, trailing and
        # doubled slashes; only the real segments are counted.
        return sum(1 for part in url_path.split("/") if part)


# 2. Single Responsibility Principle (SRP) for Graph operations
class GraphAnalyzer:
    """
    Builds a directed link graph from an edge table and ranks its nodes.

    The first two columns of the edge table are read by position as
    (source, target); any further columns are ignored.
    """

    def __init__(self, edges_df: pd.DataFrame):
        """
        Store the edge table and build the graph from it.

        Args:
            edges_df: Edge table whose first column holds link sources and
                whose second column holds link targets.

        Raises:
            ValueError: If the table has fewer than two columns.
        """
        if edges_df.shape[1] < 2:
            raise ValueError(
                "Input DataFrame must have at least two columns for source and target URLs."
            )
        self.edges_df = edges_df
        self.graph = self._build_graph()

    def _build_graph(self) -> nx.DiGraph:
        """
        Build a directed graph with one edge per complete row of the table.

        Rows whose source or target is missing are skipped: a blank cell in
        the CSV is not a link, and it would otherwise become a NaN node that
        cannot be parsed as a URL later in the pipeline.

        Returns:
            The directed graph. Nodes are inserted as all sources first, then
            all targets, each in order of first appearance.
        """
        endpoints = self.edges_df.iloc[:, :2].dropna()
        sources = endpoints.iloc[:, 0].tolist()
        targets = endpoints.iloc[:, 1].tolist()

        graph = nx.DiGraph()
        # Register nodes explicitly before the edges so the node order (and
        # therefore the row order of anything derived from it) is "sources,
        # then targets" rather than interleaved by edge.
        graph.add_nodes_from(sources)
        graph.add_nodes_from(targets)
        # One bulk insertion instead of a per-row DataFrame.iterrows() loop.
        graph.add_edges_from(zip(sources, targets))
        return graph

    def calculate_pagerank(self) -> dict:
        """
        Compute PageRank for every node using networkx's default settings.

        Returns:
            A dictionary mapping each node to its PageRank score.
        """
        return nx.pagerank(self.graph)

    def get_all_nodes(self) -> list:
        """
        Return every node of the graph as a list, in graph insertion order.
        """
        return list(self.graph.nodes())


# 3. Orchestration and File I/O (SRP, Dependency Inversion Principle)
class LinkGraphProcessor:
    """
    Runs the end-to-end pipeline: read an edge CSV, analyse it, write a report.

    The URL helper and the graph-analyzer class are supplied by the caller
    through the constructor, so either one can be swapped for another
    implementation offering the same methods.
    """

    def __init__(self, url_processor: URLProcessor, graph_analyzer_class):
        """
        Remember the collaborators used by ``process_graph_data``.

        Args:
            url_processor: Object providing ``get_folder_depth(url)``.
            graph_analyzer_class: Callable that takes the edge DataFrame and
                returns an object providing ``get_all_nodes()`` and
                ``calculate_pagerank()``.
        """
        self.url_processor = url_processor
        self.graph_analyzer_class = graph_analyzer_class

    @staticmethod
    def _build_results_frame(urls, depths, scores):
        """Assemble the per-URL report table from the computed values."""
        return pd.DataFrame(
            {
                "URL": urls,
                "Folder_Depth": depths,
                # URLs absent from the score mapping are reported as 0.0.
                "PageRank": [scores.get(node, 0.0) for node in urls],
            }
        )

    def process_graph_data(self, input_filepath: str, output_filepath: str):
        """
        Read the edge CSV, compute folder depth and PageRank per URL, and
        write the combined table to ``output_filepath`` as CSV.

        Progress messages and a preview of the first rows are printed.

        Args:
            input_filepath: Path of the CSV file holding the graph edges.
            output_filepath: Path the result CSV is written to.

        Raises:
            FileNotFoundError: If ``input_filepath`` does not exist.
        """
        if not os.path.exists(input_filepath):
            raise FileNotFoundError(f"Input file not found: {input_filepath}")

        print(f"Loading data from {input_filepath}...")
        edge_table = pd.read_csv(input_filepath)
        print("Data loaded successfully.")

        analyzer = self.graph_analyzer_class(edge_table)
        urls = analyzer.get_all_nodes()

        print("Calculating folder depths...")
        depths = [self.url_processor.get_folder_depth(node) for node in urls]
        print("Folder depths calculated.")

        print("Calculating PageRank scores...")
        scores = analyzer.calculate_pagerank()
        print("PageRank scores calculated.")

        print("Compiling results...")
        report = self._build_results_frame(urls, depths, scores)

        print(f"Saving results to {output_filepath}...")
        report.to_csv(output_filepath, index=False)
        print("Results saved successfully.")
        print(f"\nExample of results (first 5 rows):\n{report.head()}")


# Example usage: analyse the edge list stored on the mounted Google Drive.
if __name__ == "__main__":
    # Location of the edge list; assumes Google Drive is mounted at
    # '/content/drive'.
    input_file = "/content/drive/My Drive/master_july_2025/data/link_graph_edges.csv"

    # The report is written next to the input file.
    output_directory = os.path.dirname(input_file)
    output_file = os.path.join(output_directory, "url_analysis_results.csv")

    # Wire the pipeline together with its two collaborators.
    processor = LinkGraphProcessor(
        graph_analyzer_class=GraphAnalyzer,
        url_processor=URLProcessor(),
    )

    try:
        processor.process_graph_data(input_file, output_file)
    except FileNotFoundError as missing:
        # The most likely cause in Colab is an unmounted drive or a wrong path.
        print(
            f"Error: {missing}",
            "Please ensure your Google Drive is mounted correctly and the input file path is accurate.",
            sep="\n",
        )
    except Exception as failure:
        # Last-resort handler so the notebook cell reports the problem as a
        # short message.
        print(f"An unexpected error occurred: {failure}")