In [None]:
# NOTE(review): `TRUE` is not referenced in any visible cell of this notebook; this looks like an
# accidental editor auto-import. Confirm nothing else relies on it before dropping the line.
from pickle import TRUE
# NOTE(review): this import is unguarded, so the cell only runs where `google.colab` is importable,
# whereas the setup cell further down wraps the same import in try/except ImportError.
from google.colab import drive
# Mount Google Drive under /content/drive, explicitly requesting a remount (force_remount=True).
drive.mount("/content/drive", force_remount=True)

# pip install -r requirements.txt first

In [None]:
# File: notebooks/pagerank_ui.ipynb - Cell 1

# Installing necessary packages
# !pip install pandas networkx -q
# !pip install fireducks
# Note: Ensure all dependencies from requirements.txt are installed for full functionality across modules

import sys
import os
import warnings

# Silence FutureWarning noise attributed to the huggingface_hub.file_download module.
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    module="huggingface_hub.file_download",
)

# The project lives on Google Drive; put its root at the front of sys.path (once)
# so that the `src.*` packages can be imported by the cells below.
project_root = "/content/drive/My Drive/WebKnoGraph"
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

# Echo the resulting import environment for troubleshooting.
print(
    f"Project root added to sys.path: {project_root}",
    f"Current working directory: {os.getcwd()}",
    f"sys.path: {sys.path}",
    sep="\n",
)

# Best-effort Google Drive mount: skipped outside Colab, and any mount failure is
# reported rather than raised so the rest of the cell still runs.
try:
    from google.colab import drive

    if os.path.exists("/content/drive/My Drive"):
        print("Google Drive already mounted.")
    else:
        drive.mount("/content/drive/")
        print("Google Drive mounted successfully.")
except ImportError:
    print("Not running in Google Colab environment. Skipping Google Drive mount.")
except Exception as mount_error:
    print(f"Error mounting Google Drive: {mount_error}")

# Third-party and standard-library imports used by the UI cells.
# NOTE(review): `fpd`, `nx` and `urlparse` are not referenced in any visible cell of this
# notebook; confirm they are not imported for side effects before pruning them.
import gradio as gr
import io
import pandas as pd
import fireducks.pandas as fpd
import networkx as nx
from urllib.parse import urlparse

# Project-local imports for PageRank analysis. These resolve only because the project root was
# inserted into sys.path earlier in this cell.
# NOTE(review): `URLProcessor`, `PageRankGraphAnalyzer` and `HITSGraphAnalyzer` are not referenced
# in any visible cell; only the config, service and logger classes are used directly.
from src.backend.config.pagerank_config import PageRankConfig
from src.backend.utils.url_processing import URLProcessor
from src.backend.graph.analyzer import PageRankGraphAnalyzer, HITSGraphAnalyzer
from src.backend.services.pagerank_service import PageRankService
from src.shared.logging_config import ConsoleAndGradioLogger

# Reaching this line means every import above succeeded.
print("All modules imported successfully!")

In [None]:
# File: notebooks/pagerank_ui.ipynb - Cell 2

def _analysis_failure_outputs(status_text: str, log_text: str):
    """Build the (table update, status, log) triple returned when an analysis fails.

    The results table is reset to an empty frame with a single 'Error' column.
    """
    return (
        gr.update(value=pd.DataFrame(), headers=['Error'], datatype=['str']),
        status_text,
        log_text,
    )


def run_pagerank_analysis_ui(input_edge_list_path: str, output_analysis_path: str, analysis_type: str, depth_level: int, top_n: int):
    """Gradio click handler: run the selected analysis and return the UI updates.

    A fresh ``PageRankConfig`` / ``PageRankService`` pair is built for the given paths.
    After the service's initial data load, the output analysis file is generated via
    ``process_and_save_pagerank()`` if the configured output path does not exist yet,
    and the request is then delegated to ``service.perform_analysis()``.

    Args:
        input_edge_list_path: Passed to ``PageRankConfig`` as ``input_edge_list_path``.
        output_analysis_path: Passed to ``PageRankConfig`` as ``output_analysis_path``.
        analysis_type: Forwarded to ``perform_analysis`` (the UI offers "PageRank" and "HITS").
        depth_level: Forwarded to ``perform_analysis``.
        top_n: Forwarded to ``perform_analysis``.

    Returns:
        A 3-tuple ``(dataframe_update, status_text, log_text)``. On failure the dataframe
        update is an empty table with a single 'Error' column and the log text carries the
        error message followed by everything captured by this call's logger.

    Errors raised while configuring or running the analysis are caught and reported through
    the returned status/log text instead of propagating to Gradio.
    """
    log_stream = io.StringIO()
    logger = ConsoleAndGradioLogger(log_stream, logger_name="PageRankLogger")

    try:
        config = PageRankConfig(
            input_edge_list_path=input_edge_list_path,
            output_analysis_path=output_analysis_path,
        )

        logger.info("Initializing PageRank Service...")
        service = PageRankService(config, logger)

        # initial_data_load() takes no arguments and returns a status message.
        initial_load_status_msg = service.initial_data_load()
        logger.info(initial_load_status_msg)

        # Generate the analysis file on demand when it is not on disk yet.
        if not os.path.exists(config.output_analysis_path):
            logger.info("Output analysis CSV not found. Attempting to generate it now...")
            try:
                service.process_and_save_pagerank()
                logger.info(f"Generated analysis results to: {config.output_analysis_path}")
            except Exception as e:
                logger.error(f"Failed to generate analysis results: {e}. Cannot proceed with display.")
                return _analysis_failure_outputs(
                    f"Analysis Failed! Error generating results: {e}",
                    f"Initial load status:\n{initial_load_status_msg}\nError generating results: {e}",
                )

        # Run the requested analysis (PageRank filter or HITS).
        results_df, status_msg, new_headers, new_datatype = service.perform_analysis(
            analysis_type=analysis_type,
            depth_level=depth_level,
            top_n=top_n,
        )

        return (
            gr.update(value=results_df, headers=new_headers, datatype=new_datatype, col_count=len(new_headers)),
            status_msg,
            f"Initial load status:\n{initial_load_status_msg}\n\nAnalysis Logs:\n{log_stream.getvalue()}",
        )

    except FileNotFoundError as e:
        logger.error(f"File not found error: {e}")
        return _analysis_failure_outputs(
            "Analysis Failed!",
            f"Error: {e}\n\nInitial load status:\n{log_stream.getvalue()}",
        )
    except Exception as e:
        logger.exception(f"A critical error occurred: {e}")
        # Real newlines here: the previous "\\n" escapes rendered as literal backslash-n
        # in the log textbox, unlike the FileNotFoundError branch above.
        return _analysis_failure_outputs(
            "Analysis Failed!",
            f"Error: {e}\n\nInitial load status:\n{log_stream.getvalue()}",
        )

In [None]:
# File: notebooks/pagerank_ui.ipynb - Cell 3

# Module-level bootstrap for the UI cell: build one default config and one service up front so
# the layout below can pre-fill the path boxes and show the outcome of an initial data load.
# This phase logs to its own in-memory stream/logger, separate from the logger that the click
# handler creates for each analysis run.
temp_log_stream = io.StringIO()
temp_logger = ConsoleAndGradioLogger(temp_log_stream, logger_name="PageRankAppInitLogger")
config_for_init = PageRankConfig()  # built once, with no explicit paths

# Notebook-level debug output: show which paths the default config carries.
print("DEBUGGING INITIAL LOAD PATHS (Notebook Level):")
print(f"DEBUG: config_for_init.output_analysis_path is: {config_for_init.output_analysis_path}")
print(f"DEBUG: config_for_init.input_edge_list_path is: {config_for_init.input_edge_list_path}")
print("--- END DEBUG PRINTS (Notebook Level) ---")

# Service used for the start-up load only.
pagerank_service_instance = PageRankService(config_for_init, temp_logger)

# initial_data_load() takes no path arguments; its returned status message is what the UI
# displays under "Initial Data Load Status".
initial_load_status_message = pagerank_service_instance.initial_data_load()

# Build the Gradio page. Every component is created inside this Blocks context and the resulting
# app is bound to `demo`, which the launch cell starts.
# NOTE(review): statement order inside this block presumably determines on-page order — confirm
# before reordering anything here.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📊 URL PageRank & Folder Depth Analysis")
    gr.Markdown("This tool loads an edge list CSV, calculates PageRank and folder depths for each URL, and saves the results to a new CSV file.")

    # Display initial load status (computed once, when this cell ran, by the bootstrap service).
    gr.Markdown("### Initial Data Load Status:")
    initial_load_status_output = gr.Markdown(initial_load_status_message)


    # Static help text explaining how to read PageRank and HITS results.
    gr.Markdown(
        """
        ## Understanding Results for Website Re-architecture

        This tool helps identify pages that are good candidates for improving your website's overall PageRank and structural authority.

        **PageRank Analysis:**
        * **Worst PageRank Candidates:** These are pages with low PageRank values, indicating they are not highly valued by the linking structure of your website (and potentially the broader web). Improving their internal linking (from high PageRank pages) or acquiring external backlinks can significantly boost their visibility and "link juice" distribution.

        **HITS Analysis:**
        The HITS algorithm provides a complementary view by identifying two types of influential pages:
        * **High Authority Score:** These pages are recognized as definitive sources of information on a topic (i.e., they are *pointed to* by many good hubs). If a page has a high Authority score but relatively low PageRank, it suggests the content is valuable, but it might not be receiving enough PageRank flow. Focus on internal linking from high-PageRank pages and external link building to these pages.
        * **High Hub Score:** These pages serve as excellent resource lists, pointing to many good authoritative pages. If a page has a high Hub score but low PageRank, it's a valuable navigational asset but isn't getting enough inbound link equity itself. Boosting the PageRank of such a hub (via internal/external links) will improve the "link juice" it passes to the authorities it links to.

        By understanding these scores, you can strategically re-architect your website's internal linking, content, and external link building efforts to maximize PageRank and improve overall SEO performance.
        """
    )


    # Analysis selector; its value is passed to the click handler and also drives the
    # visibility of the folder-depth slider below.
    with gr.Row():
        analysis_type_radio = gr.Radio(
            ["PageRank", "HITS"],
            label="Select Analysis Type",
            value="PageRank", # Default to PageRank
            interactive=True
        )

    # A single row holding both path textboxes and both sliders.
    with gr.Row():
        input_path_box = gr.Textbox(
            label="Input Edge List CSV Path",
            value=config_for_init.input_edge_list_path # Use config default
        )
        output_path_box = gr.Textbox(
            label="Output Analysis CSV Path",
            value=config_for_init.output_analysis_path # Use config default
        )

        depth_level_input = gr.Slider(
            minimum=0,
            maximum=10,
            step=1,
            value=1,
            label="Folder Depth Level (for PageRank)",
            visible=True # Default to visible for PageRank initially
        )
        top_n_input = gr.Slider(
            minimum=1,
            maximum=100,
            step=1,
            value=5,
            label="Number of Top Candidates (N)"
        )

    analyze_button = gr.Button("Perform Analysis")

    # Output widgets; these are the three targets of the click handler, in this order:
    # results table, status text, log text.
    status_output = gr.Textbox(label="Current Status", interactive=False)
    log_output = gr.Textbox(label="Analysis Logs", interactive=False, lines=15)
    # NOTE(review): interactive=True presumably leaves the results table editable in the UI,
    # unlike the two read-only textboxes above — confirm this is intended.
    results_dataframe = gr.Dataframe(
        value=pd.DataFrame(columns=['URL', 'Folder_Depth', 'PageRank']), # Initial empty state for PageRank
        row_count=5,
        interactive=True,
        label="Analysis Results",
        visible=True
    )

    # Define a function to control visibility of depth_level_input
    def update_depth_input_visibility(analysis_type):
        """Show the folder-depth slider only when the "PageRank" analysis type is selected."""
        if analysis_type == "PageRank":
            return gr.update(visible=True)
        else:
            return gr.update(visible=False)

    # Bind the radio button change event to update the visibility of the depth slider
    analysis_type_radio.change(
        fn=update_depth_input_visibility,
        inputs=[analysis_type_radio],
        outputs=[depth_level_input]
    )

    # Bind the button click to the main analysis function.
    # The inputs list follows the positional parameter order of run_pagerank_analysis_ui
    # (input path, output path, analysis type, depth level, top N), and the outputs list matches
    # the 3-tuple it returns (table update, status text, log text).
    analyze_button.click(
        fn=run_pagerank_analysis_ui, # Call the wrapper function from Cell 2
        inputs=[
            input_path_box,
            output_path_box,
            analysis_type_radio,
            depth_level_input,
            top_n_input
        ],
        outputs=[results_dataframe, status_output, log_output]
    )

In [None]:
# File: notebooks/pagerank_ui.ipynb - Cell 4

if __name__ == '__main__':
    # Step 1: best-effort Google Drive mount. This is Colab-specific, so a missing
    # `google.colab` package (or a failed mount) is reported on its own and must not
    # stop the UI from launching. This mirrors the handling in the setup cell.
    try:
        from google.colab import drive
        if not os.path.exists("/content/drive/My Drive"):
            drive.mount("/content/drive/")
            print("Google Drive mounted successfully.")
        else:
            print("Google Drive already mounted.")
    except ImportError:
        print("Not running in Google Colab environment. Skipping Google Drive mount.")
    except Exception as e:
        print(f"Error mounting Google Drive: {e}")

    # Step 2: launch the Gradio app. Previously this shared a single try block with the
    # mount step, so any Drive/Colab error was misreported as a launch failure and the
    # launch itself was never attempted.
    try:
        demo.launch(debug=True, share=True)
    except Exception as e:
        print(f"Could not launch Gradio demo in this environment: {e}")