# Make sure to restart the Runtime and have a clean state to work with

In [11]:
#
# --- Step 1: Install Required Libraries ---
#
!pip install pandas gradio networkx



In [23]:
import pandas as pd
import gradio as gr
import io
import os
import networkx as nx # Import networkx for graph operations and HITS
import re # For URL parsing in trimming function

# --- 1. CSVLoader Class (Single Responsibility: Loading data) ---
class CSVLoader:
    """
    Handles loading data from a CSV file.
    Follows SRP by being solely responsible for data acquisition; it performs
    no column or content validation (the analyzer classes do that).
    """
    def __init__(self, file_path: str):
        """
        Initializes the CSVLoader with the path to the CSV file.
        :param file_path: The full path to the CSV file.
        """
        self.file_path = file_path
        self.dataframe = None  # Set by load_data() once a read succeeds
        print(f"CSVLoader initialized with file_path: {self.file_path}") # Log

    def load_data(self) -> pd.DataFrame:
        """
        Loads the CSV content from the specified file path into a pandas DataFrame.
        Raises FileNotFoundError if nothing exists at the path.
        Raises IOError if pandas cannot read or parse the file; the original
        exception is chained as ``__cause__`` so the root cause stays visible.
        :return: A pandas DataFrame containing the loaded data.
        """
        print(f"Attempting to load data from: {self.file_path}") # Log
        if not os.path.exists(self.file_path):
            print(f"FileNotFoundError: File does not exist at {self.file_path}") # Log
            raise FileNotFoundError(f"CSV file not found at: {self.file_path}")

        # Keep the try body to the single call that can fail while reading.
        try:
            loaded = pd.read_csv(self.file_path)
        except Exception as e:
            print(f"IOError: Error loading CSV data: {e}") # Log
            # Callers catch IOError; chain the cause instead of discarding it.
            raise IOError(f"Error loading CSV data from {self.file_path}: {e}") from e

        self.dataframe = loaded
        print(f"Successfully loaded data. DataFrame shape: {self.dataframe.shape}") # Log
        return self.dataframe

# --- 2. PageRankAnalyzer Class (Single Responsibility: Analyzing PageRank data) ---
class PageRankAnalyzer:
    """
    Performs analysis on URL data to find worst PageRank candidates.
    """
    def __init__(self, dataframe: pd.DataFrame):
        """
        Initializes the PageRankAnalyzer with a pandas DataFrame containing URL, Folder_Depth, and PageRank.
        The input frame is not modified; cleaning is applied to a private copy.
        :param dataframe: DataFrame with 'URL', 'Folder_Depth' and 'PageRank' columns.
        Raises ValueError if any of the required columns is missing.
        """
        required_columns = ['URL', 'Folder_Depth', 'PageRank']
        if not all(col in dataframe.columns for col in required_columns):
            print(f"ValueError: PageRank data missing required columns. Found: {dataframe.columns.tolist()}") # Log
            raise ValueError(
                f"PageRank data must contain 'URL', 'Folder_Depth', and 'PageRank' columns. "
                f"Found columns: {dataframe.columns.tolist()}"
            )
        # Work on a copy so the caller's DataFrame keeps its rows and dtypes.
        cleaned = dataframe.copy()
        # Unparseable depths become -1 so they never match a real depth level.
        cleaned['Folder_Depth'] = pd.to_numeric(cleaned['Folder_Depth'], errors='coerce').fillna(-1).astype(int)
        # Rows whose PageRank is not numeric cannot be ranked and are dropped.
        cleaned['PageRank'] = pd.to_numeric(cleaned['PageRank'], errors='coerce')
        cleaned = cleaned.dropna(subset=['PageRank'])

        self.dataframe = cleaned
        print(f"PageRankAnalyzer initialized. Processed DataFrame shape: {self.dataframe.shape}") # Log

    def find_worst_candidates(self, depth_level: int, top_n: int) -> pd.DataFrame:
        """
        Finds the top 'n' URL candidates with the worst (lowest) PageRank
        at a specific folder depth level.
        :param depth_level: Folder depth to filter on (exact match).
        :param top_n: Maximum number of rows to return; coerced to int.
        :return: DataFrame with 'URL', 'Folder_Depth', 'PageRank', sorted by
                 ascending PageRank. Empty (with those columns) if nothing matches.
        """
        print(f"Finding worst PageRank candidates for depth: {depth_level}, top_n: {top_n}") # Log
        result_columns = ['URL', 'Folder_Depth', 'PageRank']
        if self.dataframe.empty:
            print("PageRank dataframe is empty.") # Log
            return pd.DataFrame(columns=result_columns)

        filtered_df = self.dataframe[self.dataframe['Folder_Depth'] == depth_level]
        print(f"Filtered PageRank DataFrame shape at depth {depth_level}: {filtered_df.shape}") # Log

        if filtered_df.empty:
            print(f"No PageRank entries found at depth level {depth_level}.") # Log
            return pd.DataFrame(columns=result_columns)

        # head() slices positionally and rejects floats, so coerce in case a
        # UI widget delivers the count as e.g. 5.0.
        worst_candidates = filtered_df.sort_values(by='PageRank', ascending=True).head(int(top_n))

        print(f"Found {len(worst_candidates)} worst PageRank candidates.") # Log
        return worst_candidates[result_columns]

# --- 3. HITSGraphAnalyzer Class (Single Responsibility: Analyzing graph data with HITS) ---
class HITSGraphAnalyzer:
    """
    Performs HITS algorithm analysis on a link graph.
    """
    def __init__(self, edges_dataframe: pd.DataFrame, pagerank_dataframe: pd.DataFrame):
        """
        Initializes the HITSGraphAnalyzer with a pandas DataFrame containing 'FROM' and 'TO' columns
        and the pagerank_dataframe to merge Folder_Depth.
        :param edges_dataframe: DataFrame with 'FROM' and 'TO' columns representing directed edges.
        :param pagerank_dataframe: DataFrame containing 'URL' and 'Folder_Depth' for merging.
        Raises ValueError if the edge columns are missing.
        """
        required_edges_columns = ['FROM', 'TO']
        if not all(col in edges_dataframe.columns for col in required_edges_columns):
            print(f"ValueError: Link graph data missing required columns. Found: {edges_dataframe.columns.tolist()}") # Log
            raise ValueError(
                f"Link graph data must contain 'FROM' and 'TO' columns. "
                f"Found columns: {edges_dataframe.columns.tolist()}"
            )

        # An edge with a missing endpoint is not a usable link; skip it rather
        # than inserting NaN as a graph node.
        edges = edges_dataframe[required_edges_columns].dropna()

        self.graph = nx.DiGraph() # Create a directed graph
        # Bulk insert of (FROM, TO) tuples instead of a per-row iterrows() loop.
        self.graph.add_edges_from(edges.itertuples(index=False, name=None))
        print(f"HITSGraphAnalyzer initialized. Graph has {self.graph.number_of_nodes()} nodes and {self.graph.number_of_edges()} edges.") # Log

        # Store a private URL -> Folder_Depth lookup for merging later.
        depth_lookup = pagerank_dataframe[['URL', 'Folder_Depth']].copy()
        # Unparseable depths become -1, matching the "unknown depth" marker used after the merge.
        depth_lookup['Folder_Depth'] = pd.to_numeric(depth_lookup['Folder_Depth'], errors='coerce').fillna(-1).astype(int)
        # One row per URL, so the left merge cannot duplicate HITS rows.
        self.pagerank_dataframe = depth_lookup.drop_duplicates(subset='URL', keep='first')

    def calculate_hits_scores(self) -> pd.DataFrame:
        """
        Calculates Hub and Authority scores using the HITS algorithm and merges Folder_Depth.
        :return: A DataFrame with 'URL', 'Folder_Depth', 'Hub Score', and 'Authority Score',
                 sorted by descending Authority Score. URLs without a known depth get -1.
                 Returns an empty DataFrame if the graph is empty.
        Raises RuntimeError (chained to the original error) if the calculation fails.
        """
        print("Calculating HITS scores...") # Log
        result_columns = ['URL', 'Folder_Depth', 'Hub Score', 'Authority Score']
        if self.graph.number_of_nodes() == 0:
            print("Graph is empty, returning empty HITS DataFrame.") # Log
            return pd.DataFrame(columns=result_columns)

        try:
            hubs, authorities = nx.hits(self.graph)
            print(f"HITS calculation complete. Found {len(hubs)} hubs and {len(authorities)} authorities.") # Log

            hits_df = pd.DataFrame([
                {
                    'URL': node,
                    'Hub Score': hubs.get(node, 0.0),
                    'Authority Score': authorities.get(node, 0.0),
                }
                for node in self.graph.nodes()
            ])

            # Merge with pagerank_dataframe to get Folder_Depth
            merged_hits_df = pd.merge(
                hits_df,
                self.pagerank_dataframe,
                on='URL',
                how='left' # Use left merge to keep all HITS results
            )
            # Fill NaN Folder_Depth values for URLs not found in pagerank_dataframe.
            # Assign the result back: an in-place fillna on the selected column is
            # a chained assignment and does not update the frame under Copy-on-Write.
            merged_hits_df['Folder_Depth'] = merged_hits_df['Folder_Depth'].fillna(-1).astype(int)

            merged_hits_df = merged_hits_df.sort_values(by='Authority Score', ascending=False).reset_index(drop=True)

            print(f"HITS DataFrame created. Shape: {merged_hits_df.shape}") # Log
            return merged_hits_df[result_columns] # Ensure column order
        except Exception as e:
            print(f"RuntimeError: Error during HITS calculation: {e}") # Log
            raise RuntimeError(f"Error during HITS calculation: {e}") from e

# --- 4. GradioApp Class (Single Responsibility: UI interaction) ---
class GradioApp:
    """
    Manages the Gradio user interface for the URL analysis application.
    It composes CSVLoader, PageRankAnalyzer, and HITSGraphAnalyzer.
    """
    def __init__(self, pagerank_csv_path: str, link_graph_csv_path: str):
        """
        Initializes the GradioApp and attempts to load both CSV files at startup.
        Load failures do not raise; they are recorded in ``initial_load_status``
        and the corresponding analyzer is left as None.
        :param pagerank_csv_path: Path to the CSV with URL, Folder_Depth and PageRank.
        :param link_graph_csv_path: Path to the CSV with FROM/TO link edges.
        """
        self.pagerank_csv_path = pagerank_csv_path
        self.link_graph_csv_path = link_graph_csv_path

        self.pagerank_analyzer = None
        self.hits_graph_analyzer = None
        self.initial_load_status = [] # List to store status messages
        self.full_pagerank_df = pd.DataFrame() # Store the full pagerank df for HITS analyzer

        print("GradioApp initialization started.") # Log
        # Attempt to load PageRank data
        try:
            print(f"Attempting to load PageRank data from {self.pagerank_csv_path}") # Log
            pagerank_loader = CSVLoader(self.pagerank_csv_path)
            self.full_pagerank_df = pagerank_loader.load_data() # Load and store full df
            self.pagerank_analyzer = PageRankAnalyzer(self.full_pagerank_df.copy()) # Pass a copy to analyzer
            self.initial_load_status.append(f"✅ PageRank data loaded from {self.pagerank_csv_path}")
        except (FileNotFoundError, IOError, ValueError) as e:
            self.initial_load_status.append(f"❌ Error loading PageRank CSV: {e}")
            self.pagerank_analyzer = None
            print(f"PageRank CSV load failed: {e}") # Log

        # Attempt to load Link Graph data for HITS
        try:
            print(f"Attempting to load Link Graph data from {self.link_graph_csv_path}") # Log
            link_graph_loader = CSVLoader(self.link_graph_csv_path)
            link_graph_df = link_graph_loader.load_data()
            # The PageRank frame may have loaded but failed validation above
            # (missing columns). Only hand it to the HITS analyzer when the
            # columns it selects exist; otherwise its column lookup would raise
            # a KeyError that nothing here catches.
            depth_columns = ['URL', 'Folder_Depth']
            has_depth_info = (
                not self.full_pagerank_df.empty
                and all(col in self.full_pagerank_df.columns for col in depth_columns)
            )
            if has_depth_info:
                self.hits_graph_analyzer = HITSGraphAnalyzer(link_graph_df, self.full_pagerank_df)
            else:
                if self.full_pagerank_df.empty:
                    self.initial_load_status.append("⚠️ PageRank data empty, HITS may not have Folder_Depth info.")
                else:
                    self.initial_load_status.append(
                        "⚠️ PageRank data lacks 'URL'/'Folder_Depth' columns, HITS will not have Folder_Depth info."
                    )
                self.hits_graph_analyzer = HITSGraphAnalyzer(link_graph_df, pd.DataFrame(columns=depth_columns)) # Pass empty df
            self.initial_load_status.append(f"✅ Link graph data loaded from {self.link_graph_csv_path}")
        except (FileNotFoundError, IOError, ValueError) as e:
            self.initial_load_status.append(f"❌ Error loading Link Graph CSV: {e}")
            self.hits_graph_analyzer = None
            print(f"Link Graph CSV load failed: {e}") # Log
        except RuntimeError as e:
            self.initial_load_status.append(f"❌ Error building graph for HITS: {e}")
            self.hits_graph_analyzer = None
            print(f"HITS graph build failed: {e}") # Log

        self.initial_load_status_str = "\n".join(self.initial_load_status)
        print("GradioApp initialization finished.") # Log


    def perform_analysis(self, analysis_type: str, depth_level: int, top_n: int):
        """
        Performs the selected analysis (PageRank or HITS) and returns updates for the DataFrame
        and status message. All updates are prepared in one go.
        :param analysis_type: 'PageRank' or 'HITS'.
        :param depth_level: Relevant for PageRank analysis.
        :param top_n: Number of top results.
        :return: A tuple containing a gr.update object for the dataframe and a status message string.
                 On any failure the dataframe update carries an empty frame and the
                 status message describes the problem.
        """
        print(f"perform_analysis called with type: {analysis_type}, depth: {depth_level}, top_n: {top_n}") # Log

        status_msg = ""
        results_df = pd.DataFrame() # Stays empty unless an analysis fully succeeds

        if analysis_type == 'PageRank':
            new_headers = ['URL', 'Folder_Depth', 'PageRank']
            if self.pagerank_analyzer is None:
                status_msg = self.initial_load_status_str + "\n\nCannot perform PageRank analysis: Data not loaded."
                print("PageRank analyzer is not initialized.") # Log
            else:
                try:
                    # Widget values may arrive as floats; the analyzer expects ints.
                    depth_level = int(depth_level)
                    top_n = int(top_n)
                    results_df = self.pagerank_analyzer.find_worst_candidates(depth_level, top_n)
                    if results_df.empty:
                        status_msg = f"No PageRank candidates found at Depth Level {depth_level} or after filtering."
                        print("No PageRank candidates found.") # Log
                    else:
                        status_msg = f"Top {top_n} Worst PageRank Candidates at Depth Level {depth_level}:"
                        print(f"PageRank analysis successful, {len(results_df)} results.") # Log
                except Exception as e:
                    status_msg = f"An error occurred during PageRank analysis: {e}"
                    print(f"Error during PageRank analysis: {e}") # Log

        elif analysis_type == 'HITS':
            new_headers = ['URL', 'Folder_Depth', 'Hub Score', 'Authority Score']
            if self.hits_graph_analyzer is None:
                status_msg = self.initial_load_status_str + "\n\nCannot perform HITS analysis: Graph data not loaded."
                print("HITS graph analyzer is not initialized.") # Log
            else:
                try:
                    top_n = int(top_n)
                    all_scores = self.hits_graph_analyzer.calculate_hits_scores()
                    if all_scores.empty:
                        results_df = all_scores
                        status_msg = "No HITS scores calculated (empty graph)."
                        print("No HITS scores calculated.") # Log
                    else:
                        # Publish only the truncated table, so a failure here can
                        # never leave the full table next to an error message.
                        results_df = all_scores.head(top_n) # Apply top_n here
                        status_msg = f"Top {top_n} HITS Authority/Hub Score Candidates:"
                        print(f"HITS analysis successful, {len(results_df)} results.") # Log
                except Exception as e:
                    status_msg = f"An error occurred during HITS analysis: {e}"
                    print(f"Error during HITS analysis: {e}") # Log

        else:
            status_msg = "Invalid analysis type selected."
            print(f"Invalid analysis type: {analysis_type}") # Log
            new_headers = ['URL', 'Score1', 'Score2'] # Fallback for unknown type

        # First column is the URL string, every remaining column is numeric.
        # Gradio's `datatype` needs one entry per header.
        new_datatype = ['str'] + ['number'] * (len(new_headers) - 1)

        print(f"Returning DataFrame update with headers: {new_headers}, datatype: {new_datatype}, and status: {status_msg}") # Log
        # Use gr.update() which is the universal way to update component properties
        return gr.update(value=results_df, headers=new_headers, datatype=new_datatype, col_count=len(new_headers)), status_msg


    def run(self):
        """
        Builds the Gradio Blocks layout, wires the event handlers and launches
        the interface.
        """
        print("Launching Gradio demo...") # Log
        with gr.Blocks(title="URL & HITS Analyzer") as demo:
            gr.Markdown(
                """
                # URL & HITS Analyzer
                This application loads PageRank and link graph data from your mounted Google Drive.
                Please ensure your Google Drive is mounted correctly in Google Colab before running.
                Select an analysis type to view results.
                """
            )
            # Display initial load status for both files
            gr.Markdown("### File Loading Status:")
            gr.Markdown(self.initial_load_status_str)

            # New section for explanation
            gr.Markdown(
                """
                ## Understanding Results for Website Re-architecture

                This tool helps identify pages that are good candidates for improving your website's overall PageRank and structural authority.

                **PageRank Analysis:**
                * **Worst PageRank Candidates:** These are pages with low PageRank values, indicating they are not highly valued by the linking structure of your website (and potentially the broader web). Improving their internal linking (from high PageRank pages) or acquiring external backlinks can significantly boost their visibility and "link juice" distribution.

                **HITS Analysis:**
                The HITS algorithm provides a complementary view by identifying two types of influential pages:
                * **High Authority Score:** These pages are recognized as definitive sources of information on a topic (i.e., they are *pointed to* by many good hubs). If a page has a high Authority score but relatively low PageRank, it suggests the content is valuable, but it might not be receiving enough PageRank flow. Focus on internal linking from high-PageRank pages and external link building to these pages.
                * **High Hub Score:** These pages serve as excellent resource lists, pointing to many good authoritative pages. If a page has a high Hub score but low PageRank, it's a valuable navigational asset but isn't getting enough inbound link equity itself. Boosting the PageRank of such a hub (via internal/external links) will improve the "link juice" it passes to the authorities it links to.

                By understanding these scores, you can strategically re-architect your website's internal linking, content, and external link building efforts to maximize PageRank and improve overall SEO performance.
                """
            )

            with gr.Row():
                analysis_type_radio = gr.Radio(
                    ["PageRank", "HITS"],
                    label="Select Analysis Type",
                    value="PageRank", # Default to PageRank
                    interactive=True
                )

            with gr.Row():
                # PageRank specific inputs (will only be used if PageRank is selected)
                depth_level_input = gr.Slider(
                    minimum=0,
                    maximum=10,
                    step=1,
                    value=1,
                    label="Folder Depth Level (for PageRank)",
                    visible=True # Default to visible for PageRank initially
                )
                top_n_input = gr.Slider(
                    minimum=1,
                    maximum=100,
                    step=1,
                    value=5,
                    label="Number of Top Candidates (N)"
                )

            analyze_button = gr.Button("Perform Analysis")

            status_message_output = gr.Markdown("Analysis results will appear below.")
            output_dataframe = gr.Dataframe(
                # No initial headers, datatype, or col_count: perform_analysis
                # supplies them with every gr.update() it returns.
                row_count=5,
                interactive=True,
                label="Analysis Results",
                visible=True
            )

            # The depth slider only applies to PageRank; hide it for HITS.
            def update_depth_input_visibility(analysis_type):
                return gr.update(visible=(analysis_type == "PageRank"))

            # Bind the radio button change event to update the visibility of the depth slider
            analysis_type_radio.change(
                fn=update_depth_input_visibility,
                inputs=[analysis_type_radio],
                outputs=[depth_level_input]
            )

            # Bind the button click to the main analysis function, which
            # returns the dataframe update and the status message together.
            analyze_button.click(
                fn=self.perform_analysis,
                inputs=[analysis_type_radio, depth_level_input, top_n_input],
                outputs=[output_dataframe, status_message_output]
            )

        demo.launch()

# Main execution block
# Script entry point: mount Google Drive, point the app at the two CSV files,
# then build and launch the Gradio UI.
if __name__ == "__main__":
    # --- IMPORTANT: For Google Colab, mount your drive first ---
    # Imported here rather than at the top of the file, so the import is only
    # attempted when this entry block actually runs.
    from google.colab import drive
    print("Attempting to mount Google Drive...") # Log
    drive.mount('/content/drive')
    print("Google Drive mount command executed.") # Log

    # Define the paths to your CSV files in Google Drive
    # (both live under the mount point used by drive.mount above).
    pagerank_csv_path = '/content/drive/My Drive/master_july_2025/data/url_analysis_results.csv'
    link_graph_csv_path = '/content/drive/My Drive/master_july_2025/data/link_graph_edges.csv'

    print(f"PageRank CSV Path: {pagerank_csv_path}") # Log
    print(f"Link Graph CSV Path: {link_graph_csv_path}") # Log

    # Instantiate and run the Gradio application.
    # GradioApp loads both CSVs in its constructor and records failures as
    # status messages instead of raising, so run() is reached even when a
    # file is missing.
    app = GradioApp(pagerank_csv_path=pagerank_csv_path, link_graph_csv_path=link_graph_csv_path)
    app.run()


Attempting to mount Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mount command executed.
PageRank CSV Path: /content/drive/My Drive/master_july_2025/data/url_analysis_results.csv
Link Graph CSV Path: /content/drive/My Drive/master_july_2025/data/link_graph_edges.csv
GradioApp initialization started.
Attempting to load PageRank data from /content/drive/My Drive/master_july_2025/data/url_analysis_results.csv
CSVLoader initialized with file_path: /content/drive/My Drive/master_july_2025/data/url_analysis_results.csv
Attempting to load data from: /content/drive/My Drive/master_july_2025/data/url_analysis_results.csv
Successfully loaded data. DataFrame shape: (884, 3)
PageRankAnalyzer initialized. Processed DataFrame shape: (884, 3)
Attempting to load Link Graph data from /content/drive/My Drive/master_july_2025/data/link_graph_edges.csv
CSVLoader initialized with file_path: /con