In [None]:
# Mount Google Drive so the project files and saved artifacts under
# "/content/drive" are reachable. The import only succeeds inside Google Colab;
# elsewhere the mount is skipped (same policy as the guarded mount in the next
# cell) so that "Run All" does not stop on the very first cell.
try:
    from google.colab import drive
except ImportError:
    print("Not running in Google Colab environment. Skipping Google Drive mount.")
else:
    # force_remount=True asks Colab to mount again even if Drive is already mounted.
    drive.mount("/content/drive", force_remount=True)

# Install the project dependencies first: pip install -r requirements.txt

In [None]:
# File: notebooks/link_prediction_ui.ipynb - Cell 1

# Installing necessary packages
# !pip install -q torch torch-geometric pandas duckdb pyarrow networkx gradio -q

import sys
import os
import warnings

# Hide the FutureWarning raised from the huggingface_hub.file_download module
warnings.filterwarnings(
    action="ignore",
    category=FutureWarning,
    module="huggingface_hub.file_download",
)

# Put the project root at the front of sys.path (once) so that the
# `src.*` packages imported further down can be resolved.
project_root = "/content/drive/My Drive/WebKnoGraph"  # Explicitly set
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

# Echo the import environment to make path problems easy to diagnose
print(
    f"Project root added to sys.path: {project_root}",
    f"Current working directory: {os.getcwd()}",
    f"sys.path: {sys.path}",
    sep="\n",
)

# Google Colab Drive mount: best effort, never fatal.
# Outside Colab the import fails and the mount is skipped with a message.
try:
    from google.colab import drive

    if os.path.exists("/content/drive/My Drive"):
        print("Google Drive already mounted.")
    else:
        drive.mount("/content/drive/")
        print("Google Drive mounted successfully.")
except ImportError:
    print("Not running in Google Colab environment. Skipping Google Drive mount.")
except Exception as mount_error:
    print(f"Error mounting Google Drive: {mount_error}")

# Third-party and standard-library imports used by the UI and the training pipeline
import gradio as gr
import io
import pandas as pd
import torch
import json

# Specific imports for Link Prediction Pipeline
from src.backend.config.link_prediction_config import LinkPredictionConfig
from src.backend.data.graph_dataloader import GraphDataLoader
from src.backend.data.graph_processor import GraphDataProcessor
from src.backend.models.graph_models import GraphSAGEModel
from src.backend.utils.url_processing import URLProcessor
from src.backend.services.graph_training_service import LinkPredictionTrainer
from src.backend.services.recommendation_engine import RecommendationEngine
from src.shared.logging_config import ConsoleAndGradioLogger

print("All modules imported successfully!")

In [None]:
# File: notebooks/link_prediction_ui.ipynb - Cell 2


def get_all_nodes_for_dropdown():
    """
    Build the list of choices for the "Source URL" dropdown.

    Reads the JSON file at ``LinkPredictionConfig().node_mapping_path`` and
    returns the keys of its ``url_to_idx`` mapping in sorted order.

    Returns:
        list[str]: the sorted URLs on success; otherwise a one-element list
        holding a human-readable message (artifacts missing, metadata without
        ``url_to_idx``, or any exception raised while loading).
    """
    # Throw-away logger: its stream is never read back by the caller.
    dropdown_logger = ConsoleAndGradioLogger(
        io.StringIO(), logger_name="DropdownLogger"
    )

    try:
        metadata_file = LinkPredictionConfig().node_mapping_path

        # Guard: nothing has been trained/saved yet.
        if not os.path.exists(metadata_file):
            dropdown_logger.info(
                "Model artifacts not found. Run training first to generate artifacts."
            )
            return ["Run training first to generate artifacts."]

        with open(metadata_file, "r") as handle:
            metadata = json.load(handle)

        # Guard: file exists but lacks the URL -> index mapping.
        if "url_to_idx" not in metadata:
            dropdown_logger.error("Model metadata is incomplete (missing url_to_idx).")
            return ["Error: Model metadata is incomplete (missing url_to_idx)."]

        return sorted(metadata["url_to_idx"].keys())
    except Exception as load_error:
        # Any failure is reported as a single dropdown entry instead of raising.
        dropdown_logger.exception(
            f"Could not load URLs for dropdown from artifacts: {load_error}"
        )
        return [
            f"Could not load URLs from artifacts: {load_error}. Ensure Google Drive is mounted and artifacts exist."
        ]


def _dropdown_choices_update():
    """Return a Gradio update that replaces the Source URL dropdown's choices."""
    # A plain list sent to a Dropdown output is interpreted as its *value*;
    # gr.update(choices=...) is required to change the selectable options.
    return gr.update(choices=get_all_nodes_for_dropdown())


def _single_cell_frame(column, text):
    """Return a one-row, one-column DataFrame used to show a message in the UI."""
    return pd.DataFrame({column: [text]}, columns=[column])


def run_training_pipeline(
    csv_path,
    embeddings_path,
    hidden_channels,
    out_channels,
    lr,
    epochs,
    progress=gr.Progress(track_tqdm=True),
):
    """
    Train the link-prediction model and save its artifacts, streaming status to the UI.

    This is a generator: each ``yield`` is a 4-tuple
    ``(status text, accumulated log text, results DataFrame or None,
    dropdown update)``, matching the four outputs wired to the train button.

    Args:
        csv_path: Passed to the config as ``edge_csv_path``.
        embeddings_path: Passed to the config as ``embeddings_dir_path``.
        hidden_channels: Cast to ``int`` and passed to the config.
        out_channels: Cast to ``int`` and passed to the config.
        lr: Passed unchanged to the config as ``learning_rate``.
        epochs: Cast to ``int`` and passed to the config.
        progress: Gradio progress tracker used to wrap the training loop.

    Behaviour:
        * If the model state, node embeddings, node mapping and edge index
          files named by the config all exist, training is skipped.
        * Any exception is logged and reported as a final "Pipeline Failed"
          yield instead of being raised.
    """
    log_stream = io.StringIO()
    logger = ConsoleAndGradioLogger(
        log_stream, logger_name="LinkPredictionTrainerLogger"
    )

    # Choices as they are before training; refreshed again at every exit point.
    dropdown_update = _dropdown_choices_update()

    try:
        yield "Step 1/5: Initializing...", log_stream.getvalue(), None, dropdown_update
        config = LinkPredictionConfig(
            edge_csv_path=csv_path,
            embeddings_dir_path=embeddings_path,
            hidden_channels=int(hidden_channels),
            out_channels=int(out_channels),
            learning_rate=lr,
            epochs=int(epochs),
        )
        os.makedirs(config.output_dir, exist_ok=True)

        all_artifacts_exist = (
            os.path.exists(config.model_state_path)
            and os.path.exists(config.node_embeddings_path)
            and os.path.exists(config.node_mapping_path)
            and os.path.exists(config.edge_index_path)
        )

        if all_artifacts_exist:
            status_message = (
                "✅ All artifacts already exist. Skipping training and saving."
            )
            logger.info(status_message)
            yield (
                status_message,
                log_stream.getvalue(),
                _single_cell_frame(
                    "Message",
                    "Artifacts found. You can now use the recommendation tab.",
                ),
                _dropdown_choices_update(),
            )
            return

        yield (
            "Step 2/5: Loading & processing data...",
            log_stream.getvalue(),
            None,
            dropdown_update,
        )
        loader = GraphDataLoader(config, logger)
        node_features_df, edge_list_df = loader.load_data()
        processor = GraphDataProcessor(logger)
        data, url_to_idx = processor.process(node_features_df, edge_list_df)

        yield (
            "Step 3/5: Initializing model...",
            log_stream.getvalue(),
            None,
            dropdown_update,
        )
        model = GraphSAGEModel(
            in_channels=data.num_node_features,
            hidden_channels=config.hidden_channels,
            out_channels=config.out_channels,
        )
        trainer = LinkPredictionTrainer(model, data, config, logger)

        yield (
            "Step 4/5: Training model...",
            log_stream.getvalue(),
            None,
            dropdown_update,
        )
        for epoch, loss in progress.tqdm(
            trainer.train(), total=config.epochs, desc="Training Model"
        ):
            # Log the first epoch and every tenth one to keep the log short.
            if epoch % 10 == 0 or epoch == 1:
                logger.info(f"Epoch {epoch}/{config.epochs}, Loss: {loss:.4f}")

        yield (
            "Step 5/5: Evaluating and saving artifacts...",
            log_stream.getvalue(),
            None,
            dropdown_update,
        )
        model.eval()
        with torch.no_grad():
            final_node_embeddings = model(data.x, data.edge_index)

        # The metadata file carries both the URL mapping (read back by the
        # dropdown loader) and the model dimensions.
        logger.info(f"Saving model metadata to {config.node_mapping_path}")
        model_metadata = {
            "url_to_idx": url_to_idx,
            "in_channels": data.num_node_features,
            "hidden_channels": config.hidden_channels,
            "out_channels": config.out_channels,
        }
        with open(config.node_mapping_path, "w") as f:
            json.dump(model_metadata, f, indent=2)

        logger.info(f"Saving model weights to {config.model_state_path}")
        torch.save(model.state_dict(), config.model_state_path)
        logger.info(f"Saving final node embeddings to {config.node_embeddings_path}")
        torch.save(final_node_embeddings, config.node_embeddings_path)
        logger.info(f"Saving edge index to {config.edge_index_path}")
        torch.save(data.edge_index, config.edge_index_path)

        final_status = "✅ Pipeline Finished Successfully!"
        logger.info(final_status)
        yield (
            final_status,
            log_stream.getvalue(),
            _single_cell_frame(
                "Message",
                "Artifacts saved successfully. You can now use the recommendation tab.",
            ),
            # Re-read the freshly written metadata so the new URLs become selectable.
            _dropdown_choices_update(),
        )

    except Exception as e:
        logger.exception(f"A critical error occurred: {e}")
        yield (
            "Pipeline Failed",
            log_stream.getvalue(),
            _single_cell_frame("Error", str(e)),
            _dropdown_choices_update(),
        )


def run_recommendation_interface(source_url: str, min_depth: int, max_depth: int):
    """
    Produce link recommendations for the URL selected in the UI.

    Args:
        source_url: Value of the "Source URL" dropdown. May be ``None``/empty
            when nothing is selected, or one of the placeholder/error strings
            that the dropdown loader emits when no artifacts are available.
        min_depth: Minimum folder depth filter; ``None`` is treated as 0.
        max_depth: Maximum folder depth filter; ``None`` is treated as 100.

    Returns:
        tuple: ``(DataFrame or None, log/error text)`` for the results table
        and the log textbox.
    """
    # Must come first: calling str methods on None (nothing selected) would raise.
    if not source_url or not isinstance(source_url, str):
        return None, "Please select a source URL from the dropdown."

    # Strings the dropdown shows instead of URLs when artifacts are missing,
    # incomplete, or failed to load; none of them is a real node.
    placeholder_message = "Run training first to generate artifacts."
    sentinel_prefixes = ("Error:", "Could not load URLs from artifacts:")
    if source_url == placeholder_message or source_url.startswith(sentinel_prefixes):
        return pd.DataFrame(
            {
                "Error": [
                    "Please train the model first and select a valid URL from the dropdown. Current selection is a placeholder or error message."
                ]
            },
            columns=["Error"],
        ), f"Error: Selected source URL is a placeholder or invalid: '{source_url}'"

    # Validate the depth range before building the engine, so that invalid
    # input is rejected without touching the saved artifacts.
    if min_depth is None:
        min_depth = 0
    if max_depth is None:
        max_depth = 100
    if min_depth > max_depth:
        return pd.DataFrame(
            {
                "Error": [
                    "Minimum folder depth cannot be greater than maximum folder depth."
                ]
            },
            columns=["Error"],
        ), "Error: Minimum folder depth cannot be greater than maximum folder depth."

    log_stream = io.StringIO()
    logger = ConsoleAndGradioLogger(
        log_stream, logger_name="RecommendationEngineLogger"
    )
    config = LinkPredictionConfig()
    url_processor = URLProcessor()
    engine = RecommendationEngine(config, logger, url_processor)

    recommendations_df, error_msg = engine.get_recommendations(
        source_url, top_n=20, min_folder_depth=min_depth, max_folder_depth=max_depth
    )
    if error_msg:
        logger.error(error_msg)
        return pd.DataFrame(
            {"Error": [error_msg]}, columns=["Error"]
        ), log_stream.getvalue()

    if recommendations_df is None or recommendations_df.empty:
        logger.info("No recommendations found matching the specified filters.")
        return pd.DataFrame(
            {
                "Message": [
                    "No recommendations found matching the specified filters. Try adjusting your depth range."
                ]
            },
            columns=["Message"],
        ), log_stream.getvalue()

    return recommendations_df, log_stream.getvalue()

In [None]:
# File: notebooks/link_prediction_ui.ipynb - Cell 3

# Build the two-tab Gradio UI ("Train Model" / "Get Link Recommendations") and
# wire its buttons to the pipeline functions defined in the previous cell.
# Components are laid out in the order and nesting in which they are created.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📈 GNN Link Prediction & Recommendation Engine")
    gr.Markdown(
        "First, use the 'Train Model' tab to process your data. Then, use the 'Get Link Recommendations' tab to get predictions for new, non-existent links."
    )

    with gr.Tabs():
        # ---- Tab 1: training configuration (left) and live status (right) ----
        with gr.TabItem("Train Model"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("## 1. Configuration")
                    # Initial values are read from class-level attributes of
                    # LinkPredictionConfig (no instance is created here).
                    with gr.Accordion("Data Paths", open=True):
                        edge_csv_path_input = gr.Textbox(
                            label="Edge List CSV Path",
                            value=LinkPredictionConfig.edge_csv_path,
                        )
                        embeddings_dir_path_input = gr.Textbox(
                            label="Embeddings Directory Path",
                            value=LinkPredictionConfig.embeddings_dir_path,
                        )
                    with gr.Accordion("Model Hyperparameters", open=True):
                        hidden_channels_input = gr.Number(
                            label="Hidden Channels",
                            value=LinkPredictionConfig.hidden_channels,
                        )
                        out_channels_input = gr.Number(
                            label="Output Embedding Size",
                            value=LinkPredictionConfig.out_channels,
                        )
                    with gr.Accordion("Training Parameters", open=True):
                        learning_rate_input = gr.Number(
                            label="Learning Rate",
                            value=LinkPredictionConfig.learning_rate,
                        )
                        epochs_input = gr.Number(
                            label="Training Epochs", value=LinkPredictionConfig.epochs
                        )
                    start_button = gr.Button(
                        "Train Link Prediction Model", variant="primary"
                    )
                with gr.Column(scale=2):
                    gr.Markdown("## 2. Training Status")
                    train_status_output = gr.Textbox(
                        label="Current Status", interactive=False
                    )
                    train_log_output = gr.Textbox(
                        label="Pipeline Logs", interactive=False, lines=15
                    )
                    train_results_output = gr.DataFrame(
                        label="Training Completion Status"
                    )

        # ---- Tab 2: pick a source URL, set depth filters, view recommendations ----
        with gr.TabItem("Get Link Recommendations"):
            gr.Markdown("## 1. Select a Source Page & Filters")
            gr.Markdown(
                "Choose a URL and the model will recommend top pages it should link to. (You must train the model on the tab to the left first)."
            )
            with gr.Row():
                # Choices are computed once, when this cell runs; the list holds
                # either URLs or a single placeholder/error message.
                source_url_dropdown = gr.Dropdown(
                    label="Source URL",
                    choices=get_all_nodes_for_dropdown(),
                    interactive=True,
                )
            with gr.Row():
                # Defaults mirror the fallbacks used by run_recommendation_interface
                # when a depth field is left empty (0 and 100).
                min_folder_depth_input = gr.Number(
                    label="Minimum Folder Depth", value=0, precision=0
                )
                max_folder_depth_input = gr.Number(
                    label="Maximum Folder Depth", value=100, precision=0
                )

            recommend_button = gr.Button("Get Recommendations", variant="primary")
            gr.Markdown("## 2. Results: High-Potential Missing Links")
            recommend_results_output = gr.DataFrame(
                label="Top Link Recommendations",
                headers=["RECOMMENDED_URL", "SCORE", "FOLDER_DEPTH"],
            )
            recommend_log_output = gr.Textbox(label="Logs", interactive=False, lines=4)

    # Event wiring. `inputs` are passed positionally to the handler, and each
    # item the handler yields/returns is routed to `outputs` in the same order.
    # run_training_pipeline yields 4-tuples: status, logs, results frame, and a
    # fourth item targeting the Source URL dropdown on the other tab.
    start_button.click(
        fn=run_training_pipeline,
        inputs=[
            edge_csv_path_input,
            embeddings_dir_path_input,
            hidden_channels_input,
            out_channels_input,
            learning_rate_input,
            epochs_input,
        ],
        outputs=[
            train_status_output,
            train_log_output,
            train_results_output,
            source_url_dropdown,
        ],
    )

    # run_recommendation_interface returns (results frame or None, log text).
    recommend_button.click(
        fn=run_recommendation_interface,
        inputs=[source_url_dropdown, min_folder_depth_input, max_folder_depth_input],
        outputs=[recommend_results_output, recommend_log_output],
    )

In [None]:
# File: notebooks/link_prediction_ui.ipynb - Cell 4

if __name__ == "__main__":
    # Step 1: best-effort Google Drive mount. Kept in its own try block so that
    # a missing google.colab package (i.e. running outside Colab) or a failed
    # mount does not prevent the UI from starting, the same policy as the
    # guarded mount in the setup cell.
    try:
        from google.colab import drive

        if not os.path.exists("/content/drive/My Drive"):
            drive.mount("/content/drive/")
            print("Google Drive mounted successfully.")
        else:
            print("Google Drive already mounted.")
    except ImportError:
        print("Not running in Google Colab environment. Skipping Google Drive mount.")
    except Exception as e:
        print(f"Error mounting Google Drive: {e}")

    # Step 2: start the Gradio app. Launch failures are reported, not raised,
    # so the cell finishes cleanly either way.
    try:
        demo.launch(debug=True, share=True)
    except Exception as e:
        print(f"Could not launch Gradio demo in this environment: {e}")