In [None]:
!pip install google-auth -Uq


In [None]:
!pip install --upgrade google-api-python-client -Uq


In [None]:
!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib google-auth -Uq


In [20]:
from __future__ import annotations

import os
import time
import random
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, Iterable, List, Optional, Tuple, Protocol

# Google deps:
#   pip install google-api-python-client google-auth google-auth-oauthlib
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from google.oauth2 import service_account
from google.auth.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request


# -----------------------------
# Types + small abstractions
# -----------------------------

SCOPES = (
    # "https://www.googleapis.com/auth/webmasters.readonly",
    "https://www.googleapis.com/auth/webmasters"
)

SearchRow = Dict[str, Any]
JsonDict = Dict[str, Any]


class SearchConsoleService(Protocol):
    """Protocol to allow dependency injection/mocking in tests."""

    def searchanalytics(self) -> Any: ...


@dataclass(frozen=True)
class GSCQueryConfig:
    property_url: str
    window_days: int = 28
    search_type: str = "web"  # "web", "image", "video", "news", "discover", "googleNews"
    row_limit: int = 25000
    max_rows_per_report: int = 50000  # practical safety ceiling per segment/report
    sleep_between_calls_s: float = 0.2
    max_retries: int = 5


@dataclass(frozen=True)
class SegmentReport:
    """Normalized output for one segment (appearance) and one entity type (page/query)."""
    search_appearance: str
    rows: List[SearchRow]


# -----------------------------
# Analyze → Decompose
# -----------------------------

def authenticate_gsc(
    *,
    auth_mode: str = "service_account",
    service_account_json_path: Optional[str] = None,
    oauth_client_secrets_path: Optional[str] = None,
    oauth_token_path: Optional[str] = None,
) -> Credentials:
    """
    Authenticate to the Search Console API.

    Supported modes:
      - service_account: uses a service account JSON key file
      - oauth: interactive OAuth flow, stores refresh token

    Notes:
      - For service accounts, the GSC property must be shared with the service account email.
      - For OAuth, this is suitable for local/dev; production typically uses stored tokens.
    """
    auth_mode = auth_mode.strip().lower()

    if auth_mode == "service_account":
        if not service_account_json_path:
            service_account_json_path = os.getenv("GSC_SERVICE_ACCOUNT_JSON")

        if not service_account_json_path:
            raise ValueError("service_account_json_path is required (or set GSC_SERVICE_ACCOUNT_JSON).")

        creds = service_account.Credentials.from_service_account_file(
            service_account_json_path,
            scopes=list(SCOPES),
        )
        return creds

    if auth_mode == "oauth":
        if not oauth_client_secrets_path:
            oauth_client_secrets_path = os.getenv("GSC_OAUTH_CLIENT_SECRETS")

        if not oauth_client_secrets_path:
            raise ValueError("oauth_client_secrets_path is required (or set GSC_OAUTH_CLIENT_SECRETS).")

        if not oauth_token_path:
            oauth_token_path = os.getenv("GSC_OAUTH_TOKEN_PATH", "gsc_oauth_token.json")

        creds: Optional[Credentials] = None
        if os.path.exists(oauth_token_path):
            from google.oauth2.credentials import Credentials as UserCredentials  # type: ignore
            creds = UserCredentials.from_authorized_user_file(oauth_token_path, scopes=list(SCOPES))

        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        elif not creds or not creds.valid:
            flow = InstalledAppFlow.from_client_secrets_file(oauth_client_secrets_path, scopes=list(SCOPES))
            creds = flow.run_local_server(port=0)

            with open(oauth_token_path, "w", encoding="utf-8") as f:
                f.write(creds.to_json())

        assert creds is not None
        return creds

    raise ValueError(f"Unsupported auth_mode={auth_mode!r}. Use 'service_account' or 'oauth'.")


def build_gsc_service(creds: Credentials) -> Any:
    """
    Build the googleapiclient Search Console service.
    """
    # discovery cache disabled to avoid file system surprises in some runtimes
    return build("searchconsole", "v1", credentials=creds, cache_discovery=False)


def _iso_date(d: datetime) -> str:
    return d.strftime("%Y-%m-%d")


def _now_iso8601_utc() -> str:
    return datetime.now(timezone.utc).replace(microsecond=0).isoformat()


def _date_window(window_days: int) -> Tuple[str, str]:
    """
    Returns (startDate, endDate) in YYYY-MM-DD.
    GSC Search Analytics expects dates in PT in docs, but API accepts date strings; the service handles it.
    We use an inclusive window ending "yesterday" to avoid partial same-day reporting volatility.
    """
    today_utc = datetime.now(timezone.utc).date()
    end = today_utc - timedelta(days=1)
    start = end - timedelta(days=window_days - 1)
    return start.isoformat(), end.isoformat()


# -----------------------------
# Implement → Query
# -----------------------------

def query_search_analytics(
    service: Any,
    *,
    site_url: str,
    start_date: str,
    end_date: str,
    dimensions: List[str],
    search_type: str = "web",
    dimension_filter_groups: Optional[List[Dict[str, Any]]] = None,
    row_limit: int = 25000,
    max_rows: int = 50000,
    sleep_between_calls_s: float = 0.2,
    max_retries: int = 5,
) -> List[SearchRow]:
    """
    Query Search Analytics with pagination via startRow.

    Returns the concatenated list of rows.
    """
    rows: List[SearchRow] = []
    start_row = 0

    while True:
        body: Dict[str, Any] = {
            "startDate": start_date,
            "endDate": end_date,
            "dimensions": dimensions,
            "type": search_type,
            "rowLimit": min(row_limit, max(1, max_rows - len(rows))),
            "startRow": start_row,
        }
        if dimension_filter_groups:
            body["dimensionFilterGroups"] = dimension_filter_groups

        batch = _execute_with_backoff(
            lambda: service.searchanalytics().query(siteUrl=site_url, body=body).execute(),
            max_retries=max_retries,
        )

        batch_rows = batch.get("rows", []) or []
        rows.extend(batch_rows)

        if sleep_between_calls_s > 0:
            time.sleep(sleep_between_calls_s)

        # Stop conditions
        if not batch_rows:
            break
        if len(batch_rows) < body["rowLimit"]:
            break
        if len(rows) >= max_rows:
            break

        start_row += body["rowLimit"]

    return rows


def _execute_with_backoff(fn, *, max_retries: int = 5):
    """
    Exponential backoff for transient HTTP errors and quota pressure.
    """
    for attempt in range(max_retries + 1):
        try:
            return fn()
        except HttpError as e:
            status = getattr(e.resp, "status", None)
            # Retry 429/5xx + some 403 quota cases
            retryable = status in (429, 500, 502, 503, 504, 403)
            if not retryable or attempt == max_retries:
                raise

            # jittered exponential backoff
            sleep_s = min(2 ** attempt, 32) + random.random()
            time.sleep(sleep_s)


# -----------------------------
# Implement → Filter by searchAppearance
# -----------------------------

def filter_by_search_appearance(rows: List[SearchRow]) -> List[str]:
    """
    Extract searchAppearance keys from rows grouped by ['searchAppearance'].
    """
    appearances: List[str] = []
    for r in rows:
        keys = r.get("keys", [])
        if not keys:
            continue
        # When grouped only by searchAppearance, keys[0] is the appearance label
        appearance = str(keys[0]).strip()
        if appearance and appearance not in appearances:
            appearances.append(appearance)
    return appearances


def _appearance_filter_group(appearance: str) -> List[Dict[str, Any]]:
    """
    Build a filter group that selects a single searchAppearance value.
    Using equals avoids weird edge cases with notEquals/notContains.
    """
    return [{
        "groupType": "and",
        "filters": [{
            "dimension": "searchAppearance",
            "operator": "equals",
            "expression": appearance,
        }],
    }]


# -----------------------------
# Normalize → Output shaping
# -----------------------------

def normalize_gsc_data(
    *,
    appearance: str,
    rows: List[SearchRow],
    key_name: str,
) -> List[JsonDict]:
    """
    Normalize GSC API rows into stable dicts.

    key_name: "page" or "query"
    """
    normalized: List[JsonDict] = []
    for r in rows:
        keys = r.get("keys", []) or []
        if not keys:
            continue

        # When filtered by appearance and grouped by [page] or [query], keys[0] is the value.
        value = str(keys[0])

        clicks = float(r.get("clicks", 0.0) or 0.0)
        impressions = float(r.get("impressions", 0.0) or 0.0)
        ctr = float(r.get("ctr", (clicks / impressions) if impressions else 0.0) or 0.0)
        position = float(r.get("position", 0.0) or 0.0)

        normalized.append({
            "searchAppearance": appearance,
            key_name: value,
            "clicks": clicks,
            "impressions": impressions,
            "ctr": ctr,
            "position": position,
        })

    return normalized


def build_state_patch(
    *,
    window_days: int,
    pages: List[JsonDict],
    queries: List[JsonDict],
) -> JsonDict:
    """
    Build the required patch structure exactly.
    """
    return {
        "inputs": {
            "gsc": {
                "collected_at": _now_iso8601_utc(),
                "window_days": int(window_days),
                "pages": pages,
                "queries": queries,
                # Not available under the "Search Analytics only" constraint:
                "index_coverage": [],
                "sitemap_status": [],
            }
        }
    }


# -----------------------------
# Return → Node
# -----------------------------

def collect_gsc(state: JsonDict, *, auth_mode: str = "service_account") -> JsonDict:
    """
    Node: collect_gsc

    READS:
      - run.domain
      - config.integrations.gsc_property_url

    WRITES:
      - inputs.gsc

    Behavior:
      1) discovers top searchAppearance segments
      2) for each segment, pulls top pages and queries for the same date window
      3) normalizes output into a state patch
    """
    domain = _get_path(state, "run.domain")
    property_url = _get_path(state, "config.integrations.gsc_property_url")

    if not property_url or not isinstance(property_url, str):
        raise ValueError("Missing config.integrations.gsc_property_url (expected string).")

    # Optional: allow a property_url template like "sc-domain:{domain}"
    if isinstance(domain, str) and "{domain}" in property_url:
        clean_domain = domain.removeprefix("sc-domain:").replace("https://", "").replace("http://", "").split("/")[0]
        property_url = property_url.format(domain=clean_domain)


    cfg = GSCQueryConfig(property_url=property_url)

    creds = authenticate_gsc(
        auth_mode="oauth",
        oauth_client_secrets_path=r"C:\Users\Aurum\vscode\E-commerce-SEO-Agent\client_secret_733345724329-vsr9dtaic0aq0e97gbv2eqmegi15tq8t.apps.googleusercontent.com.json"
    )

    service = build_gsc_service(creds)

    start_date, end_date = _date_window(cfg.window_days)

    # 1) Discover appearance segments (top appearances)
    appearance_rows = query_search_analytics(
        service,
        site_url=cfg.property_url,
        start_date=start_date,
        end_date=end_date,
        dimensions=["searchAppearance"],
        search_type=cfg.search_type,
        row_limit=min(cfg.row_limit, 5000),  # discovery doesn't need max page size
        max_rows=5000,
        sleep_between_calls_s=cfg.sleep_between_calls_s,
        max_retries=cfg.max_retries,
    )
    appearances = filter_by_search_appearance(appearance_rows)

    # If GSC returns nothing (small/new sites), keep outputs empty but valid.
    all_pages: List[JsonDict] = []
    all_queries: List[JsonDict] = []

    # 2) Pull pages + queries per appearance
    for appearance in appearances:
        filters = _appearance_filter_group(appearance)

        # Pages report (filtered by appearance)
        page_rows = query_search_analytics(
            service,
            site_url=cfg.property_url,
            start_date=start_date,
            end_date=end_date,
            dimensions=["page"],
            search_type=cfg.search_type,
            dimension_filter_groups=filters,
            row_limit=cfg.row_limit,
            max_rows=cfg.max_rows_per_report,
            sleep_between_calls_s=cfg.sleep_between_calls_s,
            max_retries=cfg.max_retries,
        )
        all_pages.extend(normalize_gsc_data(appearance=appearance, rows=page_rows, key_name="page"))

        # Queries report (filtered by appearance)
        query_rows = query_search_analytics(
            service,
            site_url=cfg.property_url,
            start_date=start_date,
            end_date=end_date,
            dimensions=["query"],
            search_type=cfg.search_type,
            dimension_filter_groups=filters,
            row_limit=cfg.row_limit,
            max_rows=cfg.max_rows_per_report,
            sleep_between_calls_s=cfg.sleep_between_calls_s,
            max_retries=cfg.max_retries,
        )
        all_queries.extend(normalize_gsc_data(appearance=appearance, rows=query_rows, key_name="query"))

    # 3) Build patch (exact shape)
    return build_state_patch(
        window_days=cfg.window_days,
        pages=all_pages,
        queries=all_queries,
    )


# -----------------------------
# Utilities
# -----------------------------

def _get_path(obj: Dict[str, Any], path: str) -> Any:
    """
    Safe nested dict getter: "a.b.c"
    """
    cur: Any = obj
    for part in path.split("."):
        if not isinstance(cur, dict) or part not in cur:
            return None
        cur = cur[part]
    return cur


In [21]:
state = {
    "run": {"domain": "napkin.ai"},
    "config": {
        "integrations": {
            # "gsc_property_url": "https://napkin.ai/"
            "gsc_property_url": "sc-domain:napkin.ai"
        }
    }
}


result = collect_gsc(state, auth_mode="oauth")
print(result)



HttpError: <HttpError 403 when requesting https://searchconsole.googleapis.com/webmasters/v3/sites/sc-domain%3Anapkin.ai/searchAnalytics/query?alt=json returned "User does not have sufficient permission for site 'sc-domain:napkin.ai'. See also: https://support.google.com/webmasters/answer/2451999.". Details: "[{'message': "User does not have sufficient permission for site 'sc-domain:napkin.ai'. See also: https://support.google.com/webmasters/answer/2451999.", 'domain': 'global', 'reason': 'forbidden'}]">

In [None]:
state["config"]["integrations"]["gsc_property_url"] = "https://napkin.ai/"


result = collect_gsc(state, auth_mode="oauth")
print(result)


In [10]:
!del gsc_oauth_token.json


In [32]:
# Use the OAuth credentials and service already authenticated in the notebook
SITE_URL = ['https://www.paddleaurum.com/']

# Define query parameters
request = {
    'startDate': '2023-10-01',
    'endDate': '2023-10-31',
    'dimensions': ['query', 'page'],
    'rowLimit': 100
}

# Authenticate and build service
creds = authenticate_gsc(
    auth_mode="oauth",
    oauth_client_secrets_path=r"C:\Users\Aurum\vscode\E-commerce-SEO-Agent\client_secret_733345724329-vsr9dtaic0aq0e97gbv2eqmegi15tq8t.apps.googleusercontent.com.json",
    oauth_token_path="gsc_oauth_token.json"
)
service = build_gsc_service(creds)

# Execute query using the service already built in Cell 3
response = service.searchanalytics().query(siteUrl=SITE_URL, body=request).execute()

# Print results
for row in response.get('rows', []):
    print(f"Query: {row['keys'][0]}, Clicks: {row['clicks']}, Impressions: {row['impressions']}")


RefreshError: ('invalid_scope: Some requested scopes were invalid. {invalid=[a, b, c, e, g, h, i, l, m, ., /, o, p, r, s, t, u, w, :]}', {'error': 'invalid_scope', 'error_description': 'Some requested scopes were invalid. {invalid=[a, b, c, e, g, h, i, l, m, ., /, o, p, r, s, t, u, w, :]}', 'error_uri': 'https://developers.google.com/identity/protocols/oauth2'})

In [35]:
def collect_gsc(state: JsonDict, *, auth_mode: str = "service_account") -> JsonDict:
    """
    Node: collect_gsc

    READS:
      - run.domain
      - config.integrations.gsc_property_url

    WRITES:
      - inputs.gsc

    Behavior:
      1) discovers top searchAppearance segments
      2) for each segment, pulls top pages and queries for the same date window
      3) normalizes output into a state patch
    """
    domain = _get_path(state, "run.domain")
    property_url = _get_path(state, "config.integrations.gsc_property_url")

    if not property_url or not isinstance(property_url, str):
        raise ValueError("Missing config.integrations.gsc_property_url (expected string).")

    # Optional: allow a property_url template like "sc-domain:{domain}"
    if isinstance(domain, str) and "{domain}" in property_url:
        clean_domain = domain.removeprefix("sc-domain:").replace("https://", "").replace("http://", "").split("/")[0]
        property_url = property_url.format(domain=clean_domain)

    cfg = GSCQueryConfig(property_url=property_url)

    # Get OAuth secrets path from environment or use default
    oauth_secrets = os.getenv(
        "GSC_OAUTH_CLIENT_SECRETS",
        r"C:\Users\Aurum\vscode\E-commerce-SEO-Agent\client_secret_733345724329-vsr9dtaic0aq0e97gbv2eqmegi15tq8t.apps.googleusercontent.com.json"
    )

    creds = authenticate_gsc(
        auth_mode="oauth",
        oauth_client_secrets_path=oauth_secrets
    )

    service = build_gsc_service(creds)
    start_date, end_date = _date_window(cfg.window_days)

    # 1) Discover appearance segments (top appearances)
    appearance_rows = query_search_analytics(
        service,
        site_url=cfg.property_url,
        start_date=start_date,
        end_date=end_date,
        dimensions=["searchAppearance"],
        search_type=cfg.search_type,
        row_limit=min(cfg.row_limit, 5000),
        max_rows=5000,
        sleep_between_calls_s=cfg.sleep_between_calls_s,
        max_retries=cfg.max_retries,
    )
    appearances = filter_by_search_appearance(appearance_rows)

    # If GSC returns nothing (small/new sites), keep outputs empty but valid.
    all_pages: List[JsonDict] = []
    all_queries: List[JsonDict] = []

    # 2) Pull pages + queries per appearance
    for appearance in appearances:
        filters = _appearance_filter_group(appearance)

        # Pages report (filtered by appearance)
        page_rows = query_search_analytics(
            service,
            site_url=cfg.property_url,
            start_date=start_date,
            end_date=end_date,
            dimensions=["page"],
            search_type=cfg.search_type,
            dimension_filter_groups=filters,
            row_limit=cfg.row_limit,
            max_rows=cfg.max_rows_per_report,
            sleep_between_calls_s=cfg.sleep_between_calls_s,
            max_retries=cfg.max_retries,
        )
        all_pages.extend(normalize_gsc_data(appearance=appearance, rows=page_rows, key_name="page"))

        # Queries report (filtered by appearance)
        query_rows = query_search_analytics(
            service,
            site_url=cfg.property_url,
            start_date=start_date,
            end_date=end_date,
            dimensions=["query"],
            search_type=cfg.search_type,
            dimension_filter_groups=filters,
            row_limit=cfg.row_limit,
            max_rows=cfg.max_rows_per_report,
            sleep_between_calls_s=cfg.sleep_between_calls_s,
            max_retries=cfg.max_retries,
        )
        all_queries.extend(normalize_gsc_data(appearance=appearance, rows=query_rows, key_name="query"))

    # 3) Build patch (exact shape)
    return build_state_patch(
        window_days=cfg.window_days,
        pages=all_pages,
        queries=all_queries,
    )


In [36]:
result = collect_gsc(state, auth_mode="oauth")
print(result)


RefreshError: ('invalid_scope: Some requested scopes were invalid. {invalid=[a, b, c, e, g, h, i, l, m, ., /, o, p, r, s, t, u, w, :]}', {'error': 'invalid_scope', 'error_description': 'Some requested scopes were invalid. {invalid=[a, b, c, e, g, h, i, l, m, ., /, o, p, r, s, t, u, w, :]}', 'error_uri': 'https://developers.google.com/identity/protocols/oauth2'})

In [71]:
"""
Google Search Console Data Collection Node
Author: Senior Python Engineer with SEO Tooling Expertise
Description: Collects GSC Search Analytics data segmented by searchAppearance to infer structured data performance.
"""

import logging
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass, asdict
import json

# Google API imports
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Constants
DEFAULT_WINDOW_DAYS = 28
MAX_ROWS_PER_QUERY = 25000  # API limit per request [citation:9]
SEARCH_TYPES = ["web", "news", "video", "image", "discover", "googleNews"]
SCOPES = ["https://www.googleapis.com/auth/webmasters.readonly"]

# Mapping from searchAppearance to structured data types [citation:3]
SEARCH_APPEARANCE_TO_STRUCTURED_TYPE = {
    # Rich Results with GSC reporting
    "AMP Top Story": "article",
    "Education Q&A": "education_qa",
    "FAQ": "faq",
    "Job Listing": "job_posting",
    "Job Details": "job_posting",
    "Merchant Listing": "product",
    "Product Snippet": "product",
    "Q&A": "qa_page",
    "Review Snippet": "review",
    "Recipe Gallery": "recipe",
    "Video": "video",
    # Non-rich result appearances
    "AMP non-rich result": "amp",
    "Android App": "android_app",
    "Media Actions": "media_actions",
    "Translated Result": "translated",
    "Web Light": "web_light"
}

@dataclass
class PageMetrics:
    """Normalized page performance metrics"""
    url: str
    clicks: int
    impressions: int
    ctr: float
    position: float
    search_appearances: List[str]
    structured_data_types: List[str]

@dataclass
class QueryMetrics:
    """Normalized query performance metrics"""
    query: str
    clicks: int
    impressions: int
    ctr: float
    position: float
    search_appearances: List[str]
    structured_data_types: List[str]

@dataclass
class SitemapStatus:
    """Sitemap submission status"""
    path: str
    last_submitted: str
    is_pending: bool
    errors: List[str]

class GSCClient:
    """Google Search Console API client with service account authentication"""
    
    def __init__(self, credentials_path: str, property_url: str):
        """
        Initialize GSC client with service account credentials.
        
        Args:
            credentials_path: Path to service account JSON key file
            property_url: GSC property URL (e.g., 'https://www.example.com/')
        """
        self.credentials_path = credentials_path
        self.property_url = property_url
        self.service = None
        
    def authenticate(self) -> None:
        """
        Authenticate using service account credentials and build GSC service.
        
        Raises:
            FileNotFoundError: If credentials file doesn't exist
            ValueError: If authentication fails
        """
        try:
            credentials = service_account.Credentials.from_service_account_file(
                self.credentials_path, scopes=SCOPES
            )
            self.service = build('searchconsole', 'v1', credentials=credentials)
            logger.info(f"Authenticated to GSC for property: {self.property_url}")
        except FileNotFoundError as e:
            logger.error(f"Credentials file not found: {self.credentials_path}")
            raise
        except Exception as e:
            logger.error(f"Authentication failed: {str(e)}")
            raise ValueError(f"GSC authentication failed: {str(e)}")
    
    def query_search_analytics(
        self,
        start_date: str,
        end_date: str,
        dimensions: List[str],
        row_limit: int = MAX_ROWS_PER_QUERY
    ) -> List[Dict]:
        """
        Query Search Analytics API for specified dimensions and date range.
        
        Args:
            start_date: Start date in YYYY-MM-DD format
            end_date: End date in YYYY-MM-DD format
            dimensions: List of dimensions to group by
            row_limit: Maximum rows to return (max 25000)
            
        Returns:
            List of row data from API
            
        Raises:
            HttpError: If API request fails
        """
        if not self.service:
            raise RuntimeError("GSC service not initialized. Call authenticate() first.")
        
        # Validate dimensions include searchAppearance for structured data inference
        if "searchAppearance" not in dimensions:
            logger.warning("searchAppearance dimension not included - structured data inference limited")
        
        request_body = {
            "startDate": start_date,
            "endDate": end_date,
            "dimensions": dimensions,
            "rowLimit": min(row_limit, MAX_ROWS_PER_QUERY),
            "dimensionFilterGroups": [{
                "groupType": "and",
                "filters": []
            }]
        }
        
        try:
            logger.info(f"Querying GSC data from {start_date} to {end_date} with dimensions: {dimensions}")
            response = self.service.searchanalytics().query(
                siteUrl=self.property_url,
                body=request_body
            ).execute()
            
            rows = response.get('rows', [])
            logger.info(f"Retrieved {len(rows)} rows from GSC API")
            return rows
            
        except HttpError as e:
            logger.error(f"GSC API query failed: {str(e)}")
            raise
    
    def fetch_sitemap_status(self) -> List[SitemapStatus]:
        """
        Fetch sitemap submission status for the property.
        
        Returns:
            List of sitemap status objects
            
        Raises:
            HttpError: If API request fails
        """
        if not self.service:
            raise RuntimeError("GSC service not initialized. Call authenticate() first.")
        
        try:
            response = self.service.sitemaps().list(siteUrl=self.property_url).execute()
            sitemaps = response.get('sitemap', [])
            
            status_list = []
            for sitemap in sitemaps:
                status = SitemapStatus(
                    path=sitemap.get('path', ''),
                    last_submitted=sitemap.get('lastSubmitted', ''),
                    is_pending=sitemap.get('isPending', False),
                    errors=sitemap.get('errors', [])
                )
                status_list.append(status)
            
            logger.info(f"Retrieved {len(status_list)} sitemaps")
            return status_list
            
        except HttpError as e:
            logger.error(f"Failed to fetch sitemaps: {str(e)}")
            return []

def filter_by_search_appearance(rows: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
    """
    Separate rows with and without searchAppearance data.
    
    Args:
        rows: Raw rows from GSC API
        
    Returns:
        Tuple of (rows_with_appearance, rows_without_appearance)
    """
    with_appearance = []
    without_appearance = []
    
    for row in rows:
        keys = row.get('keys', [])
        # Check if searchAppearance dimension exists (it's the third dimension in our query)
        if len(keys) > 2 and keys[2]:  # searchAppearance is at index 2 when dimensions = ['page', 'query', 'searchAppearance']
            with_appearance.append(row)
        else:
            without_appearance.append(row)
    
    logger.info(f"Filtered {len(with_appearance)} rows with searchAppearance, "
                f"{len(without_appearance)} rows without")
    return with_appearance, without_appearance

def normalize_gsc_data(
    rows_with_appearance: List[Dict],
    rows_without_appearance: List[Dict]
) -> Tuple[List[PageMetrics], List[QueryMetrics]]:
    """
    Normalize and aggregate GSC data by page and query.
    
    Args:
        rows_with_appearance: Rows containing searchAppearance data
        rows_without_appearance: Rows without searchAppearance data
        
    Returns:
        Tuple of (normalized_pages, normalized_queries)
    """
    # Aggregate data by page and query
    page_aggregates: Dict[str, Dict] = {}
    query_aggregates: Dict[str, Dict] = {}
    
    # Process rows with searchAppearance
    for row in rows_with_appearance:
        keys = row.get('keys', [])
        if len(keys) < 3:
            continue
            
        page, query, appearance = keys[0], keys[1], keys[2]
        
        # Aggregate page data
        if page not in page_aggregates:
            page_aggregates[page] = {
                'clicks': 0,
                'impressions': 0,
                'ctr_sum': 0.0,
                'position_sum': 0.0,
                'count': 0,
                'search_appearances': set(),
                'structured_data_types': set()
            }
        
        page_data = page_aggregates[page]
        page_data['clicks'] += row.get('clicks', 0)
        page_data['impressions'] += row.get('impressions', 0)
        page_data['ctr_sum'] += row.get('ctr', 0.0)
        page_data['position_sum'] += row.get('position', 0.0)
        page_data['count'] += 1
        page_data['search_appearances'].add(appearance)
        
        # Map searchAppearance to structured data type
        structured_type = SEARCH_APPEARANCE_TO_STRUCTURED_TYPE.get(appearance)
        if structured_type:
            page_data['structured_data_types'].add(structured_type)
        
        # Aggregate query data (if query exists)
        if query:
            if query not in query_aggregates:
                query_aggregates[query] = {
                    'clicks': 0,
                    'impressions': 0,
                    'ctr_sum': 0.0,
                    'position_sum': 0.0,
                    'count': 0,
                    'search_appearances': set(),
                    'structured_data_types': set()
                }
            
            query_data = query_aggregates[query]
            query_data['clicks'] += row.get('clicks', 0)
            query_data['impressions'] += row.get('impressions', 0)
            query_data['ctr_sum'] += row.get('ctr', 0.0)
            query_data['position_sum'] += row.get('position', 0.0)
            query_data['count'] += 1
            query_data['search_appearances'].add(appearance)
            
            if structured_type:
                query_data['structured_data_types'].add(structured_type)
    
    # Process rows without searchAppearance (default web search)
    for row in rows_without_appearance:
        keys = row.get('keys', [])
        if not keys:
            continue
            
        page = keys[0] if len(keys) > 0 else ''
        query = keys[1] if len(keys) > 1 else ''
        
        if page:
            if page not in page_aggregates:
                page_aggregates[page] = {
                    'clicks': 0,
                    'impressions': 0,
                    'ctr_sum': 0.0,
                    'position_sum': 0.0,
                    'count': 0,
                    'search_appearances': set(),
                    'structured_data_types': set()
                }
            
            page_data = page_aggregates[page]
            page_data['clicks'] += row.get('clicks', 0)
            page_data['impressions'] += row.get('impressions', 0)
            page_data['ctr_sum'] += row.get('ctr', 0.0)
            page_data['position_sum'] += row.get('position', 0.0)
            page_data['count'] += 1
        
        if query:
            if query not in query_aggregates:
                query_aggregates[query] = {
                    'clicks': 0,
                    'impressions': 0,
                    'ctr_sum': 0.0,
                    'position_sum': 0.0,
                    'count': 0,
                    'search_appearances': set(),
                    'structured_data_types': set()
                }
            
            query_data = query_aggregates[query]
            query_data['clicks'] += row.get('clicks', 0)
            query_data['impressions'] += row.get('impressions', 0)
            query_data['ctr_sum'] += row.get('ctr', 0.0)
            query_data['position_sum'] += row.get('position', 0.0)
            query_data['count'] += 1
    
    # Convert aggregates to normalized objects
    normalized_pages = []
    for url, data in page_aggregates.items():
        if data['count'] > 0:
            page = PageMetrics(
                url=url,
                clicks=data['clicks'],
                impressions=data['impressions'],
                ctr=data['ctr_sum'] / data['count'],
                position=data['position_sum'] / data['count'],
                search_appearances=list(data['search_appearances']),
                structured_data_types=list(data['structured_data_types'])
            )
            normalized_pages.append(page)
    
    normalized_queries = []
    for query_text, data in query_aggregates.items():
        if data['count'] > 0 and query_text:
            query = QueryMetrics(
                query=query_text,
                clicks=data['clicks'],
                impressions=data['impressions'],
                ctr=data['ctr_sum'] / data['count'],
                position=data['position_sum'] / data['count'],
                search_appearances=list(data['search_appearances']),
                structured_data_types=list(data['structured_data_types'])
            )
            normalized_queries.append(query)
    
    logger.info(f"Normalized {len(normalized_pages)} pages and {len(normalized_queries)} queries")
    return normalized_pages, normalized_queries

def build_state_patch(
    pages: List[PageMetrics],
    queries: List[QueryMetrics],
    sitemap_statuses: List[SitemapStatus],
    window_days: int = DEFAULT_WINDOW_DAYS
) -> Dict:
    """
    Build the final state patch in the required format.
    
    Args:
        pages: Normalized page metrics
        queries: Normalized query metrics
        sitemap_statuses: Sitemap status information
        window_days: Number of days in the data window
        
    Returns:
        State patch dictionary
    """
    # Convert dataclasses to dictionaries
    pages_dict = [asdict(p) for p in pages]
    queries_dict = [asdict(q) for q in queries]
    sitemap_dict = [asdict(s) for s in sitemap_statuses]
    
    # Build the state patch
    state_patch = {
        "inputs": {
            "gsc": {
                "collected_at": datetime.utcnow().isoformat() + "Z",
                "window_days": window_days,
                "pages": pages_dict,
                "queries": queries_dict,
                "index_coverage": [],  # Not available via GSC API
                "sitemap_status": sitemap_dict
            }
        }
    }
    
    return state_patch

def collect_gsc(state: Dict) -> Dict:
    """
    Main node function to collect GSC data.
    
    Reads:
        - state['run']['domain']
        - state['config']['integrations']['gsc_property_url']
        - state['config']['integrations']['gsc_credentials_path']
    
    Writes:
        - state['inputs']['gsc']
    
    Args:
        state: Current pipeline state
        
    Returns:
        Updated state with GSC data
    """
    logger.info("Starting GSC data collection")
    
    # Extract configuration from state
    domain = state.get('run', {}).get('domain', '')
    property_url = state.get('config', {}).get('integrations', {}).get('gsc_property_url', '')
    credentials_path = state.get('config', {}).get('integrations', {}).get('gsc_credentials_path', '')
    
    # Validate configuration
    if not property_url:
        property_url = f"https://{domain}/" if domain else ''
    
    if not property_url:
        raise ValueError("GSC property URL not configured. Set config.integrations.gsc_property_url")
    
    if not credentials_path:
        raise ValueError("GSC credentials path not configured. Set config.integrations.gsc_credentials_path")
    
    # Calculate date range (last 28 days) [citation:1]
    end_date = datetime.utcnow().date()
    start_date = end_date - timedelta(days=DEFAULT_WINDOW_DAYS)
    
    # Initialize and authenticate GSC client
    gsc_client = GSCClient(credentials_path, property_url)
    gsc_client.authenticate()
    
    # Query search analytics data with key dimensions [citation:1][citation:9]
    # dimensions = ["page", "query", "searchAppearance"]
    
    # 1) Core performance data
    perf_rows = gsc_client.query_search_analytics(
        start_date=start_date.isoformat(),
        end_date=end_date.isoformat(),
        dimensions=["page", "query"],
        row_limit=50000
    )

    # 2) Search appearance data (standalone)
    appearance_rows = gsc_client.query_search_analytics(
        start_date=start_date.isoformat(),
        end_date=end_date.isoformat(),
        dimensions=["searchAppearance"],
        row_limit=50000
    )


    rows = gsc_client.query_search_analytics(
        start_date=start_date.isoformat(),
        end_date=end_date.isoformat(),
        dimensions=["perf_rows", "appearance_rows"],
    )
    
    # Filter rows by searchAppearance presence
    rows_with_appearance, rows_without_appearance = filter_by_search_appearance(rows)
    
    # Normalize and aggregate data
    normalized_pages, normalized_queries = normalize_gsc_data(
        rows_with_appearance,
        rows_without_appearance
    )
    
    # Fetch sitemap status [citation:2]
    sitemap_statuses = gsc_client.fetch_sitemap_status()
    
    # Build final state patch
    state_patch = build_state_patch(
        pages=normalized_pages,
        queries=normalized_queries,
        sitemap_statuses=sitemap_statuses,
        window_days=DEFAULT_WINDOW_DAYS
    )
    
    logger.info("GSC data collection completed successfully")
    return state_patch


In [None]:
# Example usage and test
if __name__ == "__main__":
    # Example state for testing
    example_state = {
        "run": {
            "domain": "paddleaurum.com"
        },
        "config": {
            "integrations": {
                "gsc_property_url": "https://www.paddleaurum.com/",
                "gsc_credentials": str(credentials_path)  # Use the credentials_path variable defined earlier
            }
        },
        "inputs": {}
    }
    
    try:
        # Run the collection
        result_state = collect_gsc(example_state)
        
        # Print sample output
        print("GSC Data Collection Complete")
        print(f"Collected at: {result_state['inputs']['gsc']['collected_at']}")
        print(f"Pages collected: {len(result_state['inputs']['gsc']['pages'])}")
        print(f"Queries collected: {len(result_state['inputs']['gsc']['queries'])}")
        print(f"Sitemaps found: {len(result_state['inputs']['gsc']['sitemap_status'])}")
        
        # Show first page as example
        if result_state['inputs']['gsc']['pages']:
            first_page = result_state['inputs']['gsc']['pages'][0]
            print(f"\nSample page data:")
            print(f"  URL: {first_page['url']}")
            print(f"  Clicks: {first_page['clicks']}")
            print(f"  Search appearances: {first_page['search_appearances']}")
            print(f"  Inferred structured data: {first_page['structured_data_types']}")
            
    except Exception as e:
        print(f"Error during GSC collection: {str(e)}")


INFO:__main__:Starting GSC data collection


Error during GSC collection: GSC credentials path not configured. Set config.integrations.gsc_credentials_path


In [None]:
"""
Google Search Console Data Collection Node - Enhanced Version
Author: Senior Python Engineer with SEO Tooling Expertise
Description: Collects GSC Search Analytics data segmented by searchAppearance to infer structured data performance.
"""

import os
import logging
import json
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple, Any, Union
from dataclasses import dataclass, asdict
from pathlib import Path

# Google API imports
from google.oauth2 import service_account
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Constants
DEFAULT_WINDOW_DAYS = 28
MAX_ROWS_PER_QUERY = 25000
SEARCH_TYPES = ["web", "news", "video", "image", "discover", "googleNews"]
SCOPES = ["https://www.googleapis.com/auth/webmasters.readonly"]

# Mapping from searchAppearance to structured data types
SEARCH_APPEARANCE_TO_STRUCTURED_TYPE = {
    # Rich Results
    "AMP Top Story": "article",
    "Accelerated Mobile Page": "amp",
    "Article": "article",
    "Education Q&A": "education_qa",
    "FAQ": "faq",
    "Fact Check": "fact_check",
    "How-to": "how_to",
    "Job Listing": "job_posting",
    "Job Details": "job_posting",
    "Merchant Listing": "product",
    "Product Snippet": "product",
    "Product Results": "product",
    "Q&A": "qa_page",
    "Recipe": "recipe",
    "Review Snippet": "review",
    "Site Links": "site_links",
    "Video": "video",
    "Web Light": "web_light",
    
    # SERP Features (Non-Rich)
    "Featured Snippet": "featured_snippet",
    "People Also Ask": "people_also_ask",
    "Top Stories": "top_stories",
    "Twitter Card": "twitter_card",
    
    # Mobile/App Features
    "Android App": "android_app",
    "App": "app",
    
    # Default/Organic
    "Organic Result": "organic",
    "Rich Result": "rich_result",
    "Unknown": "unknown"
}

@dataclass
class PageMetrics:
    """Normalized page performance metrics"""
    url: str
    clicks: int
    impressions: int
    ctr: float
    position: float
    search_appearances: List[str]
    structured_data_types: List[str]
    last_seen: str  # ISO format date

@dataclass
class QueryMetrics:
    """Normalized query performance metrics"""
    query: str
    clicks: int
    impressions: int
    ctr: float
    position: float
    search_appearances: List[str]
    structured_data_types: List[str]
    last_seen: str  # ISO format date

@dataclass
class SitemapStatus:
    """Sitemap submission status"""
    path: str
    last_submitted: str
    last_downloaded: str
    is_pending: bool
    errors: List[str]
    warning_count: int
    url_count: int

@dataclass
class IndexCoverage:
    """Inferred index coverage from performance data"""
    url: str
    status: str  # inferred: 'indexed', 'not_indexed', 'unknown'
    last_crawled: str  # from last impression date
    impressions_last_28d: int
    confidence: float  # 0.0 to 1.0 based on data volume

class GSCCredentials:
    """Unified credential handler supporting multiple authentication methods"""
    
    @staticmethod
    def load_credentials(credential_source: Union[str, Dict]) -> service_account.Credentials:
        """
        Load credentials from multiple sources.
        
        Args:
            credential_source: Can be:
                - Path to service account JSON file
                - JSON string of service account credentials
                - Dictionary of service account credentials
                - Path to OAuth 2.0 credentials file
                
        Returns:
            Authenticated credentials object
            
        Raises:
            ValueError: If credentials cannot be loaded
        """
        try:
            # Case 1: Already a credentials dictionary
            if isinstance(credential_source, dict):
                return service_account.Credentials.from_service_account_info(
                    credential_source, scopes=SCOPES
                )
            
            # Case 2: Path to a JSON file
            if isinstance(credential_source, str):
                # Check if it's a file path
                if os.path.exists(credential_source):
                    logger.info(f"Loading credentials from file: {credential_source}")
                    return service_account.Credentials.from_service_account_file(
                        credential_source, scopes=SCOPES
                    )
                
                # Case 3: JSON string (common in CI/CD environments)
                try:
                    creds_dict = json.loads(credential_source)
                    required_fields = ["type", "project_id", "private_key_id", 
                                      "private_key", "client_email", "client_id"]
                    
                    if all(field in creds_dict for field in required_fields):
                        logger.info("Loading credentials from JSON string")
                        return service_account.Credentials.from_service_account_info(
                            creds_dict, scopes=SCOPES
                        )
                except json.JSONDecodeError:
                    pass
                
            raise ValueError(f"Unsupported credential source: {type(credential_source)}")
            
        except Exception as e:
            logger.error(f"Failed to load credentials: {str(e)}")
            raise ValueError(f"Credential loading failed: {str(e)}")

class GSCClient:
    """Enhanced Google Search Console API client"""
    
    def __init__(self, credentials_source: Union[str, Dict], property_url: str):
        """
        Initialize GSC client with flexible credential source.
        
        Args:
            credentials_source: Service account credentials (file path, JSON string, or dict)
            property_url: GSC property URL (e.g., 'https://www.example.com/')
        """
        self.credentials_source = credentials_source
        self.property_url = property_url.rstrip('/')
        self.service = None
        self._validate_property_url()
    
    def _validate_property_url(self) -> None:
        """Validate the GSC property URL format."""
        if not self.property_url:
            raise ValueError("Property URL cannot be empty")
        
        # Accept both domain properties and URL prefix properties
        if not (self.property_url.startswith('http://') or 
                self.property_url.startswith('https://') or
                self.property_url.startswith('sc-domain:')):
            # Assume it's a domain property
            self.property_url = f"sc-domain:{self.property_url}"
            logger.info(f"Converted to domain property: {self.property_url}")
    
    def authenticate(self) -> None:
        """
        Authenticate using flexible credential source.
        
        Raises:
            ValueError: If authentication fails
        """
        try:
            credentials = GSCCredentials.load_credentials(self.credentials_source)
            
            # Refresh if needed
            if credentials.expired and credentials.refresh_token:
                credentials.refresh(Request())
            
            self.service = build('searchconsole', 'v1', credentials=credentials)
            
            # Test authentication with a simple API call
            self._test_authentication()
            logger.info(f"✓ Authenticated to GSC for property: {self.property_url}")
            
        except Exception as e:
            logger.error(f"✗ Authentication failed: {str(e)}")
            raise ValueError(f"GSC authentication failed: {str(e)}")
    
    def _test_authentication(self) -> None:
        """Test authentication with a minimal API call."""
        try:
            # Try to list sitemaps (lightweight call)
            self.service.sitemaps().list(siteUrl=self.property_url).execute()
        except HttpError as e:
            # 403 or 401 indicate auth issues
            if e.resp.status in [401, 403]:
                raise ValueError(f"Authentication rejected: {e.reason}")
            # Other errors might be due to property access
            logger.warning(f"Auth test returned {e.resp.status}, but continuing")
    
    def query_search_analytics(
        self,
        start_date: str,
        end_date: str,
        dimensions: List[str],
        row_limit: int = MAX_ROWS_PER_QUERY,
        search_type: str = "web"
    ) -> List[Dict]:
        """
        Query Search Analytics API with pagination support.
        
        Args:
            start_date: Start date in YYYY-MM-DD format
            end_date: End date in YYYY-MM-DD format
            dimensions: List of dimensions to group by
            row_limit: Maximum rows to return
            search_type: Type of search results
            
        Returns:
            List of row data from API
        """
        if not self.service:
            raise RuntimeError("GSC service not initialized. Call authenticate() first.")
        
        all_rows = []
        start_row = 0
        
        while True:
            request_body = {
                "startDate": start_date,
                "endDate": end_date,
                "dimensions": dimensions,
                "rowLimit": min(MAX_ROWS_PER_QUERY, row_limit - len(all_rows)),
                "startRow": start_row,
                "type": search_type,
                "dimensionFilterGroups": [{
                    "groupType": "and",
                    "filters": []
                }]
            }
            
            try:
                logger.info(f"Querying rows {start_row} to {start_row + MAX_ROWS_PER_QUERY}")
                response = self.service.searchanalytics().query(
                    siteUrl=self.property_url,
                    body=request_body
                ).execute()
                
                batch_rows = response.get('rows', [])
                if not batch_rows:
                    break
                    
                all_rows.extend(batch_rows)
                start_row += len(batch_rows)
                
                # Check if we have all rows or reached limit
                if len(batch_rows) < MAX_ROWS_PER_QUERY or len(all_rows) >= row_limit:
                    break
                    
            except HttpError as e:
                logger.error(f"GSC API query failed: {str(e)}")
                # If it's a 400 error for too many rows, adjust strategy
                if "rowLimit" in str(e) and start_row > 0:
                    logger.warning("Reducing batch size due to API limits")
                    break
                raise
        
        logger.info(f"Retrieved {len(all_rows)} total rows from GSC API")
        return all_rows[:row_limit]
    
    def fetch_sitemap_status(self) -> List[SitemapStatus]:
        """Fetch sitemap submission status for the property."""
        if not self.service:
            raise RuntimeError("GSC service not initialized. Call authenticate() first.")
        
        try:
            response = self.service.sitemaps().list(siteUrl=self.property_url).execute()
            sitemaps = response.get('sitemap', [])
            
            status_list = []
            for sitemap in sitemaps:
                status = SitemapStatus(
                    path=sitemap.get('path', ''),
                    last_submitted=sitemap.get('lastSubmitted', ''),
                    last_downloaded=sitemap.get('lastDownloaded', ''),
                    is_pending=sitemap.get('isPending', False),
                    errors=sitemap.get('errors', []),
                    warning_count=int(sitemap.get('warnings', '0')),
                    url_count=int(sitemap.get('contents', [{}])[0].get('submitted', '0') if sitemap.get('contents') else 0)
                )
                status_list.append(status)
            
            logger.info(f"Retrieved {len(status_list)} sitemaps")
            return status_list
            
        except HttpError as e:
            logger.error(f"Failed to fetch sitemaps: {str(e)}")
            return []

def filter_by_search_appearance(rows: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
    """
    Separate rows with and without searchAppearance data.
    
    Args:
        rows: Raw rows from GSC API
        
    Returns:
        Tuple of (rows_with_appearance, rows_without_appearance)
    """
    with_appearance = []
    without_appearance = []
    
    for row in rows:
        keys = row.get('keys', [])
        
        # Determine if searchAppearance exists based on dimensions
        # When dimensions = ['page', 'query', 'searchAppearance'], index 2 is searchAppearance
        has_appearance = False
        for i, key in enumerate(keys):
            # Check if this key matches known search appearance values
            if key in SEARCH_APPEARANCE_TO_STRUCTURED_TYPE:
                has_appearance = True
                break
        
        if has_appearance:
            with_appearance.append(row)
        else:
            without_appearance.append(row)
    
    logger.info(f"Filtered {len(with_appearance)} rows with searchAppearance, "
                f"{len(without_appearance)} rows without")
    return with_appearance, without_appearance

def infer_index_coverage(
    pages: List[PageMetrics],
    queries: List[QueryMetrics]
) -> List[IndexCoverage]:
    """
    Infer index coverage from performance data.
    
    Since GSC API doesn't provide direct index coverage data,
    we infer based on impressions and search appearances.
    
    Args:
        pages: Normalized page metrics
        queries: Normalized query metrics
        
    Returns:
        List of inferred index coverage objects
    """
    coverage_list = []
    
    for page in pages:
        # Inference logic:
        # 1. High impressions → likely indexed
        # 2. Search appearances → confirmed indexed
        # 3. No impressions → possibly not indexed
        
        if page.impressions > 100:
            status = "indexed"
            confidence = min(1.0, page.impressions / 10000)  # Scale confidence with volume
        elif page.impressions > 0:
            status = "indexed"
            confidence = 0.5
        else:
            status = "unknown"
            confidence = 0.1
        
        # Boost confidence if we have search appearance data
        if page.search_appearances:
            confidence = min(1.0, confidence + 0.3)
        
        coverage = IndexCoverage(
            url=page.url,
            status=status,
            last_crawled=page.last_seen,
            impressions_last_28d=page.impressions,
            confidence=round(confidence, 2)
        )
        coverage_list.append(coverage)
    
    logger.info(f"Inferred index coverage for {len(coverage_list)} pages")
    return coverage_list

def normalize_gsc_data(
    rows_with_appearance: List[Dict],
    rows_without_appearance: List[Dict],
    end_date: datetime
) -> Tuple[List[PageMetrics], List[QueryMetrics]]:
    """
    Enhanced normalization with temporal awareness.
    
    Args:
        rows_with_appearance: Rows containing searchAppearance data
        rows_without_appearance: Rows without searchAppearance data
        end_date: End date of the data window
        
    Returns:
        Tuple of (normalized_pages, normalized_queries)
    """
    page_aggregates: Dict[str, Dict] = {}
    query_aggregates: Dict[str, Dict] = {}
    
    def process_row(row: Dict, has_appearance: bool = False):
        """Helper to process a single row."""
        keys = row.get('keys', [])
        if not keys:
            return
        
        # Extract data with flexible indexing
        page = keys[0] if len(keys) > 0 else ''
        query = keys[1] if len(keys) > 1 else ''
        appearance = keys[2] if len(keys) > 2 and has_appearance else None
        
        clicks = row.get('clicks', 0)
        impressions = row.get('impressions', 0)
        ctr = row.get('ctr', 0.0)
        position = row.get('position', 0.0)
        
        # Update page aggregates
        if page:
            if page not in page_aggregates:
                page_aggregates[page] = {
                    'clicks': 0,
                    'impressions': 0,
                    'ctr_weighted_sum': 0.0,
                    'position_weighted_sum': 0.0,
                    'total_weight': 0,
                    'search_appearances': set(),
                    'structured_data_types': set(),
                    'last_impression_date': None
                }
            
            data = page_aggregates[page]
            data['clicks'] += clicks
            data['impressions'] += impressions
            weight = impressions  # Weight by impressions
            data['ctr_weighted_sum'] += ctr * weight
            data['position_weighted_sum'] += position * weight
            data['total_weight'] += weight
            
            if appearance:
                data['search_appearances'].add(appearance)
                structured_type = SEARCH_APPEARANCE_TO_STRUCTURED_TYPE.get(appearance)
                if structured_type:
                    data['structured_data_types'].add(structured_type)
            
            # Track most recent date (simplified - using end_date)
            data['last_impression_date'] = end_date.isoformat()
        
        # Update query aggregates
        if query:
            if query not in query_aggregates:
                query_aggregates[query] = {
                    'clicks': 0,
                    'impressions': 0,
                    'ctr_weighted_sum': 0.0,
                    'position_weighted_sum': 0.0,
                    'total_weight': 0,
                    'search_appearances': set(),
                    'structured_data_types': set(),
                    'last_impression_date': None
                }
            
            data = query_aggregates[query]
            data['clicks'] += clicks
            data['impressions'] += impressions
            weight = impressions
            data['ctr_weighted_sum'] += ctr * weight
            data['position_weighted_sum'] += position * weight
            data['total_weight'] += weight
            
            if appearance:
                data['search_appearances'].add(appearance)
                structured_type = SEARCH_APPEARANCE_TO_STRUCTURED_TYPE.get(appearance)
                if structured_type:
                    data['structured_data_types'].add(structured_type)
            
            data['last_impression_date'] = end_date.isoformat()
    
    # Process all rows
    for row in rows_with_appearance:
        process_row(row, has_appearance=True)
    
    for row in rows_without_appearance:
        process_row(row, has_appearance=False)
    
    # Convert to normalized objects
    normalized_pages = []
    for url, data in page_aggregates.items():
        if data['total_weight'] > 0:
            page = PageMetrics(
                url=url,
                clicks=data['clicks'],
                impressions=data['impressions'],
                ctr=data['ctr_weighted_sum'] / data['total_weight'],
                position=data['position_weighted_sum'] / data['total_weight'],
                search_appearances=list(data['search_appearances']),
                structured_data_types=list(data['structured_data_types']),
                last_seen=data['last_impression_date'] or end_date.isoformat()
            )
            normalized_pages.append(page)
    
    normalized_queries = []
    for query_text, data in query_aggregates.items():
        if data['total_weight'] > 0 and query_text:
            query = QueryMetrics(
                query=query_text,
                clicks=data['clicks'],
                impressions=data['impressions'],
                ctr=data['ctr_weighted_sum'] / data['total_weight'],
                position=data['position_weighted_sum'] / data['total_weight'],
                search_appearances=list(data['search_appearances']),
                structured_data_types=list(data['structured_data_types']),
                last_seen=data['last_impression_date'] or end_date.isoformat()
            )
            normalized_queries.append(query)
    
    logger.info(f"Normalized {len(normalized_pages)} pages and {len(normalized_queries)} queries")
    return normalized_pages, normalized_queries

def build_state_patch(
    pages: List[PageMetrics],
    queries: List[QueryMetrics],
    sitemap_statuses: List[SitemapStatus],
    window_days: int = DEFAULT_WINDOW_DAYS
) -> Dict:
    """
    Build the final state patch in the required format.
    
    Args:
        pages: Normalized page metrics
        queries: Normalized query metrics
        sitemap_statuses: Sitemap status information
        window_days: Number of days in the data window
        
    Returns:
        State patch dictionary
    """
    # Infer index coverage
    index_coverage = infer_index_coverage(pages, queries)
    
    # Convert dataclasses to dictionaries
    pages_dict = [asdict(p) for p in pages]
    queries_dict = [asdict(q) for q in queries]
    sitemap_dict = [asdict(s) for s in sitemap_statuses]
    coverage_dict = [asdict(c) for c in index_coverage]
    
    # Build the state patch
    state_patch = {
        "inputs": {
            "gsc": {
                "collected_at": datetime.utcnow().isoformat() + "Z",
                "window_days": window_days,
                "pages": pages_dict,
                "queries": queries_dict,
                "index_coverage": coverage_dict,
                "sitemap_status": sitemap_dict
            }
        }
    }
    
    return state_patch

def collect_gsc(state: Dict) -> Dict:
    """
    Main node function to collect GSC data with enhanced error handling.
    
    Reads:
        - state['run']['domain']
        - state['config']['integrations']['gsc_property_url']
        - state['config']['integrations']['gsc_credentials']
        
    Writes:
        - state['inputs']['gsc']
    """
    logger.info("Starting GSC data collection")
    
    try:
        # Extract configuration from state
        domain = state.get('run', {}).get('domain', '')
        integrations = state.get('config', {}).get('integrations', {})
        
        property_url = integrations.get('gsc_property_url')
        credentials_source = integrations.get('gsc_credentials')
        
        # Fallback options for credentials
        if not credentials_source:
            credentials_source = integrations.get('gsc_service_account')
        if not credentials_source:
            credentials_source = integrations.get('gsc_credentials_path')
        
        # Validate configuration
        if not property_url and domain:
            property_url = f"https://{domain}/"
        
        if not property_url:
            raise ValueError(
                "GSC property URL not configured. "
                "Set config.integrations.gsc_property_url or provide domain in run.domain"
            )
        
        if not credentials_source:
            # Check for environment variable as last resort
            env_creds = os.getenv('GSC_CREDENTIALS_JSON')
            if env_creds:
                credentials_source = env_creds
                logger.info("Using credentials from GSC_CREDENTIALS_JSON environment variable")
            else:
                raise ValueError(
                    "GSC credentials not found. Provide one of: "
                    "config.integrations.gsc_credentials, "
                    "config.integrations.gsc_service_account, "
                    "config.integrations.gsc_credentials_path, "
                    "or GSC_CREDENTIALS_JSON environment variable"
                )
        
        # Calculate date range
        # end_date = datetime.utcnow().date()
        # start_date = end_date - timedelta(days=DEFAULT_WINDOW_DAYS)

        # Calculate date range
        end_date = datetime.utcnow().date()
        start_date = end_date - timedelta(days=DEFAULT_WINDOW_DAYS)

        # 🔥 Add this line to grab search_type safely
        search_type = integrations.get("gsc_search_type", "web")

        
        # Initialize and authenticate GSC client
        logger.info(f"Initializing GSC client for property: {property_url}")
        gsc_client = GSCClient(credentials_source, property_url)
        gsc_client.authenticate()
        
        # Query search analytics data
        # dimensions = ["page", "query", "searchAppearance"]
        # dimensions = ["searchAppearance"]

        # --- Query 1: Just searchAppearance ---
        rows_with_appearance = gsc_client.query_search_analytics(
            start_date=start_date.isoformat(),
            end_date=end_date.isoformat(),
            dimensions=["searchAppearance"],
            row_limit=50000,
            search_type=search_type
        )

        # --- Query 2: Just page + query ---
        rows_without_appearance = gsc_client.query_search_analytics(
            start_date=start_date.isoformat(),
            end_date=end_date.isoformat(),
            dimensions=["page", "query"],
            row_limit=50000,
            search_type=search_type
        )


        logger.info(f"Querying data from {start_date} to {end_date}")
        
        rows = gsc_client.query_search_analytics(
            start_date=start_date.isoformat(),
            end_date=end_date.isoformat(),
            dimensions=[rows_with_appearance, rows_without_appearance],
            row_limit=50000  # Increased for better coverage
        )
        
        if not rows:
            logger.warning("No data returned from GSC API")
            # Return empty but valid structure
            return build_state_patch([], [], [], DEFAULT_WINDOW_DAYS)
        
        # Filter and normalize data
        rows_with_appearance, rows_without_appearance = filter_by_search_appearance(rows)
        
        normalized_pages, normalized_queries = normalize_gsc_data(
            rows_with_appearance,
            rows_without_appearance,
            end_date
        )
        
        # Fetch additional data
        sitemap_statuses = gsc_client.fetch_sitemap_status()
        
        # Build final state patch
        state_patch = build_state_patch(
            pages=normalized_pages,
            queries=normalized_queries,
            sitemap_statuses=sitemap_statuses,
            window_days=DEFAULT_WINDOW_DAYS
        )
        
        logger.info("✓ GSC data collection completed successfully")
        logger.info(f"  Pages: {len(normalized_pages)}")
        logger.info(f"  Queries: {len(normalized_queries)}")
        logger.info(f"  Sitemaps: {len(sitemap_statuses)}")
        
        return state_patch
        
    except Exception as e:
        logger.error(f"✗ GSC collection failed: {str(e)}", exc_info=True)
        
        # Return minimal valid structure even on error
        error_patch = {
            "inputs": {
                "gsc": {
                    "collected_at": datetime.utcnow().isoformat() + "Z",
                    "window_days": DEFAULT_WINDOW_DAYS,
                    "pages": [],
                    "queries": [],
                    "index_coverage": [],
                    "sitemap_status": [],
                    "error": str(e),
                    "success": False
                }
            }
        }
        return error_patch

# Example usage with better credential handling
# if __name__ == "__main__":
#     # Test with multiple credential formats
#     test_cases = [
#         # {
#         #     "name": "File Path Example",
#         #     "state": {
#         #         "run": {"domain": "example.com"},
#         #         "config": {
#         #             "integrations": {
#         #                 "gsc_property_url": "https://www.paddleaurum.com/",
#         #                 "gsc_credentials_path": "secret\seo-agent-486408-51492a0c6774.json"
#         #             }
#         #         }
#         #     }
#         # },
#         {
#             "name": "JSON String Example",
#             "state": {
#                 "run": {"domain": "paddleaurum.com"},
#                 "config": {
#                     "integrations": {
#                         "gsc_property_url": "sc-domain:paddleaurum.com",
#                         "gsc_credentials": """
#                         {"type": "service_account",
#                             "project_id": "seo-agent-486408",
#                             "private_key_id": "51492a0c6774672a3e75acf377280e31d9a41713",
#                             "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDyHmPb2Fea4ScV\nMfbqeQHSfFV+A8reH6V/r8S+bDiRKzZPaIdGrkFrd/pj7oaTnpEgqQeMB4GSsyc3\nIh6dNDOjly2LfUdcxG3k66fze5SsiQlNiw6qJb29fQG4OXaJYPwLfyOuUzLs25gq\n6sZEkJ3Far48cswy0x+MB2AwqXaR7dQ7F1JpSAfdIoF+cO3eO+G2pxI2Ehg0SmNb\nw+y3ICeapiojT7sLrRYYsy/4MJxctVyn3RGA4lE3k3ZXperfsw3jmRkQ7laf7nwZ\nn1xkhMuf3dBrBQrKfEzsEq5kF3ePoPJALscbSQIvMMStpCiEtK5SlcsOJ4qOItw4\neLqCxtCxAgMBAAECggEAHui3SdUcFfchZKjMw1EwZN0fkudMCkBTAfJ/9Ole7VVt\naQTIpELRsjUOX5yPTKAlGdzKTFuW9JHvcy/lZZ9I6lz94P1c02B2QQ++kKxZpg8W\ng3I5rIoF539ibVDUq6QYeSVBels/uJIuImh3aEBoC9BdOd1T6Tcfo2I7quqvauCI\nsmfvpAP3LVZ0ODlduUBovVavISA48yOQfmed2uQCuMihj35NrJ/qZItMQEkELZfa\nWfJFiyBM9fMcVfFS3tQh1sQsoNKnSDovrFGh4wtOMSqPlseeAcnjZasX/0Q1rjo6\nGZ3COaETJ83g+6wXNDciHa3AHYhd1sJ5OL8AMOGMWQKBgQD7Gtn5d09D9Uiv6q8U\nH8MEj6TanRcXLR0Lb1MRmjilI2lJNmSs0AofuxCqcMW6QBzo2eeDeeU7O5a5TliI\njRVRv7xdq/TSBmA3jKlq5Yue57J77VrcIS/Vn1q4Wdro/Ia82Bc6nCtqIeXf6sGN\nSsALepYHCeNUkCUElBp5uUJOzQKBgQD21rFYB1tNCaCVO2b5ty+JF2vT8esynd93\niFHxmjVJb94+Ox24myVq1tAyGbMM3o6Cd5t1agZHKc5P6+R72i51mVf8IANSTM0S\n2+UP57xafsZJcyXh0RhKVXhPzdwkwKp8CDd/iqhBoYZKj1sDnj8KdH9tgcATkLc8\nzvBUtdIBdQKBgQDXI3xaJoS9PuxV/IaggU3/HGsr+qeL7dUStQDA9hdONXbBiV7c\nSSpDYWy3+wMNvlyGjBu1I7zo8PcEMUHdTLNVZScoQmnxgBDzxwpoUd87+FuNniDY\ntX5cUrnRdPr/30w5hBLy3la8Cer/3AyU19SOgSsFQM2K6C0vLQ05+SX8iQKBgDLL\nLyc8LE+IAaFz3dbZGVEOnsO4bnu1/Pwt8x7SatC30nXf2FfapTJ9Dc6hdzTYv16d\nmkHpFW2jbq8HjbbmyNSuP2rEc6qF31VYJqTFPP8qQ0duCIeVbXgualmRtRFjDDIF\nxtXaBJGs2WOSaFowQCy4mFhCcPIVv38l3lKwL4zZAoGBAJR3N4Y/6uyI57vUp6mI\nWsYZASmLwNyNdkgYOaJYPrtuhqNEosb7MmkLsIbsm2iw7g7W9H5t7y49mwRg3Lc1\nZpvx3CMWvXHvMgxS3vMfbcA82gCQvz0Cb7J9i7I1Kbqw9yg4LjFXmKhWNb/sUKEW\nIU9J1MJKHT38qZwpY56zUrD4\n-----END PRIVATE KEY-----\n",
#                             "client_email": "gsc-data-collection@seo-agent-486408.iam.gserviceaccount.com",
#                             "client_id": "111419250013032075846",
#                             "auth_uri": "https://accounts.google.com/o/oauth2/auth",
#                             "token_uri": "https://oauth2.googleapis.com/token",
#                             "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
#                             "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/gsc-data-collection%40seo-agent-486408.iam.gserviceaccount.com",
#                             "universe_domain": "googleapis.com"
#                         }    

#                         """
#                     }
#                 }
#             }
#         },
#         # {
#     #         "name": "Minimal Domain Example",
#     #         "state": {
#     #             "run": {"domain": "example.com"},
#     #             "config": {
#     #                 "integrations": {
#     #                     "gsc_property_url": "example.com",
#     #                     "gsc_credentials": {
#     #                         "type": "service_account",
#     #                         "project_id": "test",
#     #                         "private_key_id": "test",
#     #                         "private_key": "test",
#     #                         "client_email": "test@example.com",
#     #                         "client_id": "test"
#     #                     }
#     #                 }
#     #             }
#     #         }
#     #     }
#     ]
    
#     print("GSC Data Collection Node - Enhanced Version")
#     print("=" * 50)
    
#     for test_case in test_cases:
#         print(f"\nTesting: {test_case['name']}")
#         print("-" * 30)
        
#         try:
#             # Set environment variable for testing
#             if "JSON String Example" in test_case["name"]:
#                 os.environ['GSC_CREDENTIALS_JSON'] = test_case['state']['config']['integrations']['gsc_credentials']
            
#             result = collect_gsc(test_case['state'])
            
#             if result['inputs']['gsc'].get('success', True):
#                 print(f"✓ Collection successful")
#                 print(f"  Pages collected: {len(result['inputs']['gsc']['pages'])}")
#                 print(f"  Queries collected: {len(result['inputs']['gsc']['queries'])}")
#                 print(f"  Sitemaps: {len(result['inputs']['gsc']['sitemap_status'])}")
#                 print(f"  Index coverage inferred: {len(result['inputs']['gsc']['index_coverage'])}")
#             else:
#                 print(f"✗ Collection failed: {result['inputs']['gsc'].get('error')}")
                
#         except Exception as e:
#             print(f"✗ Test failed: {str(e)}")
    
#     print("\n" + "=" * 50)
#     print("Configuration Notes:")
#     print("1. Credentials can be: file path, JSON string, or Python dict")
#     print("2. Property URL can be: https://domain.com/, sc-domain:domain.com, or just domain.com")
#     print("3. Index coverage is inferred from performance data")
#     print("4. Structured data types are mapped from searchAppearance values")


In [50]:
!pip install pytest -Uq


In [51]:
import pytest
from datetime import datetime
from unittest.mock import MagicMock, patch

# import your node
# from your_module.collect_gsc import collect_gsc

@pytest.fixture
def mock_gsc_service():
    """
    Builds a fake googleapiclient service object:
    service.searchanalytics().query().execute()
    """
    service = MagicMock()

    # Fake Search Analytics rows
    service.searchanalytics.return_value.query.return_value.execute.return_value = {
        "rows": [
            {
                "keys": ["https://paddleaurum.com/page-1", "Rich Result"],
                "clicks": 10,
                "impressions": 100,
                "ctr": 0.1,
                "position": 3.2,
            },
            {
                "keys": ["https://paddleaurum.com/page-2", "FAQ"],
                "clicks": 5,
                "impressions": 40,
                "ctr": 0.125,
                "position": 6.0,
            },
        ]
    }

    return service


@patch("your_module.collect_gsc.authenticate_gsc")
def test_collect_gsc_basic(mock_authenticate, mock_gsc_service):
    """
    Basic happy-path test:
    - valid credentials
    - valid search analytics rows
    - correct output structure
    """

    mock_authenticate.return_value = mock_gsc_service

    state = {
        "run": {
            "domain": "example.com"
        },
        "config": {
            "integrations": {
                "gsc_property_url": "https://paddleaurum.com/",
                "gsc_credentials": {
                    "type": "service_account",
                    "project_id": "seo-agent-486408",
                    "private_key_id": "51492a0c6774672a3e75acf377280e31d9a41713",
                    "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDyHmPb2Fea4ScV\nMfbqeQHSfFV+A8reH6V/r8S+bDiRKzZPaIdGrkFrd/pj7oaTnpEgqQeMB4GSsyc3\nIh6dNDOjly2LfUdcxG3k66fze5SsiQlNiw6qJb29fQG4OXaJYPwLfyOuUzLs25gq\n6sZEkJ3Far48cswy0x+MB2AwqXaR7dQ7F1JpSAfdIoF+cO3eO+G2pxI2Ehg0SmNb\nw+y3ICeapiojT7sLrRYYsy/4MJxctVyn3RGA4lE3k3ZXperfsw3jmRkQ7laf7nwZ\nn1xkhMuf3dBrBQrKfEzsEq5kF3ePoPJALscbSQIvMMStpCiEtK5SlcsOJ4qOItw4\neLqCxtCxAgMBAAECggEAHui3SdUcFfchZKjMw1EwZN0fkudMCkBTAfJ/9Ole7VVt\naQTIpELRsjUOX5yPTKAlGdzKTFuW9JHvcy/lZZ9I6lz94P1c02B2QQ++kKxZpg8W\ng3I5rIoF539ibVDUq6QYeSVBels/uJIuImh3aEBoC9BdOd1T6Tcfo2I7quqvauCI\nsmfvpAP3LVZ0ODlduUBovVavISA48yOQfmed2uQCuMihj35NrJ/qZItMQEkELZfa\nWfJFiyBM9fMcVfFS3tQh1sQsoNKnSDovrFGh4wtOMSqPlseeAcnjZasX/0Q1rjo6\nGZ3COaETJ83g+6wXNDciHa3AHYhd1sJ5OL8AMOGMWQKBgQD7Gtn5d09D9Uiv6q8U\nH8MEj6TanRcXLR0Lb1MRmjilI2lJNmSs0AofuxCqcMW6QBzo2eeDeeU7O5a5TliI\njRVRv7xdq/TSBmA3jKlq5Yue57J77VrcIS/Vn1q4Wdro/Ia82Bc6nCtqIeXf6sGN\nSsALepYHCeNUkCUElBp5uUJOzQKBgQD21rFYB1tNCaCVO2b5ty+JF2vT8esynd93\niFHxmjVJb94+Ox24myVq1tAyGbMM3o6Cd5t1agZHKc5P6+R72i51mVf8IANSTM0S\n2+UP57xafsZJcyXh0RhKVXhPzdwkwKp8CDd/iqhBoYZKj1sDnj8KdH9tgcATkLc8\nzvBUtdIBdQKBgQDXI3xaJoS9PuxV/IaggU3/HGsr+qeL7dUStQDA9hdONXbBiV7c\nSSpDYWy3+wMNvlyGjBu1I7zo8PcEMUHdTLNVZScoQmnxgBDzxwpoUd87+FuNniDY\ntX5cUrnRdPr/30w5hBLy3la8Cer/3AyU19SOgSsFQM2K6C0vLQ05+SX8iQKBgDLL\nLyc8LE+IAaFz3dbZGVEOnsO4bnu1/Pwt8x7SatC30nXf2FfapTJ9Dc6hdzTYv16d\nmkHpFW2jbq8HjbbmyNSuP2rEc6qF31VYJqTFPP8qQ0duCIeVbXgualmRtRFjDDIF\nxtXaBJGs2WOSaFowQCy4mFhCcPIVv38l3lKwL4zZAoGBAJR3N4Y/6uyI57vUp6mI\nWsYZASmLwNyNdkgYOaJYPrtuhqNEosb7MmkLsIbsm2iw7g7W9H5t7y49mwRg3Lc1\nZpvx3CMWvXHvMgxS3vMfbcA82gCQvz0Cb7J9i7I1Kbqw9yg4LjFXmKhWNb/sUKEW\nIU9J1MJKHT38qZwpY56zUrD4\n-----END PRIVATE KEY-----\n",
                    "client_email": "gsc-data-collection@seo-agent-486408.iam.gserviceaccount.com",
                    "client_id": "111419250013032075846",
                    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
                    "token_uri": "https://oauth2.googleapis.com/token",
                    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
                    "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/gsc-data-collection%40seo-agent-486408.iam.gserviceaccount.com",
                    "universe_domain": "googleapis.com"
                }
            }
        }
    }

    result = collect_gsc(state)

    assert "inputs" in result
    assert "gsc" in result["inputs"]

    gsc = result["inputs"]["gsc"]

    # ---- schema assertions ----
    assert set(gsc.keys()) == {
        "collected_at",
        "window_days",
        "pages",
        "queries",
        "index_coverage",
        "sitemap_status",
    }

    assert isinstance(gsc["pages"], list)
    assert isinstance(gsc["queries"], list)
    assert isinstance(gsc["index_coverage"], list)
    assert isinstance(gsc["sitemap_status"], list)

    # ---- semantic assertions ----
    assert len(gsc["pages"]) == 2

    page = gsc["pages"][0]
    assert "url" in page
    assert "search_appearances" in page
    assert "structured_data_types" in page

    assert page["structured_data_types"]  # inferred from searchAppearance


In [52]:
def test_collect_gsc_queries_present(mock_authenticate, mock_gsc_service):
    mock_authenticate.return_value = mock_gsc_service

    state = {
        "run": {"domain": "example.com"},
        "config": {"integrations": {"gsc_credentials": "dummy"}}
    }

    result = collect_gsc(state)
    queries = result["inputs"]["gsc"]["queries"]

    assert isinstance(queries, list)


In [85]:
import os

os.environ["GSC_CREDENTIALS_JSON"] = open(
    r"C:\Users\Aurum\vscode\E-commerce-SEO-Agent\secret\seo-agent-486408-51492a0c6774.json", "r"
).read()


In [93]:
def run_collect_gsc_node():
    state = {
        "run": {
            "domain": "paddleaurum.com"
        },
        "config": {
            "integrations": {
                "gsc_property_url": "sc-domain:paddleaurum.com",
                "gsc_window_days": 28,
                "gsc_search_type": "web",
                "gsc_row_limit": 50_000,
                "gsc_max_pages": 10_000,
                "gsc_max_queries": 10_000,
            }
        }
    }

    patch = collect_gsc(state)
    gsc = patch["inputs"]["gsc"]

    print("✅ GSC collection complete")
    print("Pages:", len(gsc["pages"]))
    print("Queries:", len(gsc["queries"]))
    print("Index coverage:", len(gsc["index_coverage"]))
    print("Sitemaps:", len(gsc["sitemap_status"]))

    return patch



In [94]:
if __name__ == "__main__":
    run_collect_gsc_node()


INFO:__main__:Starting GSC data collection
INFO:__main__:Using credentials from GSC_CREDENTIALS_JSON environment variable
INFO:__main__:Initializing GSC client for property: sc-domain:paddleaurum.com
INFO:__main__:Loading credentials from JSON string
INFO:googleapiclient.discovery_cache:file_cache is only supported with oauth2client<4.0.0
INFO:__main__:✓ Authenticated to GSC for property: sc-domain:paddleaurum.com
INFO:__main__:Querying rows 0 to 25000
INFO:__main__:Retrieved 0 total rows from GSC API
INFO:__main__:Querying rows 0 to 25000
INFO:__main__:Retrieved 0 total rows from GSC API
INFO:__main__:Querying data from 2026-01-07 to 2026-02-04
INFO:__main__:Querying rows 0 to 25000
INFO:__main__:Retrieved 0 total rows from GSC API
INFO:__main__:Inferred index coverage for 0 pages


✅ GSC collection complete
Pages: 0
Queries: 0
Index coverage: 0
Sitemaps: 0


In [None]:
import os
import json

os.environ["GSC_CREDENTIALS_JSON"] = json.dumps({
    "type": "service_account",
    "project_id": "seo-agent-486408",
    "private_key_id": "51492a0c6774672a3e75acf377280e31d9a41713",
    "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDyHmPb2Fea4ScV\nMfbqeQHSfFV+A8reH6V/r8S+bDiRKzZPaIdGrkFrd/pj7oaTnpEgqQeMB4GSsyc3\nIh6dNDOjly2LfUdcxG3k66fze5SsiQlNiw6qJb29fQG4OXaJYPwLfyOuUzLs25gq\n6sZEkJ3Far48cswy0x+MB2AwqXaR7dQ7F1JpSAfdIoF+cO3eO+G2pxI2Ehg0SmNb\nw+y3ICeapiojT7sLrRYYsy/4MJxctVyn3RGA4lE3k3ZXperfsw3jmRkQ7laf7nwZ\nn1xkhMuf3dBrBQrKfEzsEq5kF3ePoPJALscbSQIvMMStpCiEtK5SlcsOJ4qOItw4\neLqCxtCxAgMBAAECggEAHui3SdUcFfchZKjMw1EwZN0fkudMCkBTAfJ/9Ole7VVt\naQTIpELRsjUOX5yPTKAlGdzKTFuW9JHvcy/lZZ9I6lz94P1c02B2QQ++kKxZpg8W\ng3I5rIoF539ibVDUq6QYeSVBels/uJIuImh3aEBoC9BdOd1T6Tcfo2I7quqvauCI\nsmfvpAP3LVZ0ODlduUBovVavISA48yOQfmed2uQCuMihj35NrJ/qZItMQEkELZfa\nWfJFiyBM9fMcVfFS3tQh1sQsoNKnSDovrFGh4wtOMSqPlseeAcnjZasX/0Q1rjo6\nGZ3COaETJ83g+6wXNDciHa3AHYhd1sJ5OL8AMOGMWQKBgQD7Gtn5d09D9Uiv6q8U\nH8MEj6TanRcXLR0Lb1MRmjilI2lJNmSs0AofuxCqcMW6QBzo2eeDeeU7O5a5TliI\njRVRv7xdq/TSBmA3jKlq5Yue57J77VrcIS/Vn1q4Wdro/Ia82Bc6nCtqIeXf6sGN\nSsALepYHCeNUkCUElBp5uUJOzQKBgQD21rFYB1tNCaCVO2b5ty+JF2vT8esynd93\niFHxmjVJb94+Ox24myVq1tAyGbMM3o6Cd5t1agZHKc5P6+R72i51mVf8IANSTM0S\n2+UP57xafsZJcyXh0RhKVXhPzdwkwKp8CDd/iqhBoYZKj1sDnj8KdH9tgcATkLc8\nzvBUtdIBdQKBgQDXI3xaJoS9PuxV/IaggU3/HGsr+qeL7dUStQDA9hdONXbBiV7c\nSSpDYWy3+wMNvlyGjBu1I7zo8PcEMUHdTLNVZScoQmnxgBDzxwpoUd87+FuNniDY\ntX5cUrnRdPr/30w5hBLy3la8Cer/3AyU19SOgSsFQM2K6C0vLQ05+SX8iQKBgDLL\nLyc8LE+IAaFz3dbZGVEOnsO4bnu1/Pwt8x7SatC30nXf2FfapTJ9Dc6hdzTYv16d\nmkHpFW2jbq8HjbbmyNSuP2rEc6qF31VYJqTFPP8qQ0duCIeVbXgualmRtRFjDDIF\nxtXaBJGs2WOSaFowQCy4mFhCcPIVv38l3lKwL4zZAoGBAJR3N4Y/6uyI57vUp6mI\nWsYZASmLwNyNdkgYOaJYPrtuhqNEosb7MmkLsIbsm2iw7g7W9H5t7y49mwRg3Lc1\nZpvx3CMWvXHvMgxS3vMfbcA82gCQvz0Cb7J9i7I1Kbqw9yg4LjFXmKhWNb/sUKEW\nIU9J1MJKHT38qZwpY56zUrD4\n-----END PRIVATE KEY-----\n",
    "client_email": "gsc-data-collection@seo-agent-486408.iam.gserviceaccount.com",
    "client_id": "111419250013032075846",
    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
    "token_uri": "https://oauth2.googleapis.com/token",
    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
    "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/gsc-data-collection%40seo-agent-486408.iam.gserviceaccount.com",
    "universe_domain": "googleapis.com"
})


# If collect_gsc is in another module, import it like:
# from your_module.gsc_node import collect_gsc

def run_collect_gsc_node():
    # --- Option A: credentials via env var (JSON string) ---
    # os.environ["GSC_CREDENTIALS_JSON"] = json.dumps({...service_account_json...})

    # --- Option B: credentials via file path (recommended locally) ---
    # os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/absolute/path/to/service_account.json"
    
    credentials_path = "C:/path/to/your/actual/credentials.json"  # Use forward slashes
    service = authenticate_gsc(credentials_path)

    state = {
        "run": {
            "domain": "paddleaurum.com"
        },
        "config": {
            "integrations": {
                # URL-prefix property example (keep trailing slash or node will add it)
                "gsc_property_url": "https://www.paddleaurum.com/",

                # Or domain property example:
                # "gsc_property_url": "sc-domain:example.com",

                # Credentials source: can be dict, json string, base64 json, or path
                # If you don't set this, node falls back to env vars:
                #   GSC_CREDENTIALS_JSON / GSC_CREDENTIALS_B64 / GOOGLE_APPLICATION_CREDENTIALS
                "gsc_credentials": json.loads(os.environ["GSC_CREDENTIALS_JSON"]),

                # Optional tuning
                "gsc_window_days": 28,
                "gsc_search_type": "web",         # web|news|video|image|discover|googleNews
                "gsc_row_limit": 50_000,          # cap per report (page+appearance, query+appearance)
                "gsc_max_pages": 10_000,
                "gsc_max_queries": 10_000,
            }
        }
    }

    patch = collect_gsc(state)
    gsc = patch["inputs"]["gsc"]

    print("✅ collect_gsc_node finished")
    print("Collected at:", gsc["collected_at"])
    print("Window days:", gsc["window_days"])
    print("Pages:", len(gsc["pages"]))
    print("Queries:", len(gsc["queries"]))
    print("Index coverage:", len(gsc["index_coverage"]))
    print("Sitemaps:", len(gsc["sitemap_status"]))

    # quick peek
    if gsc["pages"]:
        print("\n--- sample page ---")
        print(json.dumps(gsc["pages"][0], indent=2))

    if gsc["queries"]:
        print("\n--- sample query ---")
        print(json.dumps(gsc["queries"][0], indent=2))

    return patch


if __name__ == "__main__":
    run_collect_gsc_node()


In [89]:
import json, os
from pathlib import Path

# sa_path = Path(r"C:\real\path\to\service_account.json")
# os.environ["GSC_CREDENTIALS_JSON"] = sa_path.read_text(encoding="utf-8")

import json
from pathlib import Path

def run_collect_gsc_node():
    credentials_path = Path(r"C:\Users\Aurum\vscode\E-commerce-SEO-Agent\secret\seo-agent-486408-51492a0c6774.json")
    if not credentials_path.exists():
        raise FileNotFoundError(f"Missing credentials: {credentials_path}")

    state = {
        "run": {"domain": "paddleaurum.com"},
        "config": {
            "integrations": {
                "gsc_property_url": "sc-domain:paddleaurum.com",
                "gsc_credentials": str(credentials_path),  # pass the path string
                "gsc_window_days": 28,
                "gsc_search_type": "web",
                "gsc_row_limit": 50_000,
                "gsc_max_pages": 10_000,
                "gsc_max_queries": 10_000,
            }
        }
    }

    patch = collect_gsc(state)
    print("Collected pages:", len(patch["inputs"]["gsc"]["pages"]))
    return patch


In [90]:
if __name__ == "__main__":
    run_collect_gsc_node()


INFO:__main__:Starting GSC data collection
INFO:__main__:Initializing GSC client for property: sc-domain:paddleaurum.com
INFO:__main__:Loading credentials from file: C:\Users\Aurum\vscode\E-commerce-SEO-Agent\secret\seo-agent-486408-51492a0c6774.json
INFO:googleapiclient.discovery_cache:file_cache is only supported with oauth2client<4.0.0
INFO:__main__:✓ Authenticated to GSC for property: sc-domain:paddleaurum.com
ERROR:__main__:✗ GSC collection failed: name 'search_type' is not defined
Traceback (most recent call last):
  File "C:\Users\Aurum\AppData\Local\Temp\ipykernel_3372\3535914434.py", line 672, in collect_gsc
    search_type=search_type
NameError: name 'search_type' is not defined


Collected pages: 0


In [63]:
from pathlib import Path

credentials_path = Path(r"C:\Users\Aurum\vscode\E-commerce-SEO-Agent\secret\seo-agent-486408-51492a0c6774.json")
service = authenticate_gsc(str(credentials_path))


In [64]:
service.sites().list().execute()


{'siteEntry': [{'siteUrl': 'sc-domain:paddleaurum.com',
   'permissionLevel': 'siteOwner'}]}

In [65]:
def debug_gsc_access(service):
    sites = service.sites().list().execute()
    for s in sites.get("siteEntry", []):
        print(f"{s['siteUrl']} → {s['permissionLevel']}")


In [66]:
if __name__ == "__main__":
    debug_gsc_access(service)


sc-domain:paddleaurum.com → siteOwner
