# In [None]:
import logging
import time
from datetime import datetime
from functools import wraps
from typing import Dict, List, Optional
from urllib.parse import urlparse

import pandas as pd
import requests
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from ratelimit import limits, sleep_and_retry

# Send every log record (INFO and above) both to a log file and to the console.
logging.basicConfig(
    handlers=[
        logging.FileHandler('wordpress_indexing_requester.log'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Rate limits: at most CALLS_PER_MINUTE calls within a window of ONE_MINUTE seconds.
ONE_MINUTE = 60
CALLS_PER_MINUTE = 60

@sleep_and_retry
@limits(calls=CALLS_PER_MINUTE, period=ONE_MINUTE)
def rate_limited_api_call(func):
    """Invoke ``func`` with no arguments under the module-wide rate limit.

    The ``limits`` decorator caps this function at CALLS_PER_MINUTE calls per
    ONE_MINUTE seconds; ``sleep_and_retry`` is stacked on top so that a call
    over the limit waits for the window to reset instead of failing.

    Returns:
        Whatever ``func()`` returns.
    """
    outcome = func()
    return outcome

def retry_on_error(func):
    """Decorator to retry API calls on HTTP 500 errors with exponential backoff.

    The wrapped callable is attempted up to three times. After a failed
    attempt that raised ``HttpError`` with status 500 the wrapper sleeps
    2, then 4 seconds before trying again. Any other exception (including
    ``HttpError`` with a different status) propagates unchanged.

    Returns:
        A wrapper that returns the wrapped call's result, or ``None`` once
        the final attempt has also failed with HTTP 500.
    """
    max_retries = 3

    @wraps(func)  # keep the decorated function's name/docstring for logs and tooling
    def wrapper(*args, **kwargs):
        for attempt in range(1, max_retries + 1):
            try:
                return func(*args, **kwargs)
            except HttpError as e:
                if e.resp.status != 500:  # only internal server errors are retried
                    raise
                if attempt == max_retries:
                    logger.error(f"Max retries reached for API call: {e}")
                    return None
                wait_time = 2 ** attempt  # Exponential backoff
                logger.info(f"API call failed with 500 error, retrying in {wait_time} seconds...")
                time.sleep(wait_time)
        return None

    return wrapper

class WordPressIndexRequester:
    """Find WordPress posts Google has not indexed and submit them for indexing.

    For each configured domain the published posts are listed through the
    WordPress REST API, every post URL is checked with the Search Console
    URL Inspection API, and each post reported as not indexed is submitted
    through the Google Indexing API. Results are collected per URL.
    """

    # OAuth scopes: Search Console (URL inspection) and Indexing API (publish).
    SCOPES = [
        "https://www.googleapis.com/auth/webmasters",
        "https://www.googleapis.com/auth/indexing",
    ]
    # Bound the waits after HTTP 429 so an exhausted quota cannot stall the
    # run forever (the wait is repeated at most this many times per call).
    MAX_RATE_LIMIT_RETRIES = 3
    RATE_LIMIT_WAIT_SECONDS = 60

    def __init__(self, gsc_credentials_file: str, domains: List[str], batch_size: int = 5):
        """Initialize the WordPress Index Requester.

        Args:
            gsc_credentials_file: Path to a Google service-account JSON key.
            domains: Domains to process; a scheme is added when missing.
            batch_size: Number of domains handled between the longer pauses.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        self.domains = [self._format_domain_url(domain) for domain in domains]
        self.batch_size = batch_size
        self.credentials = self._load_credentials(gsc_credentials_file)
        self.service = build("searchconsole", "v1", credentials=self.credentials)
        # Separate client: indexing requests go through the Indexing API,
        # the Search Console client above can only inspect URLs.
        self.indexing_service = build("indexing", "v3", credentials=self.credentials)
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.indexed_cache = {}  # Cache for indexing status, keyed by normalized page URL

    def _load_credentials(self, credentials_file: str) -> Credentials:
        """Load service-account credentials for Search Console and the Indexing API.

        Raises:
            Exception: Whatever the credentials loader raised, after logging it.
        """
        try:
            return Credentials.from_service_account_file(
                credentials_file,
                scopes=self.SCOPES
            )
        except Exception as e:
            logger.error(f"Failed to load credentials: {e}")
            raise

    def _format_domain_url(self, domain: str) -> str:
        """Format a domain as a lowercase base URL with scheme and no trailing slash.

        Intended for domains only; page URLs go through
        ``_normalize_page_url`` so their path is left untouched.
        """
        domain = domain.strip().lower()
        if not domain.startswith(('http://', 'https://')):
            domain = 'https://' + domain
        return domain.rstrip('/')

    def _normalize_page_url(self, url: str) -> str:
        """Trim a page URL and add a scheme if missing, preserving its path.

        The path case and trailing slash are kept: changing either would make
        Google inspect or index a different URL than the published permalink.
        """
        url = url.strip()
        if not url.lower().startswith(('http://', 'https://')):
            url = 'https://' + url
        return url

    def _get_site_url(self, url: str) -> str:
        """Get the ``scheme://host/`` site URL for any page URL.

        NOTE(review): this is the URL-prefix property form; a Search Console
        domain property would need ``sc-domain:<domain>`` — confirm which
        kind of property the service account has access to.
        """
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}/"

    @retry_on_error
    def _execute_with_retry(self, make_request):
        """Build and execute a Google API request, waiting out HTTP 429 responses.

        Args:
            make_request: Zero-argument callable returning a fresh request
                object that has an ``execute()`` method.

        Returns:
            The decoded API response, or ``None`` when ``retry_on_error`` gave
            up after repeated HTTP 500 responses.

        Raises:
            HttpError: For any other HTTP error, or when HTTP 429 persists
                after ``MAX_RATE_LIMIT_RETRIES`` waits.
        """
        for attempt in range(self.MAX_RATE_LIMIT_RETRIES + 1):
            try:
                return make_request().execute()
            except HttpError as e:
                # Everything except a still-retryable 429 propagates; HTTP 500
                # is then retried with backoff by the retry_on_error decorator.
                if e.resp.status != 429 or attempt == self.MAX_RATE_LIMIT_RETRIES:
                    raise
                logger.warning("Rate limit exceeded, waiting before retry...")
                time.sleep(self.RATE_LIMIT_WAIT_SECONDS)

    def _build_result(self, url: str, status: str, message: str,
                      response: Optional[str] = None) -> Dict:
        """Build one report row; ``Response`` is included only when provided."""
        result = {"URL": url, "Status": status, "Message": message}
        if response is not None:
            result["Response"] = response
        result["Timestamp"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return result

    def check_google_indexing(self, url: str) -> Optional[bool]:
        """Check whether a URL is indexed by Google, with retry logic.

        Returns:
            ``True`` when the URL Inspection verdict is ``PASS``, ``False``
            for any other reported verdict, and ``None`` when the status
            could not be determined (API error or missing verdict). Only
            definite answers are cached.
        """
        try:
            url = self._normalize_page_url(url)
            # Look up and store under the same normalized key so the cache hits.
            if url in self.indexed_cache:
                return self.indexed_cache[url]

            body = {
                "inspectionUrl": url,
                "siteUrl": self._get_site_url(url)
            }
            response = self._execute_with_retry(
                lambda: self.service.urlInspection().index().inspect(body=body)
            )
        except Exception as e:
            logger.error(f"Error checking indexing status for {url}: {e}")
            return None

        if response is None:  # retries exhausted; already logged by retry_on_error
            return None

        inspection_result = response.get("inspectionResult", {})
        index_status = inspection_result.get("indexStatusResult", {})
        # ``coverageState`` is a human-readable description, so the indexed
        # decision is taken from the ``verdict`` enum instead.
        verdict = index_status.get("verdict")
        if verdict is None or verdict == "VERDICT_UNSPECIFIED":
            logger.warning(f"No indexing verdict returned for {url}")
            return None

        is_indexed = verdict == "PASS"
        self.indexed_cache[url] = is_indexed  # Cache the result
        return is_indexed

    def get_non_indexed_posts(self, domain: str, max_retries: int = 3) -> List[Dict]:
        """Get the published posts of a WordPress domain that Google has not indexed.

        Pages of 100 posts are fetched from ``/wp-json/wp/v2/posts`` until an
        empty page or an HTTP 400 response marks the end. A failed fetch is
        retried with exponential backoff up to ``max_retries`` times per page.

        Returns:
            The post objects whose indexing check returned ``False``. Posts
            whose status could not be determined are skipped.
        """
        posts = []
        page = 1
        retries = max_retries
        base_url = domain
        endpoint = f"{base_url}/wp-json/wp/v2/posts"

        while True:
            # Only the fetch is guarded: a failure while checking posts must
            # not cause the same page to be fetched and appended twice.
            try:
                response = self.session.get(
                    endpoint,
                    params={
                        'page': page,
                        'per_page': 100,
                        'status': 'publish'
                    },
                    timeout=15
                )

                # Handle pagination end gracefully
                if response.status_code == 400:
                    logger.info(f"Reached end of posts at page {page} for {base_url}")
                    break

                response.raise_for_status()
                current_posts = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers a body that is not valid JSON.
                logger.error(f"Error fetching posts from {base_url}: {e}")
                retries -= 1
                if retries <= 0:
                    break
                time.sleep(2 ** (max_retries - retries))
                continue

            if not current_posts:
                break
            if not isinstance(current_posts, list):
                logger.error(f"Error fetching posts from {base_url}: unexpected response format")
                break

            # Check indexing status for each post
            for post in current_posts:
                post_url = post.get("link") if isinstance(post, dict) else None
                if not post_url:
                    continue
                indexed = rate_limited_api_call(
                    lambda target=post_url: self.check_google_indexing(target)
                )
                if indexed is False:  # Only add non-indexed posts
                    posts.append(post)

            page += 1
            retries = max_retries
            time.sleep(1)

        return posts

    def request_indexing(self, url: str) -> Dict:
        """Request indexing for a specific URL using the Indexing API.

        Publishes a ``URL_UPDATED`` notification for the URL.

        Returns:
            A report row with ``URL``, ``Status`` (``Success``/``Failed``),
            ``Message`` and ``Timestamp``; successful rows also carry the raw
            API ``Response`` as a string.
        """
        try:
            url = self._normalize_page_url(url)
            response = self._execute_with_retry(
                lambda: self.indexing_service.urlNotifications().publish(
                    body={"url": url, "type": "URL_UPDATED"}
                )
            )
        except Exception as e:
            logger.error(f"Error requesting indexing for {url}: {e}")
            return self._build_result(url, "Failed", str(e))

        if response is None:  # retry_on_error gave up after repeated HTTP 500s
            return self._build_result(
                url, "Failed", "Max retries reached after repeated server errors"
            )
        return self._build_result(
            url, "Success", "Indexing requested successfully", response=str(response)
        )

    def process_domain(self, domain: str) -> List[Dict]:
        """Process a single domain and request indexing for its non-indexed posts.

        Returns:
            One report row per post URL that was submitted.
        """
        results = []
        logger.info(f"Processing domain: {domain}")

        non_indexed_posts = self.get_non_indexed_posts(domain)
        logger.info(f"Found {len(non_indexed_posts)} non-indexed posts for {domain}")

        for post in non_indexed_posts:
            post_url = post.get("link")
            if post_url:
                results.append(self.request_indexing(post_url))
                time.sleep(1)  # Rate limiting between requests

        return results

    def process_all_domains(self) -> pd.DataFrame:
        """Process all domains in batches and collect the indexing request results.

        A failure in one domain is logged and does not stop the others.

        Returns:
            A DataFrame with one row per submitted URL (empty when nothing
            was submitted).
        """
        all_results = []
        total_batches = (len(self.domains) + self.batch_size - 1) // self.batch_size

        for i in range(0, len(self.domains), self.batch_size):
            batch = self.domains[i:i + self.batch_size]
            current_batch = i // self.batch_size + 1
            logger.info(f"Processing batch {current_batch} of {total_batches}")

            for domain in batch:
                try:
                    all_results.extend(self.process_domain(domain))
                except Exception as e:
                    logger.error(f"Error processing domain {domain}: {e}")

                time.sleep(2)  # Delay between domains

            if current_batch < total_batches:
                logger.info("Waiting between batches...")
                time.sleep(10)  # Delay between batches

        return pd.DataFrame(all_results)

def main():
    """Run the indexing workflow for the configured domains and save a CSV report.

    A timestamped CSV is written only when at least one URL was processed;
    otherwise a warning is logged. Any failure is logged and re-raised.
    """
    credentials_path = "path/to/your/credentials.json"  # Placeholder for credentials file path
    target_domains = ["example.com"]  # Placeholder: add your domain names here

    try:
        report = WordPressIndexRequester(
            credentials_path, target_domains, batch_size=5
        ).process_all_domains()

        if report.empty:
            logger.warning("No URLs were processed for indexing requests.")
            return

        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_file = f"indexing_requests_report_{stamp}.csv"
        report.to_csv(output_file, index=False)
        logger.info(f"Report generated successfully: {output_file}")
    except Exception as e:
        logger.error(f"Script execution failed: {e}")
        raise
 
# Run the workflow only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
