In [None]:
#@title Website Cache Warmer Settings
# Colab form cell: every assignment below creates a module-level global that the
# crawler cell reads when it calls run_cache_warmer().

#@markdown ### Enter the website URL you want to warm the cache for:
url = "https://riaa.webflow.io/" #@param {type:"string"}

#@markdown ---
#@markdown ### Configure crawling behavior:
concurrency = 11 #@param {type:"slider", min:1, max:50, step:5}
delay = 1.4 #@param {type:"slider", min:0.1, max:3.0, step:0.1}
max_pages = 2501 #@param {type:"slider", min:1, max:2501, step:50}
# Passed to run_cache_warmer() as `update_interval`.
update_sheet_interval = 45 #@param {type:"slider", min:5, max:505, step:10}

#@markdown ---
#@markdown ### Additional Options:
use_sitemap = True #@param {type:"boolean"}
# NOTE: `verbose` is not passed explicitly; run_cache_warmer() reads this global by name.
verbose = False #@param {type:"boolean"}
find_links = False #@param {type:"boolean"}
continue_from_last = False #@param {type:"boolean"}

#@markdown ---
#@markdown Exclude Paths: Skip URLs containing these paths (comma-separated, e.g. "admin,draft,test")
exclude_paths = "" #@param {type:"string"}
#@markdown Include Paths: Only process URLs containing these paths (comma-separated, e.g. "blog,products")
include_paths = "" #@param {type:"string"}


In [None]:
# @title
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import concurrent.futures
import xml.etree.ElementTree as ET
from io import BytesIO
import gzip
import random
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import os
from datetime import datetime
from urllib.parse import urlparse
from google.colab import auth
import gspread
from google.auth import default
from datetime import datetime
from urllib.parse import urlparse
import psutil
from datetime import datetime

def log_system_metrics():
    """Print a timestamped one-line snapshot of CPU and memory utilisation.

    The CPU figure is sampled with ``interval=1``, so the call blocks for
    roughly one second before printing.
    """
    cpu_pct = psutil.cpu_percent(interval=1)
    mem_pct = psutil.virtual_memory().percent
    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] CPU: {cpu_pct}%, Memory: {mem_pct}% used")


try:
    from tqdm.notebook import tqdm
except ImportError:
    print("For better visualization, install tqdm: pip install tqdm")

    class tqdm:
        """Minimal text-only stand-in used when the real tqdm is unavailable.

        Only the small surface the crawler needs is provided: construction
        with ``total``/``desc``, ``update`` and ``close``.
        """

        def __init__(self, total=None, desc=""):
            self.total = total
            self.desc = desc
            self.n = 0  # number of completed items so far
            print(f"{desc}: 0/{total}")

        def update(self, n=1):
            """Advance the counter by ``n`` (default 1, like tqdm) and redraw the line."""
            self.n += n
            # Carriage return rewrites the same console line instead of scrolling.
            print(f"\r{self.desc}: {self.n}/{self.total}", end="")

        def close(self):
            """Terminate the in-place progress line with a newline."""
            print()

class SiteCacheWarmer:
    def __init__(self, start_url, concurrency=5, delay=0.5, max_pages=100, verbose=True,
                 use_sitemap=True, max_retries=3, exclude_paths=None, include_paths=None, find_links=True,
                 update_interval=50, continue_from_last=False):
        self.start_url = start_url
        self.base_domain = urlparse(start_url).netloc
        self.visited_urls = set()
        self.queue = [start_url]
        self.concurrency = concurrency
        self.delay = delay
        self.max_pages = max_pages
        self.verbose = verbose
        self.use_sitemap = use_sitemap
        self.max_retries = max_retries
        self.exclude_paths = exclude_paths
        self.include_paths = include_paths
        self.find_links = find_links
        self.retry_counts = {}
        self.page_times = {}
        self.status_codes = {}
        self.progress_bar = None
        self.url_from_sitemap = {}
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Connection': 'keep-alive',
        }
        self.update_interval = update_interval
        self.last_updated_count = 0
        self.gs_headers_written = False
        self.worksheet = None
        self.continue_from_last = continue_from_last

    def _init_google_sheet(self):
        """Create a new results spreadsheet for this run and keep its first worksheet.

        Authenticates the Colab user, finds (or creates) a sub-folder named
        after ``self.base_domain`` inside the central Drive folder, creates a
        spreadsheet named ``"<domain> - <YYYYmmdd-HHMM>"`` there and stores
        its first worksheet in ``self.worksheet``. The Drive client is kept
        on ``self.drive_service``.
        """
        auth.authenticate_user()
        creds, _ = default()
        gc = gspread.authorize(creds)
        from googleapiclient.discovery import build
        drive_service = build('drive', 'v3', credentials=creds)
        self.drive_service = drive_service
        central_folder_id = "1Om8LfyP7WVRGT07KK4ggn5CA7-HfjlaF"

        # Drive query values are single-quoted; escape backslashes and quotes
        # so an unusual domain string cannot break the query.
        safe_name = self.base_domain.replace("\\", "\\\\").replace("'", "\\'")
        # `trashed = false` stops a deleted folder from being reused.
        query = (
            f"'{central_folder_id}' in parents and mimeType = 'application/vnd.google-apps.folder' "
            f"and name = '{safe_name}' and trashed = false"
        )
        # The all-drives flags make the lookup consistent with the create
        # calls below when the central folder lives on a shared drive.
        response = drive_service.files().list(
            q=query,
            fields="files(id, name)",
            supportsAllDrives=True,
            includeItemsFromAllDrives=True
        ).execute()
        folders = response.get('files', [])
        if folders:
            domain_folder_id = folders[0]['id']
        else:
            folder_metadata = {
                'name': self.base_domain,
                'mimeType': 'application/vnd.google-apps.folder',
                'parents': [central_folder_id]
            }
            folder = drive_service.files().create(
                body=folder_metadata,
                fields='id',
                supportsAllDrives=True
            ).execute()
            domain_folder_id = folder.get('id')
            print(f"Created domain subfolder: {self.base_domain} (ID: {domain_folder_id})")

        timestamp = datetime.now().strftime("%Y%m%d-%H%M")
        sheet_name = f"{self.base_domain} - {timestamp}"
        file_metadata = {
            'name': sheet_name,
            'mimeType': 'application/vnd.google-apps.spreadsheet',
            'parents': [domain_folder_id]
        }
        file = drive_service.files().create(
            body=file_metadata,
            fields='id, webViewLink',
            supportsAllDrives=True
        ).execute()
        print("Google Sheet created:", file.get('webViewLink'))
        spreadsheet = gc.open_by_key(file.get('id'))
        self.worksheet = spreadsheet.get_worksheet(0)

    def _load_last_google_sheet(self):
        """Load already-visited URLs from the newest spreadsheet for this domain.

        Looks up the domain sub-folder in the central Drive folder, opens the
        most recently created spreadsheet in it and reads its ``URL`` column.
        On success ``self.visited_urls`` is replaced by the loaded set and
        ``self.last_updated_count`` is set to its size. If no usable sheet is
        found, a message is printed and the state is left untouched.
        """
        auth.authenticate_user()
        creds, _ = default()
        from googleapiclient.discovery import build
        drive_service = build('drive', 'v3', credentials=creds)
        central_folder_id = "1Om8LfyP7WVRGT07KK4ggn5CA7-HfjlaF"

        # Escape the quoted value and skip trashed folders (Drive search syntax).
        safe_name = self.base_domain.replace("\\", "\\\\").replace("'", "\\'")
        query_folder = (
            f"'{central_folder_id}' in parents and mimeType = 'application/vnd.google-apps.folder' "
            f"and name = '{safe_name}' and trashed = false"
        )
        response_folder = drive_service.files().list(
            q=query_folder,
            fields="files(id, name)",
            supportsAllDrives=True,
            includeItemsFromAllDrives=True
        ).execute()
        folders = response_folder.get('files', [])
        if not folders:
            print("No domain subfolder found. Starting fresh.")
            return
        domain_folder_id = folders[0]['id']

        query = (
            f"'{domain_folder_id}' in parents and mimeType='application/vnd.google-apps.spreadsheet' "
            f"and trashed = false"
        )
        response = drive_service.files().list(
            q=query,
            orderBy="createdTime desc",
            pageSize=1,
            fields="files(id, name)",
            supportsAllDrives=True,
            includeItemsFromAllDrives=True
        ).execute()
        files = response.get('files', [])
        if not files:
            print("No previous Google Sheet found for this domain. Starting fresh.")
            return
        latest_file = files[0]

        gc = gspread.authorize(creds)
        spreadsheet = gc.open_by_key(latest_file['id'])
        worksheet = spreadsheet.get_worksheet(0)
        data = worksheet.get_all_values()
        if not data or len(data) < 2:
            print("Google Sheet is empty. Starting fresh.")
            return

        header = data[0]
        if "URL" not in header:
            # A sheet with an unexpected layout should not abort the whole run.
            print(f"Sheet '{latest_file['name']}' has no URL column. Starting fresh.")
            return
        url_index = header.index("URL")
        loaded_urls = {
            row[url_index]
            for row in data[1:]
            if len(row) > url_index and row[url_index]
        }
        self.visited_urls = loaded_urls
        # Count distinct URLs, not raw rows: the counter is compared with
        # len(self.visited_urls), and duplicate or blank rows would skew it.
        self.last_updated_count = len(loaded_urls)
        print(f"Resuming from previous run: Loaded {len(loaded_urls)} visited URLs from sheet '{latest_file['name']}'.")

    def _update_google_sheet_incremental(self):
      data = {
          'URL': list(self.visited_urls),
          'Load Time (s)': [self.page_times.get(url) for url in self.visited_urls],
          'Status': [str(self.status_codes.get(url)) for url in self.visited_urls],
          'Retries': [self.retry_counts.get(url, 0) for url in self.visited_urls],
          'URL Source': [("Sitemap" if self.url_from_sitemap.get(url, False) else "Discovered") for url in self.visited_urls]
      }
      df = pd.DataFrame(data)
      df = df.sort_values('URL')
      new_df = df.iloc[self.last_updated_count:]
      new_df = new_df.where(pd.notnull(new_df), None)
      if not new_df.empty:
          rows = new_df.values.tolist()
          if not self.gs_headers_written:
              header = new_df.columns.tolist()
              self.worksheet.append_row(header)
              self.gs_headers_written = True
          self.worksheet.append_rows(rows, value_input_option='USER_ENTERED')
          self.last_updated_count = len(df)
          print(f"Incremental update: Appended {len(rows)} rows. Total processed: {self.last_updated_count}")


    def print_if_verbose(self, message):
        if self.verbose:
            print(message)

    def parse_sitemap(self, base_url, exclude_paths=None, include_paths=None):
        """Fetch ``sitemap.xml`` relative to ``base_url`` and return the page URLs it lists.

        Plain ``<urlset>`` sitemaps and ``<sitemapindex>`` files are both
        supported; for an index, each child sitemap is fetched one level deep.
        Gzip-compressed payloads are detected by their magic bytes.

        Args:
            base_url: URL the sitemap location is resolved against.
            exclude_paths: Path fragments; a URL whose path contains any of
                them is dropped.
            include_paths: Path fragments; when given, a URL is kept only if
                its path contains at least one of them.

        Returns:
            list[str]: Filtered page URLs in document order without
            duplicates, or ``[]`` if the sitemap is unavailable or unparsable.
        """
        # An empty fragment is contained in every path and would exclude
        # (or include) everything, so ignore such entries.
        excluded = [fragment for fragment in (exclude_paths or []) if fragment]
        included = [fragment for fragment in (include_paths or []) if fragment]
        try:
            sitemap_url = urljoin(base_url, 'sitemap.xml')
            print(f"Attempting to parse sitemap from: {sitemap_url}")
            response = requests.get(sitemap_url, headers=self.headers, timeout=30)
            if response.status_code != 200:
                print(f"Failed to get sitemap (status {response.status_code}). Falling back to normal crawling.")
                return []
            page_urls, child_sitemaps = self._read_sitemap_locations(response.content)

            # Sitemap index: the root lists further sitemap files, not pages.
            for child_url in dict.fromkeys(child_sitemaps):
                try:
                    child_response = requests.get(child_url, headers=self.headers, timeout=30)
                    if child_response.status_code != 200:
                        print(f"Skipping child sitemap {child_url} (status {child_response.status_code}).")
                        continue
                    child_pages, _ = self._read_sitemap_locations(child_response.content)
                    page_urls.extend(child_pages)
                except Exception as child_error:
                    # One broken child should not discard the other sitemaps.
                    print(f"Skipping child sitemap {child_url}: {child_error}")

            urls = []
            for full_url in dict.fromkeys(page_urls):
                path = urlparse(full_url).path
                if excluded and any(fragment in path for fragment in excluded):
                    continue
                if included and not any(fragment in path for fragment in included):
                    continue
                urls.append(full_url)
            print(f"Successfully found {len(urls)} URLs in sitemap after filtering.")
            return urls
        except Exception as e:
            # Best-effort boundary: any failure falls back to link crawling.
            print(f"Error parsing sitemap: {str(e)}. Falling back to normal crawling.")
            return []

    def _read_sitemap_locations(self, payload):
        """Parse one sitemap payload into ``(page_urls, child_sitemap_urls)``."""
        # Gzip streams start with bytes 1f 8b. Checking the payload is more
        # reliable than the Content-Type header.
        if payload[:2] == b'\x1f\x8b':
            payload = gzip.decompress(payload)
        root = ET.fromstring(payload)
        namespaces = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

        def locations(entry_tag):
            # Try namespaced elements first, then un-namespaced documents.
            entries = root.findall(f'.//sm:{entry_tag}', namespaces) or root.findall(f'.//{entry_tag}')
            found = []
            for entry in entries:
                loc = entry.find('.//sm:loc', namespaces)
                if loc is None:
                    loc = entry.find('.//loc')
                if loc is not None and loc.text and loc.text.strip():
                    found.append(loc.text.strip())
            return found

        return locations('url'), locations('sitemap')

    def is_valid_url(self, url):
        if not url.startswith('http'):
            return False
        parsed_url = urlparse(url)
        if parsed_url.netloc != self.base_domain:
            return False
        if not parsed_url.path or parsed_url.path == '/':
            if parsed_url.query == '' and self.start_url in self.visited_urls:
                return False
        extensions_to_skip = ['.jpg', '.jpeg', '.png', '.gif', '.pdf', '.zip', '.css', '.js']
        if any(url.lower().endswith(ext) for ext in extensions_to_skip):
            return False
        return True

    def extract_links(self, html, base_url):
        """Return the distinct crawlable links found in an HTML document.

        Every ``<a href>`` is resolved against ``base_url`` and rebuilt from
        scheme, host, path and query (the fragment is dropped). Links that
        pass ``is_valid_url`` are returned in first-seen order. Each link
        appears once, so repeated navigation links are not queued repeatedly.

        Args:
            html: HTML text of the fetched page.
            base_url: URL of the page, used to resolve relative hrefs.

        Returns:
            list[str]: Unique, valid absolute URLs.
        """
        soup = BeautifulSoup(html, 'html.parser')
        links = []
        seen = set()  # normalised URLs already examined on this page
        for a_tag in soup.find_all('a', href=True):
            parsed = urlparse(urljoin(base_url, a_tag['href']))
            clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
            if parsed.query:
                clean_url += f"?{parsed.query}"
            if clean_url in seen:
                continue
            seen.add(clean_url)
            if self.is_valid_url(clean_url):
                links.append(clean_url)
        return links

    def visit_url(self, url):
        """Fetch one URL, record its timing and status, and report follow-up links.

        Returns:
            tuple: ``(url, links)``. ``links`` holds the links extracted from
            a 200 HTML response when ``find_links`` is on, ``[url]`` when the
            request timed out and should be retried, and ``[]`` otherwise.
        """
        attempt_number = self.retry_counts.setdefault(url, 0) + 1
        try:
            if attempt_number > 1:
                self.print_if_verbose(f"Retry #{attempt_number-1} for {url}")
            started = time.time()
            response = requests.get(url, headers=self.headers, timeout=15)
            elapsed = time.time() - started
            self.page_times[url] = elapsed
            self.status_codes[url] = response.status_code
            self.print_if_verbose(f"✓ {url} - Status: {response.status_code}, Time: {elapsed:.2f}s")
            is_html = 'text/html' in response.headers.get('Content-Type', '')
            # Links are only harvested from successful HTML pages.
            if is_html and response.status_code == 200 and self.find_links:
                return url, self.extract_links(response.text, url)
            return url, []
        except requests.exceptions.Timeout:
            self.retry_counts[url] += 1
            attempts_used = self.retry_counts[url]
            if attempts_used < self.max_retries:
                self.print_if_verbose(f"⚠ Timeout for {url}. Will retry ({attempts_used}/{self.max_retries})...")
                # Returning the URL as its own "link" signals a retry request.
                return url, [url]
            self.page_times[url] = None
            self.status_codes[url] = 'Timeout'
            self.print_if_verbose(f"✗ Timeout accessing {url} after {self.max_retries} tries.")
            return url, []
        except Exception as e:
            # Any other failure is final: record it and move on.
            self.page_times[url] = None
            self.status_codes[url] = f'Error: {type(e).__name__}'
            self.print_if_verbose(f"✗ Error accessing {url}: {str(e)}")
            return url, []

    def warm_cache(self):
        """Run the crawl: seed the queue, fetch pages in concurrent batches, report results.

        Steps:
          1. If ``continue_from_last`` is set, load previously visited URLs so
             they are skipped in this run.
          2. If ``use_sitemap`` is set and the sitemap yields URLs, use them as
             the queue (capped at ``max_pages``).
          3. Create the results spreadsheet and process the queue in batches
             of at most ``concurrency`` URLs until the queue is empty or
             ``max_pages`` URLs have been visited. Timed-out URLs are retried
             at the front of the queue and do not consume the page budget.
          4. Flush results to the sheet and return ``display_results()``.

        Returns:
            pandas.DataFrame: The frame produced by ``display_results``.
        """
        print(f"Starting cache warming from {self.start_url}")
        print(f"Configuration: concurrency={self.concurrency}, delay={self.delay}s, max_pages={self.max_pages}, max_retries={self.max_retries}")
        if self.exclude_paths:
            print(f"Excluding paths containing: {', '.join(self.exclude_paths)}")
        if self.include_paths:
            print(f"Only including paths containing: {', '.join(self.include_paths)}")

        if self.continue_from_last:
            # Pre-populates self.visited_urls from the previous run's sheet.
            self._load_last_google_sheet()

        if self.use_sitemap:
            print("Attempting to load URLs from sitemap.xml...")
            sitemap_urls = self.parse_sitemap(self.start_url, self.exclude_paths, self.include_paths)
            if sitemap_urls:
                self.queue = []
                for url in sitemap_urls[:self.max_pages]:
                    self.queue.append(url)
                    self.url_from_sitemap[url] = True
                print(f"Using {len(self.queue)} URLs from sitemap.xml")

        # Drop duplicates (keeping order) and anything already visited.
        self.queue = [url for url in dict.fromkeys(self.queue) if url not in self.visited_urls]

        self._init_google_sheet()
        page_budget = max(self.max_pages - len(self.visited_urls), 0)
        self.progress_bar = tqdm(total=min(len(self.queue), page_budget), desc="Warming cache")

        queued = set(self.queue)   # mirror of self.queue for O(1) membership tests
        awaiting_retry = set()     # URLs re-queued after a timeout

        with concurrent.futures.ThreadPoolExecutor(max_workers=self.concurrency) as executor:
            while self.queue:
                batch = []
                while self.queue and len(batch) < self.concurrency:
                    candidate = self.queue.pop(0)
                    queued.discard(candidate)
                    if candidate in awaiting_retry:
                        # Retries were already counted against the page budget.
                        batch.append(candidate)
                    elif candidate not in self.visited_urls and len(self.visited_urls) < self.max_pages:
                        self.visited_urls.add(candidate)
                        batch.append(candidate)
                    # Otherwise: duplicate, or budget exhausted; drop it.
                if not batch:
                    break

                futures = [executor.submit(self.visit_url, url) for url in batch]
                retry_now = []
                for future in concurrent.futures.as_completed(futures):
                    url, new_links = future.result()
                    # visit_url records a status on every final outcome. A
                    # missing status with the URL echoed back means "timed
                    # out, retry". A page that merely links to itself has a
                    # status and is not mistaken for a retry.
                    if url in new_links and url not in self.status_codes:
                        awaiting_retry.add(url)
                        retry_now.append(url)
                    else:
                        awaiting_retry.discard(url)
                        # Count every finished page, including retried ones.
                        self.progress_bar.update(1)

                    for link in new_links:
                        if link == url or link in self.visited_urls or link in queued:
                            continue
                        self.queue.append(link)
                        queued.add(link)
                        self.url_from_sitemap.setdefault(link, False)

                    if self.delay > 0:
                        # Jittered pause between 0.2x and 2x the base delay.
                        # It runs here in the coordinating thread, after the
                        # batch was already submitted, so it paces batches
                        # rather than individual requests.
                        random_modifier = 2 * random.random() - 1
                        actual_delay = max(self.delay * (1 + random_modifier), self.delay * 0.2)
                        time.sleep(actual_delay)

                if retry_now:
                    # Retries go to the front so they run in the next batch.
                    self.queue = retry_now + self.queue
                if len(self.visited_urls) - self.last_updated_count >= self.update_interval:
                    self._update_google_sheet_incremental()
                log_system_metrics()

        self.progress_bar.close()
        print(f"\nCache warming completed!")
        print(f"Visited {len(self.visited_urls)} pages.")
        self._update_google_sheet_incremental()
        return self.display_results()

    def _save_results_to_csv(self, results_df):
        """Write the results frame to CSV in the Drive folder and in the working directory.

        The Drive copy goes to
        ``/content/drive/Shared drives/Operations/Cache Warmer/<domain>/<timestamp>.csv``;
        a failure there is reported and does not prevent the local copy
        ``cache_warming_<domain>_<timestamp>.csv`` from being written.
        """
        site = urlparse(self.start_url).netloc
        stamp = datetime.now().strftime("%Y%m%d-%H%M")
        try:
            target_dir = os.path.join('/content/drive/Shared drives/Operations/Cache Warmer/', site)
            os.makedirs(target_dir, exist_ok=True)
            target_file = os.path.join(target_dir, f"{stamp}.csv")
            results_df.to_csv(target_file, index=False)
            print(f"Results saved to Google Drive: {target_file}")
        except Exception as e:
            print(f"Error saving to Google Drive: {str(e)}")
        # The local copy is written regardless of the Drive outcome.
        results_df.to_csv(f"cache_warming_{site}_{stamp}.csv", index=False)

    def _save_results_to_google_sheet(self, results_df):
        """Upload the full results frame to a new Google Sheet in the operations folder.

        Missing values (for example the load time of a failed page) are sent
        as empty cells. Left as NaN they are not valid JSON, which made the
        upload fail whenever at least one page had failed.

        Args:
            results_df: Frame whose header and rows are written to the sheet.

        Returns:
            str | None: The sheet's web link, or ``None`` if saving failed.
        """
        from google.colab import auth
        import gspread
        from google.auth import default
        from datetime import datetime
        from urllib.parse import urlparse
        domain = urlparse(self.start_url).netloc
        timestamp = datetime.now().strftime("%Y%m%d-%H%M")
        try:
            auth.authenticate_user()
            creds, _ = default()
            gc = gspread.authorize(creds)
            sheet_name = f"{domain} - {timestamp}"
            operations_folder_id = "1Om8LfyP7WVRGT07KK4ggn5CA7-HfjlaF"
            from googleapiclient.discovery import build
            drive_service = build('drive', 'v3', credentials=creds)
            file_metadata = {
                'name': sheet_name,
                'mimeType': 'application/vnd.google-apps.spreadsheet',
                'parents': [operations_folder_id]
            }
            file = drive_service.files().create(
                body=file_metadata,
                fields='id, webViewLink',
                supportsAllDrives=True
            ).execute()
            spreadsheet = gc.open_by_key(file.get('id'))
            worksheet = spreadsheet.get_worksheet(0)
            data = [results_df.columns.tolist()]
            # Replace NaN/None cell by cell; this does not depend on how a
            # given pandas version treats None in float columns.
            data.extend(
                [None if pd.isna(cell) else cell for cell in row]
                for row in results_df.values.tolist()
            )
            worksheet.update(data)
            print(f"Results saved to Google Sheet: {file.get('webViewLink')}")
            return file.get('webViewLink')
        except Exception as e:
            # Best effort: reporting must not abort the run.
            print(f"Error saving to Google Sheet: {str(e)}")
            return None

    def display_results(self):
        if not self.page_times:
            return pd.DataFrame()
        data = {
            'URL': list(self.visited_urls),
            'Load Time (s)': [self.page_times.get(url) for url in self.visited_urls],
            'Status': [str(self.status_codes.get(url)) for url in self.visited_urls],
            'Retries': [self.retry_counts.get(url, 0) for url in self.visited_urls],
            'URL Source': [("Sitemap" if self.url_from_sitemap.get(url, False) else "Discovered") for url in self.visited_urls]
        }
        df = pd.DataFrame(data)
        df = df.sort_values('Load Time (s)', ascending=False)
        df_plot = df[df['Load Time (s)'].notna()]
        if not df_plot.empty:
            plt.figure(figsize=(15, 10))
            plt.subplot(2, 2, 1)
            plt.hist(df_plot['Load Time (s)'], bins=20, color='skyblue', edgecolor='black')
            plt.title('Page Load Time Distribution')
            plt.xlabel('Load Time (seconds)')
            plt.ylabel('Number of Pages')
            plt.subplot(2, 2, 2)
            status_counts = df['Status'].value_counts()
            plt.pie(status_counts, labels=status_counts.index, autopct='%1.1f%%')
            plt.title('HTTP Status Codes')
            plt.subplot(2, 2, 3)
            retry_counts = df['Retries'].value_counts().sort_index()
            if not retry_counts.empty:
                plt.bar(retry_counts.index.astype(str), retry_counts.values, color='salmon')
                plt.title('Retry Distribution')
                plt.xlabel('Number of Retries')
                plt.ylabel('Number of Pages')
            if 'Retries' in df.columns and any(df['Retries'] > 0):
                plt.subplot(2, 2, 4)
                successful_retries = df[(df['Retries'] > 0) & (df['Load Time (s)'].notna())]
                if not successful_retries.empty:
                    plt.text(0.5, 0.5,
                             f"Pages requiring retries: {len(df[df['Retries'] > 0])}\n" +
                             f"Successfully loaded after retry: {len(successful_retries)}\n" +
                             f"Pages that failed all retries: {len(df[(df['Retries'] >= self.max_retries) & (df['Load Time (s)'].isna())])}",
                             horizontalalignment='center',
                             verticalalignment='center',
                             fontsize=12,
                             transform=plt.gca().transAxes)
                plt.title('Retry Statistics')
                plt.axis('off')
            plt.tight_layout()
            plt.show()
        print("\nTop 10 Slowest Pages:")
        display(df.head(10))
        if 'Retries' in df.columns and any(df['Retries'] > 0):
            print("\nPages that Required Retries:")
            display(df[df['Retries'] > 0])
        self._save_results_to_csv(df)
        sheet_url = self._save_results_to_google_sheet(df)
        return df

def run_cache_warmer(url, concurrency=5, delay=0.5, max_pages=100, use_sitemap=True,
                     max_retries=3, exclude_paths=None, include_paths=None, find_links=True,
                     update_interval=50, continue_from_last=False, verbose=None):
    """Run the cache warmer

    Args:
        url (str): The website URL to warm cache for
        concurrency (int): Number of concurrent requests
        delay (float): Delay between requests in seconds
        max_pages (int): Maximum number of pages to crawl
        use_sitemap (bool): Whether to use sitemap.xml instead of crawling
        max_retries (int): Number of times to retry pages that timeout
        exclude_paths (list): List of path segments to exclude (e.g. ['library', 'admin'])
        include_paths (list): List of path segments to specifically include (e.g. ['about-us'])
        find_links (bool): Whether to extract links from each page
        update_interval (int): Number of newly visited pages between incremental
            Google Sheet updates
        continue_from_last (bool): Passed through to SiteCacheWarmer as its
            ``continue_from_last`` setting
        verbose (bool | None): Print per-URL progress. ``None`` (the default)
            keeps the earlier behaviour of reading the notebook-level
            ``verbose`` setting, falling back to ``False`` if it is not defined.

    Returns:
        pandas.DataFrame: Results of the cache warming process
    """
    if verbose is None:
        # This function used to read an undeclared global `verbose`. Keep
        # that behaviour for existing callers, without the NameError when
        # the global is missing.
        verbose = globals().get('verbose', False)

    warmer = SiteCacheWarmer(
        url,
        concurrency=concurrency,
        delay=delay,
        max_pages=max_pages,
        verbose=verbose,
        use_sitemap=use_sitemap,
        max_retries=max_retries,
        exclude_paths=exclude_paths,
        include_paths=include_paths,
        find_links=find_links,
        update_interval=update_interval,
        continue_from_last=continue_from_last
    )

    return warmer.warm_cache()

# Turn the comma-separated form fields into lists. Blank entries (from input
# such as "admin," or ", ") are dropped because an empty fragment is contained
# in every path and would filter out every URL. An empty result becomes None,
# meaning "no filter".
exclude_list = [x.strip() for x in (exclude_paths or "").split(",") if x.strip()] or None
include_list = [x.strip() for x in (include_paths or "").split(",") if x.strip()] or None

# Start the crawl with the values chosen in the settings form cell.
results = run_cache_warmer(
    url=url,
    concurrency=concurrency,
    delay=delay,
    max_pages=max_pages,
    use_sitemap=use_sitemap,
    max_retries=3,
    exclude_paths=exclude_list,
    include_paths=include_list,
    find_links=find_links,
    update_interval=update_sheet_interval,
    continue_from_last=continue_from_last
)