In [2]:
import hashlib
import json
import os
import time
from typing import Dict, List, Tuple

import requests


class HighspotCrawler:
    """Client for listing and downloading files from Highspot spots.

    Progress is persisted in a JSON record file with two top-level sections:

    * ``file_list``: ``{spot_id: {item_id: {"hash": ..., "data": item}}}``
    * ``downloaded_files``: ``{spot_id: {item_id: hash, "<item_id>_path": filename}}``
    """

    def __init__(self, cookies: Dict[str, str], headers: Dict[str, str], record_file: str = "download_record.json"):
        """
        Args:
            cookies: Cookies sent with every request
            headers: HTTP headers sent with every request
            record_file: Path of the JSON file used to persist crawl/download state
        """
        self.cookies = cookies
        self.headers = headers
        self.base_url = "https://aws.highspot.com/api/v1"
        self.record_file = record_file
        self.record = self._load_record()

    def _load_record(self) -> Dict:
        """Load the record file, or start an empty record if it does not exist"""
        try:
            with open(self.record_file, "r") as f:
                record = json.load(f)
        except FileNotFoundError:
            record = {}
        # Guarantee both sections exist so later lookups cannot fail on a
        # record file that only contains one of them.
        record.setdefault("file_list", {})
        record.setdefault("downloaded_files", {})
        return record

    def _save_record(self):
        """Save the current record to file"""
        # Write to a sibling temp file and swap it in, so an interruption
        # mid-write cannot leave a truncated record behind.
        tmp_path = f"{self.record_file}.tmp"
        with open(tmp_path, "w") as f:
            json.dump(self.record, f, indent=2)
        os.replace(tmp_path, self.record_file)

    def _get_file_hash(self, file_info: Dict) -> str:
        """Generate a hash for file info to detect changes"""
        # Only these fields take part in change detection; missing ones hash as None
        key_fields = ["id", "modified", "version"]
        hash_content = json.dumps({k: file_info.get(k) for k in key_fields}, sort_keys=True)
        return hashlib.md5(hash_content.encode()).hexdigest()

    @staticmethod
    def _safe_filename(name, fallback: str) -> str:
        """Turn a content name into a single path component, or return fallback"""
        if not isinstance(name, str):
            return fallback
        # Path separators would let the name point outside the output directory
        cleaned = name.replace("/", "_").replace("\\", "_")
        if cleaned.strip() in ("", ".", ".."):
            return fallback
        return cleaned

    def get_file_list(
        self, spot_id: str, incremental: bool = False, limit: int = 25, interval: float = 1.0
    ) -> List[Dict]:
        """
        Get list of files from a specific spot, handling pagination
        With incremental mode support
        Args:
            spot_id: The ID of the spot to fetch files from
            incremental: Whether to use incremental mode. In this mode fetching
                stops at the first item whose hash matches the stored record.
            limit: Number of items per request (must be positive)
            interval: Time to wait between requests in seconds
        Returns: List of the items that were fetched and recorded by this call
        Raises:
            ValueError: If limit is not positive
        """
        if limit <= 0:
            # A non-positive limit would never advance the start offset
            raise ValueError("limit must be a positive integer")

        all_items = []
        start = 0
        page = 1
        spot_records = self.record["file_list"].setdefault(spot_id, {})
        url = f"{self.base_url}/spots/{spot_id}/items"

        while True:
            print(f"Fetching page {page}, total files: {len(all_items)}")
            params = {
                "counts": "true",
                "include": "lists",
                "thumbnail": "small",
                "limit": limit,
                "resolve_links": "false",
                "list": "all",
                "buckets": "true",
                "sortby": "time_added",
                "start": start,
                "_": str(int(time.time() * 1000)),
            }

            response = requests.get(url, params=params, cookies=self.cookies, headers=self.headers, timeout=30)
            response.raise_for_status()
            items = response.json()["items"]

            if not items:
                print(f"Finished fetching. Total pages: {page-1}, total files: {len(all_items)}")
                break

            for item in items:
                file_hash = self._get_file_hash(item)
                if incremental and item["id"] in spot_records:
                    if spot_records[item["id"]]["hash"] == file_hash:
                        # Found unchanged file in incremental mode, stop fetching
                        print(f"Found unchanged file, stopping. Total pages: {page}, total files: {len(all_items)}")
                        # Persist the items already recorded from this page
                        # before leaving, otherwise they are lost on restart.
                        self._save_record()
                        return all_items

                # Update record with both hash and complete item data
                spot_records[item["id"]] = {"hash": file_hash, "data": item}
                all_items.append(item)

            # Save record after processing each batch of items
            self._save_record()
            start += limit
            page += 1

            # Add sleep between requests
            time.sleep(interval)

        return all_items

    def get_download_token(self, spot_id: str, item_id: str) -> str:
        """
        Get download token for a specific file
        """
        url = f"{self.base_url}/spots/{spot_id}/items/{item_id}/download/request_token"
        params = {"_": str(int(time.time() * 1000))}

        # Same timeout as the other requests so a stalled connection cannot hang the crawl
        response = requests.get(url, params=params, cookies=self.cookies, headers=self.headers, timeout=30)
        response.raise_for_status()
        return response.json()["contentToken"]

    def download_file(self, spot_id: str, item_id: str, output_dir: str, incremental: bool = False) -> Tuple[str, bool]:
        """
        Download a file using its token, with incremental mode support
        Args:
            spot_id: The ID of the spot the item is recorded under
            item_id: The ID of the item to download
            output_dir: Directory to write the file into (created if missing)
            incremental: Skip the download when the recorded hash is unchanged
        Returns: (output_path, was_downloaded)
        Raises:
            ValueError: If the item is not in the file list record for the spot
        """
        spot_downloads = self.record["downloaded_files"].setdefault(spot_id, {})
        file_info = self.record["file_list"].get(spot_id, {}).get(item_id)
        if file_info is None:
            # Fail before any network call; the hash and name both come from the record
            raise ValueError(
                f"Item {item_id} is not in the file list record for spot {spot_id}; "
                "run get_file_list for this spot first"
            )

        # Check if file was already downloaded in incremental mode
        if incremental and spot_downloads.get(item_id) == file_info["hash"]:
            saved_name = spot_downloads.get(f"{item_id}_path")
            if saved_name is not None:
                # File already downloaded and unchanged
                return (os.path.join(output_dir, saved_name), False)
            # No stored filename: fall through and download again

        # Get download token first
        token = self.get_download_token(spot_id, item_id)

        # Use token to download file
        download_url = "https://api.highspot.com/download"
        params = {"token": token}

        response = requests.get(download_url, params=params, cookies=self.cookies, headers=self.headers, timeout=30)
        response.raise_for_status()

        # Get filename from content name, otherwise use item_id
        filename = self._safe_filename(file_info["data"].get("contentName"), item_id)

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Save file
        output_path = os.path.join(output_dir, filename)
        with open(output_path, "wb") as f:
            f.write(response.content)

        # Update download record
        spot_downloads[item_id] = file_info["hash"]
        spot_downloads[f"{item_id}_path"] = filename
        self._save_record()

        return (output_path, True)

    def get_all_saved_files(self, content_types: List[str] = ["Presentation", "PDF"]) -> List[Dict]:
        """
        Read and return all saved file information from the in-memory record
        Entries from every spot in the record are included.
        Args:
            content_types: List of content types to filter by (defaults to ['Presentation', 'PDF']).
                An empty or None value disables filtering. The default list is never mutated.
        Returns: List of file data dictionaries
        """
        all_files = []
        for files in self.record["file_list"].values():
            files_data = [file_info["data"] for file_info in files.values() if "data" in file_info]
            if content_types:
                files_data = [f for f in files_data if f.get("contentType") in content_types]
            all_files.extend(files_data)
        return all_files

In [None]:
# Run settings for the cells below
spot_id = "60bdbd9634d6be4dbd9ce328"
output_dir = "downloads"
incremental = True  # Set to False for full download

# Session credentials are intentionally left empty here; fill them in locally
# and do not commit real values.
cookies = {}
headers = {}

# Constructing the crawler also loads (or initializes) its record file
crawler = HighspotCrawler(cookies, headers)

In [5]:
# Build the download list from what is already stored in the crawler's record
# (no network request). With default arguments this keeps only entries whose
# "contentType" is "Presentation" or "PDF", across every spot in the record.
files = crawler.get_all_saved_files()
# Alternative: fetch the listing for `spot_id` from the API instead
# (uncomment to use; this also updates the record file).
# files = crawler.get_file_list(spot_id, incremental=incremental)

In [6]:
# Walk the file list and download every entry, reporting the outcome per file.
# A failure on one file is printed and the loop moves on to the next one.
from tqdm import tqdm

for file in tqdm(files):
    try:
        output_path, was_downloaded = crawler.download_file(
            spot_id, file["id"], output_dir, incremental=incremental
        )
        outcome = "Downloaded" if was_downloaded else "Skipped unchanged file"
        print(f"{outcome}: {output_path}")
    except Exception as e:
        print(f"Error downloading {file['id']}: {e}")

  0%|          | 1/911 [00:08<2:10:54,  8.63s/it]

Downloaded: downloads/Zilliz Cloud架构图.pptx


  0%|          | 2/911 [00:15<1:51:33,  7.36s/it]

Downloaded: downloads/Zilliz Cloud方案介绍_详细版.pdf


  0%|          | 3/911 [00:20<1:35:12,  6.29s/it]

Downloaded: downloads/Zilliz Cloud方案介绍_精简版.pdf


  0%|          | 4/911 [00:33<2:20:09,  9.27s/it]

Downloaded: downloads/Dify Innovate 2024演讲材料.pptx


  1%|          | 5/911 [00:47<2:42:34, 10.77s/it]

Downloaded: downloads/Dify架构图.pptx


  1%|          | 6/911 [00:51<2:10:05,  8.62s/it]

Downloaded: downloads/2025 Industry Credit Introduction.pptx


  1%|          | 7/911 [00:55<1:44:46,  6.95s/it]

Downloaded: downloads/voc_architecture.pptx


  1%|          | 8/911 [01:52<5:44:23, 22.88s/it]

Downloaded: downloads/Medical Insights Hub[v1.5.1] FCD - en.pptx


  1%|          | 8/911 [02:28<4:39:54, 18.60s/it]


KeyboardInterrupt: 

In [28]:
import glob
import os

# Write a `<pdf>.metadata.json` sidecar next to every PDF under pdf/ whose
# file name has an entry in `mapping1`.
# NOTE(review): `mapping1` is not defined in any cell shown in this notebook;
# presumably it maps PDF file names to Highspot item ids. Define it in an
# earlier cell so this runs on a fresh kernel.
file_list = glob.glob('pdf/*.pdf')
for row in file_list:
    # File name without the directory part (replaces the hard-coded row[4:] slice)
    pdf_name = os.path.basename(row)
    if pdf_name not in mapping1:
        continue

    sample = {
        "metadataAttributes": {
            "x-amz-bedrock-kb-source-uri": {
                "value": {
                    "type": "STRING",
                    "stringValue": f"https://aws.highspot.com/items/{mapping1[pdf_name]}"
                },
                "includeForEmbedding": True
            },
            "updated_date": {
                "value": {
                    "type": "NUMBER",
                    # NOTE(review): the same fixed value is written for every file;
                    # looks like a YYYYMMDD date stamp. Confirm this is intended.
                    "numberValue": 20240205
                },
                "includeForEmbedding": True
            }
        }
    }
    # Context manager closes (and flushes) the sidecar file deterministically
    with open(row + '.metadata.json', 'w') as f:
        json.dump(sample, f)