In [1]:
import hashlib
import json
import os
import time
from typing import Dict, List, Optional, Sequence, Tuple

import requests


class HighspotCrawler:
    """List and download the items of a Highspot spot, tracking progress in a JSON record.

    The record (persisted at ``record_file``) has two top-level sections:
        "file_list":        spot_id -> item_id -> {"hash": str, "data": item dict from the API}
        "downloaded_files": spot_id -> {item_id: hash at download time,
                                        "<item_id>_path": file name it was saved under}
    """

    # Timeout, in seconds, applied to every HTTP request made by this class.
    REQUEST_TIMEOUT = 30

    def __init__(self, cookies: Dict[str, str], headers: Dict[str, str], record_file: str = "download_record.json"):
        """
        Args:
            cookies: Cookies sent with every request
            headers: Headers sent with every request
            record_file: Path of the JSON file used to persist the record
        """
        self.cookies = cookies
        self.headers = headers
        self.base_url = "https://aws.highspot.com/api/v1"
        self.record_file = record_file
        self.record = self._load_record()

    def _load_record(self) -> Dict:
        """Load the record file, or start from an empty record if it does not exist"""
        try:
            with open(self.record_file, "r") as f:
                record = json.load(f)
        except FileNotFoundError:
            record = {}
        # Guarantee both sections exist so later lookups cannot raise KeyError.
        record.setdefault("file_list", {})
        record.setdefault("downloaded_files", {})
        return record

    def _save_record(self):
        """Save the current record to file"""
        # Write to a sibling temp file and swap it in, so an interrupted write
        # never leaves a truncated (unparseable) record behind.
        tmp_path = f"{self.record_file}.tmp"
        with open(tmp_path, "w") as f:
            json.dump(self.record, f, indent=2)
        os.replace(tmp_path, self.record_file)

    def _get_file_hash(self, file_info: Dict) -> str:
        """Generate a hash for file info to detect changes"""
        # Only these fields take part in the hash; missing ones hash as null.
        key_fields = ["id", "modified", "version"]
        hash_content = json.dumps({k: file_info.get(k) for k in key_fields}, sort_keys=True)
        return hashlib.md5(hash_content.encode()).hexdigest()

    @staticmethod
    def _safe_filename(name, fallback: str) -> str:
        """Turn a server-supplied name into a single path component.

        Path separators are replaced with "_" so the result can never point
        outside (or into a sub-directory of) the output directory. Names that
        are blank, "." or ".." are replaced by ``fallback``.
        """
        cleaned = str(name).replace("/", "_").replace("\\", "_")
        if cleaned.strip() in ("", ".", ".."):
            return fallback
        return cleaned

    def get_file_list(
        self, spot_id: str, incremental: bool = False, limit: int = 25, interval: float = 1.0
    ) -> List[Dict]:
        """
        Get list of files from a specific spot, handling pagination
        With incremental mode support
        Args:
            spot_id: The ID of the spot to fetch files from
            incremental: If True, stop at the first item whose hash matches the record
            limit: Number of items per request (must be >= 1)
            interval: Time to wait between requests in seconds
        Returns: The items seen during this call (new or changed ones only when
            incremental stops early)
        Raises:
            ValueError: If limit is smaller than 1
        """
        if limit < 1:
            # start would never advance, so the same page would be requested forever.
            raise ValueError("limit must be a positive integer")

        all_items = []
        start = 0
        page = 1
        spot_records = self.record["file_list"].setdefault(spot_id, {})
        url = f"{self.base_url}/spots/{spot_id}/items"

        while True:
            print(f"Fetching page {page}, total files: {len(all_items)}")
            params = {
                "counts": "true",
                "include": "lists",
                "thumbnail": "small",
                "limit": limit,
                "resolve_links": "false",
                "list": "all",
                "buckets": "true",
                "sortby": "time_added",
                "start": start,
                "_": str(int(time.time() * 1000)),
            }

            response = requests.get(
                url, params=params, cookies=self.cookies, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            items = response.json()["items"]

            if not items:
                print(f"Finished fetching. Total pages: {page-1}, total files: {len(all_items)}")
                break

            for item in items:
                file_hash = self._get_file_hash(item)
                known = spot_records.get(item["id"])
                if incremental and known is not None and known["hash"] == file_hash:
                    # Found unchanged file in incremental mode, stop fetching
                    print(f"Found unchanged file, stopping. Total pages: {page}, total files: {len(all_items)}")
                    # Persist the items recorded earlier on this page before
                    # leaving; otherwise they exist only in memory.
                    self._save_record()
                    return all_items

                # Update record with both hash and complete item data
                spot_records[item["id"]] = {"hash": file_hash, "data": item}
                all_items.append(item)

            # Save record after processing each batch of items
            self._save_record()
            start += limit
            page += 1

            # Add sleep between requests
            time.sleep(interval)

        return all_items

    def get_download_token(self, spot_id: str, item_id: str) -> str:
        """
        Get download token for a specific file
        Returns: The "contentToken" value of the token response
        """
        url = f"{self.base_url}/spots/{spot_id}/items/{item_id}/download/request_token"
        params = {"_": str(int(time.time() * 1000))}

        # Same timeout as the other requests, so a stalled connection cannot hang the crawl.
        response = requests.get(
            url, params=params, cookies=self.cookies, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.json()["contentToken"]

    def download_file(self, spot_id: str, item_id: str, output_dir: str, incremental: bool = False) -> Tuple[str, bool]:
        """
        Download a file using its token, with incremental mode support
        Args:
            spot_id: The ID of the spot the item belongs to
            item_id: The ID of the item to download; it must be present in the record
            output_dir: Directory the file is written to (created if missing)
            incremental: If True, skip items whose recorded download hash matches the file list hash
        Returns: (output_path, was_downloaded)
        Raises:
            ValueError: If the item is not in the recorded file list for this spot
        """
        file_info = self.record["file_list"].get(spot_id, {}).get(item_id)
        if file_info is None:
            # Fail before spending any network request on an item we know nothing about.
            raise ValueError(
                f"Item {item_id} is not in the recorded file list of spot {spot_id}; run get_file_list() first"
            )

        # Check if file was already downloaded in incremental mode
        spot_downloads = self.record["downloaded_files"].setdefault(spot_id, {})
        path_key = f"{item_id}_path"
        if incremental and item_id in spot_downloads and path_key in spot_downloads:
            if spot_downloads[item_id] == file_info["hash"]:
                # File already downloaded and unchanged
                return (os.path.join(output_dir, spot_downloads[path_key]), False)

        # Get download token first
        token = self.get_download_token(spot_id, item_id)

        # Use token to download file
        download_url = "https://api.highspot.com/download"
        params = {"token": token}

        response = requests.get(
            download_url, params=params, cookies=self.cookies, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()

        # Get filename from content name, otherwise use item_id. The name comes
        # from the server, so it is reduced to a single safe path component.
        content_name = file_info.get("data", {}).get("contentName") or item_id
        filename = self._safe_filename(content_name, item_id)

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Save file
        output_path = os.path.join(output_dir, filename)
        with open(output_path, "wb") as f:
            f.write(response.content)

        # Update download record
        spot_downloads[item_id] = file_info["hash"]
        spot_downloads[path_key] = filename
        self._save_record()

        return (output_path, True)

    def get_all_saved_files(
        self, content_types: Optional[Sequence[str]] = ("Presentation", "PDF")
    ) -> List[Dict]:
        """
        Return all saved file information held in the record (across every spot)
        Args:
            content_types: Content types to keep (defaults to 'Presentation' and 'PDF');
                pass None or an empty sequence to disable filtering
        Returns: List of file data dictionaries
        """
        all_files = []
        for files in self.record["file_list"].values():
            files_data = [file_info["data"] for file_info in files.values() if "data" in file_info]
            if content_types:
                files_data = [f for f in files_data if f.get("contentType") in content_types]
            all_files.extend(files_data)
        return all_files

In [2]:
# Session credentials are read from the environment so they are never stored
# in the notebook (cookies, CSRF tokens and user ids are secrets):
#   HIGHSPOT_COOKIES  - JSON object mapping cookie name -> cookie value
#   HIGHSPOT_CSRF     - value sent as the HS-CSRF request header
#   HIGHSPOT_USER_ID  - value sent as the HS-USER-ID request header
# NOTE(review): any values that were previously committed in this cell should
# be treated as leaked and invalidated.
_required_env = ("HIGHSPOT_COOKIES", "HIGHSPOT_CSRF", "HIGHSPOT_USER_ID")
_missing_env = [name for name in _required_env if not os.environ.get(name)]
if _missing_env:
    raise RuntimeError(f"Missing required environment variable(s): {', '.join(_missing_env)}")

cookies = json.loads(os.environ["HIGHSPOT_COOKIES"])

headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Content-Type': 'application/json',
    'HS-CSRF': os.environ["HIGHSPOT_CSRF"],
    'HS-USER-ID': os.environ["HIGHSPOT_USER_ID"],
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
}

crawler = HighspotCrawler(cookies, headers)

# Example usage
spot_id = "60bdbd9634d6be4dbd9ce328"
output_dir = "downloads"
incremental = True  # Set to False for full download

In [3]:
# Load the file list already stored in the crawler's record (no request is made).
# With its default arguments this keeps only 'Presentation' and 'PDF' items.
files = crawler.get_all_saved_files()
# Alternative: fetch the file list for spot_id from the API instead
#files = crawler.get_file_list(spot_id, incremental=incremental)

In [None]:
# Walk the file list with a progress bar and download every entry,
# reporting the outcome of each one; a failure on one file does not stop the rest.
from tqdm import tqdm

for file in tqdm(files):
    try:
        output_path, was_downloaded = crawler.download_file(
            spot_id=spot_id,
            item_id=file["id"],
            output_dir=output_dir,
            incremental=incremental,
        )
        if not was_downloaded:
            print(f"Skipped unchanged file: {output_path}")
        else:
            print(f"Downloaded: {output_path}")
    except Exception as e:
        print(f"Error downloading {file['id']}: {str(e)}")

In [28]:
import glob

# Value written to the "updated_date" attribute of every metadata file.
UPDATED_DATE = 20240205


def build_kb_metadata(item_id, updated_date=UPDATED_DATE):
    """Build the metadata document written next to one PDF.

    Args:
        item_id: Highspot item id; it is appended to the item URL stored in
            the "x-amz-bedrock-kb-source-uri" attribute
        updated_date: Number stored in the "updated_date" attribute
    Returns: A JSON-serialisable dict with a single "metadataAttributes" key
    """
    return {
        "metadataAttributes": {
            "x-amz-bedrock-kb-source-uri": {
                "value": {
                    "type": "STRING",
                    "stringValue": f"https://aws.highspot.com/items/{item_id}",
                },
                "includeForEmbedding": True,
            },
            "updated_date": {
                "value": {
                    "type": "NUMBER",
                    "numberValue": updated_date,
                },
                "includeForEmbedding": True,
            },
        }
    }


# NOTE(review): `mapping1` is not defined anywhere in the visible notebook. It is
# used here as a mapping from PDF file name (without the 'pdf/' directory) to
# Highspot item id - confirm where it is built and define it before this cell.
file_list = glob.glob('pdf/*.pdf')
for pdf_path in file_list:
    pdf_name = os.path.basename(pdf_path)
    if pdf_name in mapping1:
        # Write '<pdf path>.metadata.json'; the with-block closes the file handle.
        with open(pdf_path + '.metadata.json', 'w') as f:
            json.dump(build_kb_metadata(mapping1[pdf_name]), f)