In [2]:
!pip install openai

Collecting openai
  Using cached openai-1.52.0-py3-none-any.whl.metadata (24 kB)
Collecting anyio<5,>=3.5.0 (from openai)
  Using cached anyio-4.6.2.post1-py3-none-any.whl.metadata (4.7 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Using cached httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Using cached jiter-0.6.1-cp312-none-win_amd64.whl.metadata (5.3 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Using cached pydantic-2.9.2-py3-none-any.whl.metadata (149 kB)
Collecting sniffio (from openai)
  Using cached sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting typing-extensions<5,>=4.11 (from openai)
  Using cached typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Using cached httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (from ht

In [7]:
import os
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import html2text
import hashlib
import shutil
import readability

# Module-level set of URLs already handed to the scraper. scrape_site() checks it
# before fetching and adds to it afterwards, so link cycles cannot recurse forever.
visited_urls = set()

def download_page(url, timeout=30):
    """
    Downloads the content of a web page from the given URL.

    Args:
        url (str): The URL of the web page to download.
        timeout (float or tuple, optional): Seconds to wait for the server before
            giving up; forwarded unchanged to ``requests.get``. Defaults to 30.

    Returns:
        str: The content of the web page as a string, or None if there was an error.

    Note:
        Request failures (connection errors, timeouts, bad status codes) are not
        propagated: they are caught, reported with ``print`` and signalled by
        returning None.
    """
    try:
        # Without an explicit timeout a single unresponsive server would block
        # the whole crawl indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises an error for bad status codes
        return response.text
    except requests.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return None

def extract_urls(html, base_url, ignored_extensions=('.txt', '.pdf', '.docx')):
    """
    Extracts all same-host URLs from the given HTML content, resolving relative URLs and ignoring hash fragments.

    Args:
        html (str): The HTML content to extract URLs from.
        base_url (str): The base URL used to resolve relative URLs; only links
            whose host (netloc) matches this URL's host are returned.
        ignored_extensions (iterable of str, optional): File extensions (with the
            leading dot) of links to skip. Matched case-insensitively against the
            path component of each link, so query strings do not hide an extension.

    Returns:
        set: A set of URLs extracted from the HTML content.
    """
    soup = BeautifulSoup(html, 'html.parser')
    base_netloc = urlparse(base_url).netloc
    # str.endswith accepts a tuple, so all extensions are checked in one call
    skipped_suffixes = tuple(ext.lower() for ext in ignored_extensions)

    urls = set()
    for link in soup.find_all('a', href=True):
        # Resolve relative URLs and drop the hash fragment
        full_url = urljoin(base_url, link['href'].split('#', 1)[0])
        parsed = urlparse(full_url)
        # Test the path only: "file.pdf?download=1" and "FILE.PDF" are still documents
        if parsed.path.lower().endswith(skipped_suffixes):
            continue
        if parsed.netloc == base_netloc:
            urls.add(full_url)
    return urls



def html_to_markdown(html):
    """
    Turn an HTML page into Markdown text.

    The page is first reduced to its main content with readability, and that
    reduced HTML is then rendered to Markdown by html2text with links kept.

    Parameters:
    html (str): The HTML content to be converted.

    Returns:
    str: The Markdown representation of the HTML content.
    """
    # Strip navigation and other boilerplate, keeping the main article body
    main_content_html = readability.Document(html).summary()

    markdown_writer = html2text.HTML2Text()
    markdown_writer.ignore_links = False  # keep hyperlinks in the output
    markdown_text = markdown_writer.handle(main_content_html)
    return markdown_text

def save_markdown(markdown, folder, filename):
    """
    Writes Markdown text to ``folder/filename`` as UTF-8, creating the folder if needed.

    Args:
        markdown (str): The text to write.
        folder (str): Destination directory; missing parent directories are
            created. An empty string means the current working directory.
        filename (str): Name of the file inside ``folder``. An existing file with
            the same name is overwritten.

    Returns:
        None
    """
    if folder:
        # exist_ok avoids the check-then-create race of a separate os.path.exists test;
        # the guard skips os.makedirs(''), which raises FileNotFoundError.
        os.makedirs(folder, exist_ok=True)
    filepath = os.path.join(folder, filename)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(markdown)


def generate_filename(url, base_url):
    """
    Generate a Markdown filename based on the given URL and base URL.

    The path of ``base_url`` is removed from the front of the path of ``url``
    (only when it matches on whole path segments), the remaining segments are
    joined with hyphens and lower-cased, and ".md" is appended. Query strings
    and fragments are not part of the result.

    Args:
        url (str): The URL from which the filename will be generated.
        base_url (str): The base URL used to remove the common path from the URL.

    Returns:
        str: The generated filename, or "index.md" when nothing remains after
        removing the base path.

    """
    base_path = urlparse(base_url).path.strip('/')
    unique_path = urlparse(url).path.strip('/')

    if base_path:
        if unique_path == base_path:
            unique_path = ''
        elif unique_path.startswith(base_path + '/'):
            # Requiring the "/" keeps a base of "docs" from eating the front of
            # an unrelated sibling such as "docs-old/page".
            unique_path = unique_path[len(base_path) + 1:].strip('/')

    if not unique_path:
        return "index.md"
    # Flatten the remaining path segments into a single hyphenated name
    return unique_path.replace('/', '-').lower() + ".md"


def scrape_site(url, base_url, base_folder=''):
    """
    Scrapes a website starting at ``url``, saving each page as a markdown file.

    Links are followed depth-first using an explicit stack rather than function
    recursion, so long chains of pages cannot exceed Python's recursion limit.
    Only URLs that start with ``base_url`` and share its host are fetched, and
    every URL is fetched at most once (tracked in the module-level
    ``visited_urls`` set, which this function updates).

    Args:
        url (str): The URL of the website to scrape.
        base_url (str): The base URL of the website.
        base_folder (str, optional): The base folder to save the markdown files. Defaults to ''.
            Files are written to ``<base_folder>/<host of base_url>/``.

    Returns:
        None
    """
    base_netloc = urlparse(base_url).netloc
    folder = os.path.join(base_folder, base_netloc)

    pending = [url]
    while pending:
        current = pending.pop()

        # Stay inside the requested section of the site
        if not current.startswith(base_url):
            continue
        # The host check is not implied by the prefix check when base_url has
        # no trailing slash (e.g. "https://a.test" vs "https://a.test.evil/").
        if current in visited_urls or urlparse(current).netloc != base_netloc:
            continue
        visited_urls.add(current)

        print(f"Scraping {current}")
        html = download_page(current)
        if not html:
            continue

        markdown = html_to_markdown(html)
        filename = generate_filename(current, base_url)
        save_markdown(markdown, folder, filename)

        # Links are resolved relative to the page they were found on
        pending.extend(
            link for link in extract_urls(html, current) if link not in visited_urls
        )



def clean_directory(folder):
    """
    Empties the given directory, leaving the directory itself in place.

    Regular files and symbolic links are unlinked; real sub-directories are
    removed recursively. A failure on an individual entry is printed and the
    remaining entries are still processed. Nothing happens if ``folder`` does
    not exist.

    Args:
        folder (str): The path to the directory to be cleaned.

    """
    if not os.path.exists(folder):
        return

    for entry_name in os.listdir(folder):
        file_path = os.path.join(folder, entry_name)
        try:
            is_link = os.path.islink(file_path)
            if os.path.isdir(file_path) and not is_link:
                # A real directory: remove it with everything inside
                shutil.rmtree(file_path)
            elif is_link or os.path.isfile(file_path):
                # Files and links (including links that point at directories)
                os.unlink(file_path)
        except Exception as e:
            print(f'Failed to delete {file_path}. Reason: {e}')

# Example usage

base_url = 'https://vectorbt.pro/'  # Change this URL to your target

# The private path segment acts as a credential: it is read from the environment
# and deliberately never printed, so it does not end up in saved notebook output.
url_secret = os.getenv('VBT_PRO_SECRET_URL')
if not url_secret:
    # Fail before anything is deleted or fetched; otherwise the crawl would
    # target the literal path ".../None/".
    raise RuntimeError(
        "Environment variable VBT_PRO_SECRET_URL is not set; cannot build the start URL."
    )

start_url = f'{base_url}{url_secret}/'  # Only pages below this URL are scraped
base_folder = 'output'

clean_directory(base_folder)
scrape_site(start_url, start_url, base_folder)
print("Scraping complete.")

NameError: name '__file__' is not defined

In [3]:
import openai
import os
from datetime import date
import hashlib
import time

# Read credentials from the process environment (each is None when unset)
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
VECTORBT_PRO_SECRET_URL = os.environ.get("VBT_PRO_SECRET_URL")

# Build the OpenAI API client with the key read above
client = openai.Client(api_key=OPENAI_API_KEY)

# Create a date-based version tag used in the Assistant's and Vector Store's name
def generate_version_tag():
    """
    Build the tag that identifies this upload.

    Returns:
        str: ``"<VECTORBT_PRO_SECRET_URL>_<MM.DD.YYYY>"`` using today's local
        date, or just ``"<MM.DD.YYYY>"`` when VECTORBT_PRO_SECRET_URL is unset
        or empty (instead of failing on ``None + str``).
    """
    date_part = date.today().strftime("%m.%d.%Y")
    if not VECTORBT_PRO_SECRET_URL:
        return date_part
    return f"{VECTORBT_PRO_SECRET_URL}_{date_part}"

version_tag = generate_version_tag()

assistant_name = "QuantGPT {}".format(version_tag)

# System prompt for the assistant: five paragraphs separated by blank lines
assistant_instructions = "\n\n".join([
    "You are a helpful assistant that has a knowledge base uploaded to you containing information on how the closed-source VectorBT (PRO) Python library and its modules work for building financial backtests and simulations.",
    "VectorBT PRO (vectorbtpro) is a next-generation engine for backtesting, algorithmic trading, and research. It's a high-performance, actively-developed, proprietary successor to the vectorbt library, one of the world's most innovative open-source backtesting packages. The PRO version extends the open-source package with new impressive features and useful enhancements.",
    "You are an expert at reading through the provided VectorBT (PRO) documentation and coming up with clear, accurate answers to users' queries.",
    "You have been given a massive index to search through which contains all of the text from VBT (PRO)'s documentation. If you cannot find/retrieve the answer in your vector store, you simply let the user know. Respond saying that you can't find any information on that topic specifically.",
    "Also, FYI, VectorBT (PRO) can also be referred to in this context as VBT, so if VBT is mentioned in the messages, assume the user is referring to this closed source version, NOT the open source `vectorbt`. VectorBT PRO has been completely refactored to improve performance and enable new groundbreaking features, such as parallelization support, so many things are different from how the older, open source version worked.",
])

# Step 1: Create a new Assistant with File Search Enabled
assistant = client.beta.assistants.create(
    model="gpt-4o",
    temperature=0.40,
    name=assistant_name,
    instructions=assistant_instructions,
    # `tools` is optional (defaults to []); at most 128 tools per assistant, each of
    # type code_interpreter, file_search, or function.
    tools=[{"type": "file_search"}, {"type": "code_interpreter"}],
)
print("Assistant created successfully ✔")
print("Assistant Name:", assistant.name)
print("Assistant ID:", assistant.id)

# Step 2: Upload files and add them to a Vector Store
# Define the directories where files are located (there can be one or multiple)
directories = [

    './output/vectorbt.pro/', 

]

# Supported file extensions
supported_extensions = {
    '.c', '.cs', '.cpp', '.doc', '.docx', '.html', '.java', '.json', '.md', 
    '.pdf', '.php', '.pptx', '.py', '.rb', '.tex', '.txt', '.css', '.js', 
    '.sh', '.ts'
}

# Ready the files for upload to OpenAI. This runs before the vector store is
# created so that a missing input directory does not leave an empty store behind.
# Extensions are compared lower-cased so that e.g. "README.MD" is not skipped.
file_paths = [
    os.path.join(directory, filename) 
    for directory in directories
    for filename in os.listdir(directory) 
    if os.path.isfile(os.path.join(directory, filename))
    and os.path.splitext(filename)[1].lower() in supported_extensions
]

# Create a vector store 
vector_store = client.beta.vector_stores.create(name=assistant_name)

# Batch the file uploads
batch_size = 500
file_ids = []
for i in range(0, len(file_paths), batch_size):
    batch = file_paths[i:i+batch_size]
    file_streams = []
    try:
        # Open one by one so that handles opened before a failure are still closed
        for path in batch:
            file_streams.append(open(path, "rb"))
        file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
            vector_store_id=vector_store.id, files=file_streams
        )
    finally:
        # Close the file streams, whether or not the upload succeeded
        for file_stream in file_streams:
            file_stream.close()

    # Print the entire file_batch object to understand its structure
    print(file_batch)
    print(file_batch.status)
    print(file_batch.file_counts)

    # If there are any errors, print them out (the attribute may be absent or None)
    for error in getattr(file_batch, 'errors', None) or []:
        print(f"Error uploading file {error.file}: {error.message}")

# Step 3: Point the assistant's file_search tool at the vector store built above
assistant = client.beta.assistants.update(
    tool_resources=dict(file_search=dict(vector_store_ids=[vector_store.id])),
    assistant_id=assistant.id,
)
print("Assistant updated with vector store: {}".format(vector_store.id))

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable