## Scripts to build vector databases needed for RAG and fine tuning
In this file there will be scripts for getting data from python docs and stack overflow

In [None]:
!where python

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [12]:
python_versions = ["3.10","3.11","3.12","3.13"]

In [None]:

def extract_text_preserving_structure(html_text):
    soup = BeautifulSoup(html_text, "html.parser")

    structured_text = []

    # Pobieramy elementy w takiej kolejności, jak występują w HTML
    for el in soup.find_all(["h1", "h2", "h3", "p", "li", "blockquote", "code", "pre", "div", "span"]):
        tag = el.name
        text = el.get_text(separator=" ", strip=True)

        if tag == "pre" or (tag == "div" and "pre" in el.get("class", [])):
            structured_text.append(f"```python\n{text}\n```")

        elif tag == "code":
            structured_text.append(f"`{text}`")

        elif tag == "span" and "pre" in el.get("class", []):
            structured_text.append(f"`{text}`")

        elif tag in ["h1", "h2", "h3"]:
            structured_text.append(f"\n## {text}\n")

        elif tag == "li":
            structured_text.append(f"- {text}")

        elif tag == "blockquote":
            structured_text.append(f"> {text}")

        else:
            structured_text.append(text)

    return "\n".join(structured_text)

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

python_docs = []

for version in python_versions:

    website_url = f"https://docs.python.org/{version}/library/"
    index_url = urljoin(website_url, "index.html")
    
    try:
        response = requests.get(index_url)
        response.raise_for_status()
    
        soup = BeautifulSoup(response.text, "html.parser")
    
        links = soup.find_all("a")

        for link in links:
            if link.has_attr("href"):
                href = link["href"]
                print(f"Original href: {href}")
    
                base_href, _, fragment = href.partition("#")
    
                new_link = urljoin(website_url, base_href)
    
                if not new_link.startswith(website_url):
                    continue
    
                try:
                    new_link_response = requests.get(new_link)
                    new_link_response.raise_for_status()
    
                    new_soup = BeautifulSoup(new_link_response.text, "html.parser")
    
                    wrapper_div = new_soup.find("div", class_="bodywrapper")
    
                    if wrapper_div:
                        formatted_text = extract_text_preserving_structure(str(wrapper_div))
                        python_docs.append({"version": version, "url": new_link, "text": formatted_text})
                        
                except requests.exceptions.RequestException as e:
                    print(f"Błąd pobierania")
        print(f"Succesfully loaded python {version} documentation")
    
    except requests.exceptions.RequestException as e:
        print(f"Błąd pobierania strony głównej")

In [None]:

print(python_docs[0]['text'][:100])

In [None]:
import pickle

In [None]:
with open("backup.pkl", "wb") as f:
    pickle.dump(python_docs, f)


In [None]:
with open("backup.pkl", "rb") as f:
    python_docs = pickle.load(f)

### StackOverflow

In [2]:
import os
from dotenv import load_dotenv

In [3]:
load_dotenv()

True

In [4]:
STACK_EXCHANGE_API_KEY = os.getenv("STACK_EXCHANGE_API_KEY")

In [5]:
def fetch_stackoverflow_questions(tag="python", intitle=None, page=1, pagesize=100):
    url = "https://api.stackexchange.com/2.3/search"
    params = {
        "order": "desc",
        "sort": "creation",
        "tagged": tag,
        "site": "stackoverflow",
        "pagesize": pagesize,
        "page": page,
        "filter": "!9_bDDxJY5",
        "key": STACK_EXCHANGE_API_KEY
    }
    
    if intitle:
        params["intitle"] = intitle

    retries = 3
    for attempt in range(retries):
        response = requests.get(url, params=params)
        print(response.status_code)
        
        if response.status_code == 200:
            return response.json().get("items", [])

        elif response.status_code == 429:
            wait_time = 5 * (attempt + 1)
            time.sleep(wait_time)

        else:
            break
    else:
        return []

In [6]:
import time

In [7]:
def fetch_all_stackoverflow_questions(tag="python", intitle=None, max_pages=5, pagesize=20):
    all_questions = []
    for page in range(1, max_pages + 1):
        questions = fetch_stackoverflow_questions(tag, intitle, page, pagesize)
        if not questions:
            break
        all_questions.extend(questions)
        time.sleep(2)

    return all_questions


In [8]:
def fetch_best_answer(question_id):
    url = f"https://api.stackexchange.com/2.3/questions/{question_id}/answers"
    params = {
        "order": "desc",
        "sort": "votes",
        "site": "stackoverflow",
        "filter": "withbody"
    }

    response = requests.get(url, params=params)
    if response.status_code == 200:
        answers = response.json().get("items", [])
        if answers:
            return answers[0]["body"] 
    return "Brak dostępnej odpowiedzi."

In [17]:
def clean_stackoverflow_text(html_text):
    """ Usuwa HTML i konwertuje kod na format Markdown """
    soup = BeautifulSoup(html_text, "html.parser")
    formatted_text = []

    for el in soup.find_all(["p", "pre", "code", "li"]):
        tag = el.name
        text = el.get_text(separator=" ", strip=True)

        if tag == "pre":
            code = el.find("code")
            if code:
                text = f"```python\n{code.get_text()}\n```"
            else:
                text = f"```python\n{text}\n```"

        elif tag == "code" and el.parent.name != "pre":
            text = f"`{text}`"

        elif tag == "li":
            text = f"- {text}"

        formatted_text.append(text)

    return "\n".join(formatted_text)

In [13]:
all_questions = []

for version in python_versions:
    questions_loaded = fetch_all_stackoverflow_questions(tag="python", intitle=f"Python {version}", max_pages=5, pagesize=30)
    all_questions.append({"questions": questions_loaded, "version": version})

200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200


In [18]:
stackoverflow_data = []

for element in all_questions:
    questions = element["questions"]
    version = element["version"]
    
    for i, q in enumerate(questions):
        question_id = q["question_id"]
        best_answer = fetch_best_answer(question_id)

        formatted_question = clean_stackoverflow_text(q["body"])
        formatted_answer = clean_stackoverflow_text(best_answer)

        stackoverflow_data.append({
            "tag": version,
            "question": q["title"],
            "question_body": formatted_question,
            "answer": formatted_answer,
            "link": q["link"]
        })

        if i % 40 == 0:
            print(f"{i} / {len(questions)}")

    print(f"Python {version} successfully finished") 


0 / 150
40 / 150
80 / 150
120 / 150
Python 3.10 successfully finished
0 / 150
40 / 150
80 / 150
120 / 150
Python 3.11 successfully finished
0 / 123
40 / 123
80 / 123
120 / 123
Python 3.12 successfully finished
0 / 31
Python 3.13 successfully finished


In [33]:
print(len(stackoverflow_data))

454


## Stworzenie dynamicznych chunkow

In [None]:
load_dotenv()

### Dokumentacja python

In [19]:
OPEN_AI_API_KEY = os.getenv("OPENAI_API_KEY")

In [20]:
import openai

In [21]:
openai.api_key = OPEN_AI_API_KEY

In [22]:
import re

In [23]:
import tiktoken
tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002")

In [None]:

def split_by_headers(text, max_chunk_size=512, chunk_overlap=100):
    
    sections = re.split(r'\n(## [^\n]+)\n', text) 
    chunks = []
    current_chunk = ""

    for section in sections:
        if section.startswith("## "):
            if current_chunk: 
                chunks.append(current_chunk)
            current_chunk = section + "\n" 
        else:
            section_tokens = tokenizer.encode(section)

            if len(tokenizer.encode(current_chunk)) + len(section_tokens) <= max_chunk_size:
                current_chunk += section + "\n"
            else:
                for i in range(0, len(section_tokens), max_chunk_size - chunk_overlap):
                    chunk_tokens = section_tokens[i:i + max_chunk_size]
                    chunk_text = tokenizer.decode(chunk_tokens)
                    
                    if current_chunk:
                        chunks.append(current_chunk)
                    
                    current_chunk = chunk_text

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

In [24]:
def get_openai_embedding(text):
    response = openai.embeddings.create(
        input=[text],
        model="text-embedding-ada-002"
    )
    return response.data[0].embedding

In [None]:
python_docs_chunks = []

for doc in python_docs:
    chunks = split_by_headers(doc["text"]) 

    for chunk in chunks:
        python_docs_chunks.append({
            "version": doc["version"],
            "url": doc["url"],
            "text": chunk
        })

In [None]:
print(len(python_docs_chunks))

In [None]:
with open("backup_tokens.pkl", "wb") as f:
    pickle.dump(python_docs_chunks, f)

In [None]:
with open("backup_tokens.pkl", "rb") as f:
    python_docs_chunks = pickle.load(f)

In [None]:
tokens = tokenizer.encode(python_docs_chunks[0]['text'])

In [None]:
print(tokens)

## ChromaDB

In [25]:
import chromadb

client = chromadb.PersistentClient(path="chroma_db")
collection = client.get_or_create_collection(name="python_data")

### Saving python docs chunks to chroma_db

In [None]:
i = 0
while i < len(python_docs):
    python_docs[i]['id'] = i
    i += 1

In [None]:
print(python_docs[0])

In [None]:
i = 0
python_docs_chunks = []


for doc in python_docs:
    chunks = split_by_headers(doc["text"]) 

    for chunk in chunks:
        python_docs_chunks.append({
            "version": doc['version'],
            "url": doc['url'],
            "text": chunk,
            "id": i
        })
        i += 1

In [None]:
max_tokens = max(len(tokenizer.encode(chunk['text'])) for chunk in python_docs_chunks)

In [None]:
print(max_tokens)

In [None]:
print(len(python_docs_chunks))

In [None]:
print(len(python_docs))

In [None]:
print(python_docs_chunks[0])

In [None]:
client.delete_collection("python_data")

collection = client.get_or_create_collection(name="python_data")

In [None]:
index_py = 19730

while index_py < len(python_docs_chunks):
    if "embedding" in python_docs_chunks:
        continue
    doc = python_docs_chunks[index_py]
    chunk = doc["text"]
    embedding = get_openai_embedding(chunk)
    doc_id = f"{doc['version']}_{doc['id']}"

    collection.add(
        ids=[doc_id],
        embeddings=[embedding],
        metadatas=[{"version": doc["version"], "url": doc["url"]}],
        documents=[chunk]
    )

    python_docs_chunks[index_py]["id"] = doc_id
    python_docs_chunks[index_py]["embedding"] = embedding
    index_py += 1

    if index_py % 5 == 0:
        print(index_py, end="")

In [None]:
k = 19730

for kk in range(k, k + 5):
    print('embedding' in python_docs_chunks[kk])

In [None]:
print(len(tokenizer.encode(python_docs_chunks[32123]['text'])))

### stackoverflow

In [26]:
def chunk_stackoverflow_entry(question, question_body, answer, min_chunk_size=256, max_chunk_size=512, chunk_overlap=100):
    
    full_text = f"🔹 **Question:** {question}\n\n{question_body}\n\n🔹 **Answer:**\n{answer}"
    tokens = tokenizer.encode(full_text)
    chunks = []

    if len(tokens) <= min_chunk_size:
        return [full_text]
    
    for i in range(0, len(tokens), max_chunk_size - chunk_overlap):
        chunk_tokens = tokens[i:i + max_chunk_size]
        chunk_text = tokenizer.decode(chunk_tokens)
        chunks.append(chunk_text)

    return chunks


In [36]:
stackoverflow_chunks = []
index = 0

for entry in stackoverflow_data:
    chunks = chunk_stackoverflow_entry(
        entry["question"], entry["question_body"], entry["answer"]
    )

    for i, chunk in enumerate(chunks):
        stackoverflow_chunks.append({
            "id": f"stack_{index}",
            "question": entry["question"],
            "text": chunk,
            "url": entry["link"],
            "version": entry["tag"]
        })

        index += 1

print("Finished processing chunks")


Finished processing chunks


In [42]:
index_so = 0

while index_so < len(stackoverflow_chunks):
    doc = stackoverflow_chunks[index_so]
    
    if "embedding" in doc:
        index_so += 1
        continue

    chunk = doc["text"]
    embedding = get_openai_embedding(chunk)
    doc_id = f"stackoverflow_{doc['id']}"

    collection.add(
        ids=[doc_id],
        embeddings=[embedding],
        metadatas=[{"source": "stackoverflow", "question": doc["question"], "url": doc["url"], "version": doc["version"]}],
        documents=[chunk]
    )

    stackoverflow_chunks[index_so]["id"] = doc_id
    stackoverflow_chunks[index_so]["embedding"] = embedding

    if index_so % 20 == 0:
        print(index_so, end="")

    index_so += 1

02040608010012014016018020022024026028030032034036038040042044046048050052054056058060062064066068070072074076078080082084086088090092094096098010001020104010601080110011201140116011801200122012401260128013001320134013601380140014201440146014801500152015401560

In [27]:
import chromadb

In [28]:

client = chromadb.PersistentClient(path="chroma_db")
collection = client.get_or_create_collection(name="python_data")

In [43]:
print(collection.count())

93929


In [29]:
all_metadatas = collection.get()["metadatas"]
all_ids = collection.get()["ids"]

stackoverflow_ids = [doc_id for doc_id, metadata in zip(all_ids, all_metadatas) if metadata.get("source") == "stackoverflow"]

print(len(stackoverflow_ids))



1412


In [30]:
collection.delete(ids=stackoverflow_ids)


Add of existing embedding ID: stackoverflow_i-am-using-python-3-12-and-python-3-10-but-python-3-10-not-working-with-msys2_0
Add of existing embedding ID: stackoverflow_why-is-apache-tvm-available-for-python-3-10-but-not-python-3-12_0
Add of existing embedding ID: stackoverflow_why-does-it-take-longer-to-execute-a-simple-for-loop-in-python-3-12-than-in-pyth_0
