In [2]:
import pandas as pd
import os
import tiktoken
from openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

# Domain constant (not referenced by the cells visible in this notebook).
DOMAIN = "developer.mozilla.org"

# Load variables from a .env file into the process environment so the
# API key below can be read without hard-coding it in the notebook.
load_dotenv()

# OpenAI client used by the embedding cell further down.
openai = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


In [10]:
# Tokenizer used both to measure each document and to size the chunks.
tokenizer = tiktoken.get_encoding("cl100k_base")


def count_tokens(text):
    """Return the number of cl100k_base tokens in ``text``."""
    return len(tokenizer.encode(text))


scraped = pd.read_csv("../processed/scraped.csv", index_col=0)
scraped.columns = ["title", "text"]
print(f"Len of scraped is {len(scraped)}")

# read_csv represents empty cells as NaN (never None), and the tokenizer only
# accepts strings, so rows without text are dropped *before* tokenizing.
scraped = scraped.dropna(subset=["text"]).assign(
    n_tokens=lambda frame: frame["text"].map(count_tokens)
)

chunk_size = 1000  # max number of tokens per chunk

text_splitter = RecursiveCharacterTextSplitter(
    # Measure length in tokens so that chunk_size really is a token budget;
    # the default-style `len` would cap chunks at 1000 *characters* instead.
    length_function=count_tokens,
    chunk_size=chunk_size,
    chunk_overlap=0,  # No overlap between chunks
    add_start_index=False,  # We don't need start index in this case
)

# Flat list of text pieces: short documents are kept whole, long ones are
# replaced by their chunks.
shortened = []

rows = scraped[["title", "text", "n_tokens"]].itertuples(index=False, name=None)
for title, text, n_tokens in rows:
    if n_tokens > chunk_size:
        print(f"Chunking row {title}")
        shortened.extend(text_splitter.split_text(text))
    else:
        shortened.append(text)

# `df` is the chunk-level frame consumed by the following cells.
df = pd.DataFrame(shortened, columns=["text"])
df["n_tokens"] = df.text.apply(count_tokens)

Len of scraped is 268
Chunking row developer.mozilla.org/en-US/docs/Web/HTML/Attributes
Chunking row developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators#assignment/operators
Chunking row developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Lexical/grammar#exponential
Chunking row developer.mozilla.org/en-US/docs/Web/API/HTMLElement/dragover/event
Chunking row developer.mozilla.org/en-US/docs/Web/Security/Referer/header:/privacy/and/security/concerns
Chunking row developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/const#description
Chunking row developer.mozilla.org/en-US/docs/Web/Accessibility/Understanding/WCAG/Perceivable#guideline/1.4/make/it/easier/for/users/to/see/and/hear/content/including/separating/foreground/from/background
Chunking row developer.mozilla.org/en-US/docs/Web/CSS/Value/definition/syntax#single/bar
Chunking row developer.mozilla.org/en-US/docs/Web/API/Node/removeChild
Chunking row developer.mozilla.org/en-US/docs/Web/JavaScrip

In [None]:
# Number of rows in the chunk-level frame.
df.shape[0]

In [None]:
def create_embedding(x):
    """Embed ``x`` with the ``text-embedding-3-small`` model.

    Sends ``x`` to the embeddings endpoint of the module-level ``openai``
    client and returns the ``embedding`` attribute of the first item in
    the response's ``data`` list.
    """
    response = openai.embeddings.create(model='text-embedding-3-small', input=x)
    first_item = response.data[0]
    return first_item.embedding

# Build the frame that will hold one embedding per text chunk.
df_embeds = pd.DataFrame(shortened, columns=["text"])
# Derive token counts from this frame's own text rather than from another
# DataFrame, so the column never depends on index alignment with `df`.
df_embeds["n_tokens"] = df_embeds["text"].apply(lambda x: len(tokenizer.encode(x)))
print(f"Total rows to process: {len(df_embeds)}")

# Collect the vectors in a list and attach them in one assignment.
# Writing through `df_embeds['embeddings'][i] = ...` is chained assignment:
# it triggers SettingWithCopyWarning and, with pandas Copy-on-Write enabled,
# leaves the frame unchanged.
embeddings = []
for i, text in df_embeds["text"].items():
    print(f"Processing row {i}")
    embeddings.append(create_embedding(text))

# dtype=object keeps each cell as a single list instead of expanding it.
df_embeds["embeddings"] = pd.Series(embeddings, index=df_embeds.index, dtype=object)

In [46]:
# Persist the text, token counts and embeddings as CSV.
df_embeds.to_csv(path_or_buf="../processed/embeddings.csv")