In [42]:
# pip install PyPDF2

In [43]:
# pip install wheel

In [44]:
# pip install pandas

In [45]:
# pip install tiktoken

In [46]:
# pip install openai

In [47]:
# pip install tenacity

In [48]:
import re
# Helper function to clean the description!
def clean_description(description: str):
    """Normalize text extracted from a PDF page into a single clean line.

    Steps, in order: drop every non-ASCII character, turn real newlines and
    literal backslash-n sequences into spaces, collapse runs of two or more
    whitespace characters into one space, and strip both ends.

    Args:
        description: Raw text; may be ``None`` or empty.

    Returns:
        The cleaned text, or ``""`` when ``description`` is falsy.
    """
    if not description:
        return ""

    # Drop every character that has no ASCII representation
    description = description.encode('ascii', 'ignore').decode()

    # Replace real newlines first, then the two-character text "\n"
    description = description.replace('\n', ' ').replace('\\n', ' ')

    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    description = re.sub(r'\s{2,}', ' ', description)

    return description.strip()

In [49]:
from PyPDF2 import PdfReader
import os
import pandas as pd

# Collect the cleaned text of every page of every PDF in the pdf directory
PDF_DIR = './pdf/'

pages = []
# sorted(): os.listdir returns names in arbitrary order, which would make the
# row order differ between runs
for file in sorted(os.listdir(PDF_DIR)):
    # Skip entries that are not PDFs (hidden files, notes, ...) instead of
    # handing them to PdfReader
    if not file.lower().endswith('.pdf'):
        continue
    reader = PdfReader(os.path.join(PDF_DIR, file))
    for page in reader.pages:
        # extracting text from page and clean up
        processed_text = clean_description(page.extract_text())
        pages.append({
            'file_name': file,
            'page_number': reader.get_page_number(page),
            'text': processed_text
        })

In [50]:
import tiktoken

# Model name used as the default both for token counting and for embedding requests
EMBEDDING_MODEL = 'text-embedding-ada-002'
# Token ceiling applied to each chunk and, by default, to each request batch
MAX_INPUT_TOKENS = 8191
# Separator on which an oversized text is split in two
DELIMITER = '. '


def get_num_tokens(text: str, model: str = EMBEDDING_MODEL) -> int:
    """Count the tokens ``text`` encodes to under the tiktoken encoding of ``model``."""
    token_ids = tiktoken.encoding_for_model(model).encode(text)
    return len(token_ids)


def split_string_in_half(string) -> 'list[str, str]':
    """Cut ``string`` at its character midpoint and return both parts as a list.

    For an odd length the second part is the one that is a character longer.
    """
    midpoint = len(string) // 2
    return [string[:midpoint], string[midpoint:]]


def halved_by_delimiter(string: str, num_tokens: int, delimiter: str) -> 'list[str, str]':
    """Split a string in two, on a delimiter, trying to balance tokens on each side.

    Args:
        string: Text to split.
        num_tokens: Token count of ``string``; half of it is the target size
            of the left part.
        delimiter: Separator to split on. The occurrence at the split point
            is not part of either returned half.

    Returns:
        ``[left, right]``. When ``delimiter`` does not occur in ``string``,
        the result of ``split_string_in_half(string)`` is returned instead.
    """
    chunks = string.split(delimiter)
    if len(chunks) == 1:
        # no delimiter found so split in half
        return split_string_in_half(string)
    if len(chunks) == 2:
        return chunks  # no need to search for halfway point

    halfway = num_tokens // 2
    best_diff = halfway
    # Used when the loop never breaks: everything but the last chunk goes left
    split_at = len(chunks) - 1
    for i in range(len(chunks)):
        left_tokens = get_num_tokens(delimiter.join(chunks[: i + 1]))
        diff = abs(halfway - left_tokens)
        if diff >= best_diff:
            # Adding chunk i moved the left side away from the target, so the
            # best boundary is just before it
            split_at = i
            break
        best_diff = diff

    # Never split before the first chunk: that would return the whole string
    # (delimiters included) as the right half, and a caller that keeps
    # splitting oversized halves would never make progress.
    split_at = max(split_at, 1)
    return [delimiter.join(chunks[:split_at]), delimiter.join(chunks[split_at:])]

def chunk_text(text: str) -> 'list[str]':
    """Split ``text`` into pieces that each fit within ``MAX_INPUT_TOKENS``.

    A piece that is too large is halved with ``halved_by_delimiter`` and both
    halves are examined again, until every piece fits. Pieces are returned in
    their original reading order. Text that already fits is returned as the
    single element of the list.

    Args:
        text: The text to split.

    Returns:
        The list of pieces. A piece shorter than two characters is returned
        as is, since it cannot be split any further.
    """
    results = []
    # Explicit stack instead of recursion; the top of the stack is always the
    # next piece in reading order.
    pending = [text]
    while pending:
        piece = pending.pop()
        num_tokens_in_piece = get_num_tokens(piece)

        if num_tokens_in_piece <= MAX_INPUT_TOKENS or len(piece) < 2:
            results.append(piece)
            continue

        left, right = halved_by_delimiter(piece, num_tokens_in_piece, DELIMITER)
        if len(left) >= len(piece) or len(right) >= len(piece):
            # The delimiter split made no progress (one half is as long as the
            # input); cut at the midpoint so the loop is guaranteed to end.
            left, right = split_string_in_half(piece)

        # Push right first so that left is processed first
        pending.append(right)
        pending.append(left)
    return results


In [51]:
# Split every page into chunks that fit the token limit, keeping the page's
# file name and page number on each chunk, and total up the tokens.
chunked_data = []
total_tokens = 0

for page in pages:
    for text in chunk_text(page['text']):
        chunked_data.append({
            'file_name': page['file_name'],
            'page_number': page['page_number'],
            'text': text
        })
        total_tokens += get_num_tokens(text)

df = pd.DataFrame(chunked_data)
print('total tokens: ', total_tokens)
# Last expression of the cell, so the preview is actually displayed
df.tail(20)


total tokens:  48059


In [52]:
def batch_texts_up_to_max_token(texts: 'list[str]', max_token = MAX_INPUT_TOKENS) -> 'list[list[str]]':
    """Group consecutive texts into batches of at most ``max_token`` tokens in total.

    Texts keep their input order, both across and within batches. A text that
    on its own is larger than ``max_token`` is placed in a batch by itself.

    Args:
        texts: Texts to group.
        max_token: Upper bound on the summed token count of one batch.

    Returns:
        The list of batches; empty when ``texts`` is empty. No batch is empty.
    """
    batched_texts = []
    batch = []
    token_so_far = 0
    for text in texts:
        token = get_num_tokens(text)
        # Close the current batch only if it holds something: an oversized
        # text arriving on an empty batch must not emit an empty batch.
        if batch and token + token_so_far > max_token:
            batched_texts.append(batch)
            batch = []
            token_so_far = 0
        batch.append(text)
        token_so_far += token
    if batch:
        batched_texts.append(batch)
    return batched_texts

# Batch the chunk texts, then flatten the batches again to confirm that
# batching neither dropped nor duplicated a text
batched = batch_texts_up_to_max_token(df.text.tolist())
flattened = [text for group in batched for text in group]
assert len(flattened) == len(df.text.tolist())

# print(len(df.text.tolist()))
# print(len(flattened))
# total_tokens = 0
# for string in flattened:
#     total_tokens += get_num_tokens(string)
# print('total tokens: ', total_tokens)

In [53]:
import openai
import os
from tenacity import retry, wait_random_exponential, stop_after_attempt, retry_if_not_exception_type

# Take the API key from the OPENAI_API_KEY environment variable rather than hard-coding it
openai.api_key = os.getenv("OPENAI_API_KEY")

# TODO use a script like api_request_parallel_processor.py to parallelize requests while throttling to stay under rate limits https://github.com/openai/openai-cookbook/blob/31a3a7b406eb452c9bb60e97902b55c25620b9d1/examples/api_request_parallel_processor.py


# Up to three attempts with randomized exponential backoff between them;
# an InvalidRequestError is never retried.
@retry(
    wait=wait_random_exponential(min=1, max=5),
    stop=stop_after_attempt(3),
    retry=retry_if_not_exception_type(openai.InvalidRequestError),
)
def get_embedding(texts_or_tokens, model=EMBEDDING_MODEL):
    """Request embeddings for ``texts_or_tokens`` and return the response's ``"data"`` entry."""
    response = openai.Embedding.create(input=texts_or_tokens, model=model)
    return response["data"]

# Embed batch by batch and collect the vectors in the same order as the rows of df
embeddings = []
for batch in batched:
    response = get_embedding(batch)
    # Each item carries the position of its input within the batch; the
    # vectors are only usable if they come back in input order.
    for i, data in enumerate(response):
        assert i == data["index"], "embeddings returned out of input order"
    batch_embeddings = [data["embedding"] for data in response]
    embeddings.extend(batch_embeddings)

assert len(embeddings) == len(flattened), "number of embeddings differs from number of texts"

df['embeddings'] = embeddings
df.tail(20)


0 {
  "object": "embedding",
  "index": 0,
  "embedding": [
    0.011717384681105614,
    0.012546475976705551,
    0.02488567866384983,
    -0.012358634732663631,
    -0.003591647371649742,
    0.013304316438734531,
    -0.009301362559199333,
    -0.008148408494889736,
    -0.007766249123960733,
    0.002909914357587695,
    -0.0203386340290308,
    0.018499089404940605,
    -0.03831953927874565,
    -0.005029601510614157,
    -0.003938181325793266,
    -0.01348568033427,
    0.01760522462427616,
    -0.019975906237959862,
    0.01253352127969265,
    -0.029277268797159195,
    -0.014314770698547363,
    0.036454085260629654,
    -0.03399272263050079,
    -0.009528066962957382,
    0.013032271526753902,
    0.003429715521633625,
    0.007345226127654314,
    -0.0208179522305727,
    -0.008815567009150982,
    -0.0269454512745142,
    0.008828521706163883,
    0.0012347300071269274,
    -0.01930226944386959,
    -0.01994999684393406,
    0.01390022598206997,
    -0.024186132475733757,


Unnamed: 0,file_name,page_number,text,embeddings
122,2023 Venza Quick Reference Guide.pdf,54,53(1) Press switches and select (7-in display)...,"[0.007605000399053097, 0.0013902155915275216, ..."
123,2023 Venza Quick Reference Guide.pdf,55,54SAFETY & EMERGENCY FEATURES Safety Connect S...,"[-0.00034887916990555823, -0.00691017881035804..."
124,2023 Venza Quick Reference Guide.pdf,56,55 OVERVIEW FEATURES & OPERATIONS TOYOTA SAFET...,"[0.000324595341226086, -0.01599782146513462, 0..."
125,2023 Venza Quick Reference Guide.pdf,57,56SAFETY & EMERGENCY FEATURES The tire pressur...,"[0.0008550581987947226, 0.009235624223947525, ..."
126,2023 Venza Quick Reference Guide.pdf,58,57 OVERVIEW FEATURES & OPERATIONS TOYOTA SAFET...,"[-0.012173568829894066, 0.004775322508066893, ..."
127,2023 Venza Quick Reference Guide.pdf,59,58Floor mat installationSAFETY & EMERGENCY FEA...,"[-0.003626291174441576, 0.009054179303348064, ..."
128,2023 Venza Quick Reference Guide.pdf,60,59Floor mat installation has model-specific fl...,"[-0.003681750502437353, 0.005570851266384125, ..."
129,2023 Venza Quick Reference Guide.pdf,61,60Do not attempt the process while driving.our...,"[-0.015380118042230606, 0.0023981714621186256,..."
130,2023 Venza Quick Reference Guide.pdf,62,61STEP 6STEP 2STEP 1 Do not attempt the proces...,"[-0.004304513335227966, 0.00824757944792509, -..."
131,2023 Venza Quick Reference Guide.pdf,63,62GETTING STARTED WITH Toyota app Pairing1 Blu...,"[-0.0018650945276021957, -0.005736230872571468..."


In [54]:
import pickle

def export_to_pkl(df: pd.DataFrame):
    """Pickle ``df`` and write the bytes to ``manuals_embeddings.pkl`` in the working directory."""
    with open('manuals_embeddings.pkl', 'wb') as out_file:
        out_file.write(pickle.dumps(df))

def export_to_csv(df: pd.DataFrame):
    """Write ``df`` to ``manuals_embeddings.csv`` in the working directory, without the index column."""
    output_path = 'manuals_embeddings.csv'
    df.to_csv(output_path, index=False)


In [55]:
# Persist the chunks and their embeddings in both formats (CSV and pickle)
export_to_csv(df)
export_to_pkl(df)