In [1]:
import sys
sys.path.append("..") # Adds higher directory to python module path
from dotenv import find_dotenv, load_dotenv

load_dotenv(find_dotenv())

In [3]:
# The only three inputs of this notebook. They are passed to
# get_program_links() below, which uses them to search for the program's pages.
university_name = "Mannheim"
program_name = "master in management"
degree = "master degree"

In [4]:
from search.program_url_searcher import get_program_links

In [5]:
# Search the web for pages about the program and collect their URLs.
# NOTE(review): `number=2` presumably caps the links kept per search query
# (the recorded output shows two links per query) — TODO confirm.
url_links = get_program_links(university_name, program_name, degree, number=2)

search: Mannheim master in management master degree introduction

links: ['https://www.uni-mannheim.de/studium/studienangebot/mannheim-master-in-management/', 'https://www.bwl.uni-mannheim.de/media/Fakultaeten/bwl/Dokumente/Studium/MMM/Modulkatalog_Mannheim_Master_in_Management_de.pdf']

search: Mannheim master in management master degree's application deadline

links: ["https://www.uni-mannheim.de/en/academics/dates/application-deadlines/#:~:text=Master's%20Programs&text=The%20application%20deadline%20for%20the,deadlines%20are%20subject%20to%20approval).", 'https://www.mim-essay.com/mannheim-mim']

search: Mannheim master in management master degree, which documents do i need to submit during the online application

links: ['https://www.uni-mannheim.de/en/academics/applying/the-a-to-z-of-applying/necessary-documents-for-the-application/', 'https://www.uni-mannheim.de/en/academics/programs/mannheim-master-in-management/']

search: Mannheim master in management master degree, where can 

In [6]:
from langchain.document_loaders import UnstructuredURLLoader
# Load the content behind every collected URL into `data`.
# NOTE(review): in the recorded run the PDF link failed with
# "name 'partition_pdf' is not defined", so `data` does not contain that
# document — presumably the PDF support of `unstructured` is not installed;
# TODO confirm.
loaders = UnstructuredURLLoader(urls=url_links)
data = loaders.load()



Error fetching or processing https://www.bwl.uni-mannheim.de/media/Fakultaeten/bwl/Dokumente/Studium/MMM/Modulkatalog_Mannheim_Master_in_Management_de.pdf, exception: name 'partition_pdf' is not defined


In [7]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings


In [8]:
# Name of the embedding model. Kept in a variable because it is reused below
# for token counting and for the cost estimate.
embedding_model = "text-embedding-ada-002"
embeddings = OpenAIEmbeddings(model=embedding_model)

In [9]:
# how to know the cost: https://github.com/langchain-ai/langchain/issues/945
# https://github.com/sbslee/kanu/blob/06deef8ae91ba2b949c5e504a220f3fcdace9cf9/kanu/utils.py#L15
# https://openai.com/pricing#language-models
def tokens2price(model, task, tokens):
    """Estimate the cost in USD of processing ``tokens`` tokens.

    Args:
        model: Model name; must be one of the keys of the price table below.
        task: Pricing category of that model: "prompt" or "completion" for
            the chat models, "embedding" for the embedding model.
        tokens: Number of tokens to price.

    Returns:
        The listed price per 1,000 tokens, divided by 1000 and multiplied
        by ``tokens``.

    Raises:
        KeyError: If ``model`` is not in the price table, or ``task`` is not
            a pricing category of ``model``. The message lists the
            supported names.
    """
    # USD per 1,000 tokens. Hard-coded snapshot of the pricing page linked
    # in the comments of this cell — it may be outdated.
    models = {
        "gpt-3.5-turbo"          : {"prompt": 0.0015, "completion": 0.002},
        "gpt-3.5-turbo-16k"      : {"prompt": 0.003,  "completion": 0.004},
        "gpt-4"                  : {"prompt": 0.03,   "completion": 0.06},
        "gpt-4-32k"              : {"prompt": 0.06,   "completion": 0.12},
        "text-embedding-ada-002" : {"embedding": 0.0001},
    }
    if model not in models:
        raise KeyError(
            f"Unknown model {model!r}; supported models: {sorted(models)}"
        )
    task_prices = models[model]
    if task not in task_prices:
        raise KeyError(
            f"Unknown task {task!r} for model {model!r}; "
            f"supported tasks: {sorted(task_prices)}"
        )
    return task_prices[task] / 1000 * tokens

def text2tokens(model, text):
    """Return how many tokens ``text`` encodes to with ``model``'s tiktoken encoding.

    Args:
        model: Model name handed to ``tiktoken.encoding_for_model``.
        text: The string to tokenize.

    Returns:
        The length of the encoded token sequence.
    """
    # Local import: tiktoken is loaded when the function is called, not when
    # it is defined.
    import tiktoken

    token_ids = tiktoken.encoding_for_model(model).encode(text)
    return len(token_ids)

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded pages into overlapping chunks before embedding them.
# chunk_size and chunk_overlap are measured with the splitter's length
# function, which counts characters by default (not words).
# Overlap carries some of the preceding text into the next chunk, which helps
# when a sentence depends on the previous one (temporal dependency).
# Remember that the overlapping text is embedded twice, so it costs more.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(data)

# Count the tokens of every chunk, then price them for the embedding model.
tokens = sum(text2tokens(embedding_model, chunk.page_content) for chunk in docs)
embedding_cost = tokens2price(embedding_model, "embedding", tokens)

In [11]:
# Number of chunks produced by the text splitter
len(docs)

124

In [12]:
# Build a temporary Chroma vector store: embed the chunks and store them.
# Because a persist_directory is supplied, the embeddings are stored on disk.
persist_directory = 'tempDB'

vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory=persist_directory,
)


In [13]:
# Report the embedding cost estimated from the token count of all chunks.
print(f"Total Embedding Cost (USD): ${embedding_cost}")

Total Embedding Cost (USD): $0.0020278
