In [1]:
# !pip install openai pypdf2 qdrant-client llama-index llama-index-embeddings-openai

In [2]:
import os

In [3]:
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [4]:
from openai import OpenAI

client = OpenAI(api_key=OPENAI_API_KEY)

In [5]:
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {
            "role": "system",
            "content": "You are an expert in geography and you answer with a stark, northern accent from game of thrones",
        },
        {"role": "user", "content": "What is the capital of Saudi Arabia?"},
    ],
)

In [6]:
response.model_dump()

{'id': 'chatcmpl-9KPmpbDHtm6y3GIdh7SMlLnlUjOUF',
 'choices': [{'finish_reason': 'stop',
   'index': 0,
   'logprobs': None,
   'message': {'content': "It's Riyadh, from the vast lands of the south, it be.",
    'role': 'assistant',
    'function_call': None,
    'tool_calls': None}}],
 'created': 1714652711,
 'model': 'gpt-4-0613',
 'object': 'chat.completion',
 'system_fingerprint': None,
 'usage': {'completion_tokens': 15, 'prompt_tokens': 39, 'total_tokens': 54}}

In [7]:
response.choices[0].message.content

"It's Riyadh, from the vast lands of the south, it be."

In [8]:
import re

from PyPDF2 import PdfReader


def load_files(dir: str) -> str:
    text = ""
    data_dir = os.path.join(os.getcwd(), dir)
    for filename in os.listdir(data_dir):
        if filename.endswith(".pdf"):
            reader = PdfReader(filename)
            for page in reader.pages:
                content = page.extract_text()
                if content:
                    text += content

        if filename.endswith(".txt"):
            with open(os.path.join(data_dir, filename), "r") as f:
                text += f.read()

    text = re.sub(r"\s+", " ", text)

    return text

In [9]:
txt = load_files("../data/holmes")

In [10]:
txt



In [11]:
client.chat.completions.create(
    model="gpt-4-turbo",
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant that will be given stories and be asked questions about them.",
        },
        {
            "role": "user",
            "content": f"here is your knoweldge: {txt},  ----- now answer this question: Who is the main character in the story?",
        },
    ],
    max_tokens=100,
)

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 128000 tokens. However, your messages resulted in 219689 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

In [12]:
client.chat.completions.create(
    model="gpt-4-turbo",
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant that will be given stories and be asked questions about them.",
        },
        {
            "role": "user",
            "content": f"here is your knoweldge: {txt[:100000]},  ----- now answer this question: Who is the main character in the story?",
        },
    ],
    max_tokens=100,
)

ChatCompletion(id='chatcmpl-9KPmyPDgDaaFlrb0oYyiBQxmtxycG', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="The main character in the story is Sherlock Holmes. He is a consulting detective known for solving unusual and complex cases with his keen observational skills, logical reasoning, and the ability to deduce information that others often overlook. The stories frequently feature Dr. John Watson, Holmes's friend and chronicler of his adventures, who also plays a significant role across the narratives.", role='assistant', function_call=None, tool_calls=None))], created=1714652720, model='gpt-4-turbo-2024-04-09', object='chat.completion', system_fingerprint='fp_ea6eb70039', usage=CompletionUsage(completion_tokens=72, prompt_tokens=23358, total_tokens=23430))

In [13]:
from qdrant_client import QdrantClient, models

In [14]:
qdrant_client = QdrantClient(path="../data/vector-store")

In [15]:
collection_name = "holmes-stories"

qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
)

True

In [16]:
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.embeddings.openai import OpenAIEmbedding

In [17]:
def create_documents(text: str) -> list[str]:
    text_splitter = TokenTextSplitter(chunk_size=8000, chunk_overlap=300)
    documents = text_splitter.split_text(text)
    return documents

In [18]:
docs = create_documents(txt)
docs

['THE ADVENTURE OF THE SPECKLED BAND On glancing over my notes of the seventy odd cases in which I have during the last eight years studied the methods of my friend Sherlock Holmes, I find many tragic, some comic, a large number merely strange, but none commonplace; for, working as he did rather for the love of his art than for the acquirement of wealth, he refused to associate himself with any investigation which did not tend towards the unusual, and even the fantastic. Of all these varied cases, however, I cannot recall any which presented more singular features than that which was associated with the well-known Surrey family of the Roylotts of Stoke Moran. The events in question occurred in the early days of my association with Holmes, when we were sharing rooms as bachelors in Baker Street. It is possible that I might have placed them upon record before, but a promise of secrecy was made at the time, from which I have only been freed during the last month by the untimely death of t

In [19]:
def create_embeddings(docs: list[str]) -> list[float]:
    embed_model = OpenAIEmbedding(
        model="text-embedding-3-small", api_key=OPENAI_API_KEY
    )
    embeddings = [embed_model.get_text_embedding(doc) for doc in docs]
    return embeddings

In [20]:
embedded_docs = create_embeddings(docs)
embedded_docs

[[0.006432522553950548,
  0.03977116569876671,
  -0.052572064101696014,
  0.007056174334138632,
  -0.02925105020403862,
  0.04903684929013252,
  0.01602250523865223,
  -0.03481046110391617,
  -0.016863543540239334,
  -0.0144188292324543,
  0.030419951304793358,
  -0.04667053371667862,
  -0.025673070922493935,
  -0.006457468494772911,
  0.07406844943761826,
  0.0028153422754257917,
  0.022993149235844612,
  0.04048391059041023,
  -0.024090776219964027,
  0.010904996655881405,
  0.023392286151647568,
  0.0016847506631165743,
  0.01305035874247551,
  -0.04022732004523277,
  -0.010997653938829899,
  0.02661389298737049,
  0.012358996085822582,
  0.0032251705415546894,
  0.050947003066539764,
  0.004265778232365847,
  0.0001240621495526284,
  -0.005758978426456451,
  0.0035120504908263683,
  -0.011026163585484028,
  -0.030762068927288055,
  0.02247997373342514,
  -0.014447338879108429,
  -0.04299277067184448,
  -0.019343895837664604,
  -0.03495300933718681,
  0.04629990831017494,
  -0.05739

In [21]:
qdrant_client.upload_points(
    collection_name=collection_name,
    points=[
        models.PointStruct(
            id=idx,
            vector=doc,
        )
        for idx, doc in enumerate(embedded_docs)
    ],
)

In [22]:
def get_query(query: str) -> models.ScoredPoint:
    embed_model = OpenAIEmbedding(
        model="text-embedding-3-small", api_key=OPENAI_API_KEY
    )
    embedded_query = embed_model.get_text_embedding(query)
    res = qdrant_client.search(
        collection_name=collection_name,
        query_vector=embedded_query,
        limit=3,
    )
    return res

In [23]:
vectors = get_query("Who is the main characters in the THE NAVAL TREATY story?")
vectors

[ScoredPoint(id=19, version=0, score=0.3653306235987599, payload={}, vector=None, shard_key=None),
 ScoredPoint(id=8, version=0, score=0.3334691919288226, payload={}, vector=None, shard_key=None),
 ScoredPoint(id=6, version=0, score=0.28413681149024894, payload={}, vector=None, shard_key=None)]

In [27]:
def get_response(prompt: str, context: str) -> str:
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant that will be given stories and be asked questions about them.",
        },
        {
            "role": "user",
            "content": f"here is your knoweldge: {context},  ----- now answer this question: {prompt}",
        },
    ]
    response = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=messages,
        temperature=0.125,
    )
    return response.choices[0].message.content

In [28]:
def get_actual_query(prompt, vectors):
    docuemnt = docs[vectors[0].id]
    return get_response(prompt, docuemnt)

In [29]:
get_actual_query(
    prompt="Who is the main characters in the THE NAVAL TREATY story?", vectors=vectors
)

'The main characters in the Sherlock Holmes story "The Naval Treaty" are:\n\n1. **Sherlock Holmes** - The famous detective who is the protagonist of the story.\n2. **Dr. John Watson** - Holmes\' friend, confidant, and the narrator of the story.\n3. **Percy Phelps** - An old school friend of Watson\'s who works in the Foreign Office and is the victim in the story, having had an important naval treaty stolen from him.\n4. **Joseph Harrison** - Phelps\' fiancée\'s brother, who becomes a key figure in the resolution of the mystery.\n5. **Annie Harrison** - Phelps\' fiancée, who is also Joseph Harrison\'s sister.\n\nThese characters are central to the unfolding of the plot in "The Naval Treaty."'