In [1]:
from langchain_ollama import OllamaLLM
import os
import shutil
from langchain.prompts import ChatPromptTemplate
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain.tools import tool
from langchain.tools.render import render_text_description
from langchain_community.embeddings.bedrock import BedrockEmbeddings
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain.schema.document import Document

In [2]:
# LLM handle used by every `model.invoke(...)` call in this notebook.
# NOTE(review): `base_url` carries no URL scheme — presumably the client
# falls back to http://; confirm against the Ollama client in use.
model = OllamaLLM(
    model = 'deepseek-r1:8b',
    base_url = "127.0.0.1:11434"
)

def get_emb():
    """Build the embedding function handed to Chroma.

    The same factory is used when indexing chunks and when re-opening the
    store for querying, so both sides embed text with the same model.

    Returns:
        An ``OllamaEmbeddings`` instance configured for ``nomic-embed-text``.
    """
    return OllamaEmbeddings(model='nomic-embed-text')

In [3]:
# Directory used as Chroma's `persist_directory` (deleted by `clear_database`).
CHROMA_PATH = "chroma"
# Directory that `load_docs` hands to the PDF directory loader.
DATA_PATH = "data"

def load_docs():
    """Load the PDFs found under ``DATA_PATH``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(DATA_PATH).load()`` yields: a list of
        documents whose metadata includes ``source`` and ``page`` (as the
        notebook's printed output shows).
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

def split_docs_chunks(docs: list[Document]):
    """Split loaded documents into smaller, overlapping chunks.

    Args:
        docs: Documents to split (typically the result of ``load_docs``).

    Returns:
        The list produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # Splitter settings gathered in one place so they are easy to tune.
    splitter_options = {
        "chunk_size": 800,
        "chunk_overlap": 80,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(docs)

def add_to_chroma(chunks: list[Document]):
    """Add to the persisted Chroma store only the chunks it does not hold yet.

    Every chunk is first tagged with an ID by ``calculate_chunk_id`` (which
    writes ``metadata["id"]`` on the chunk objects themselves). Chunks whose
    ID is already present in the store are skipped, so running this again on
    the same input adds nothing.

    Args:
        chunks: Chunks to index; their metadata gains an ``"id"`` key.

    Returns:
        None. Progress is reported via ``print``.
    """
    store = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_emb(),
    )
    tagged_chunks = calculate_chunk_id(chunks)

    # Only the "ids" field of the result is used; `include=[]` presumably
    # tells Chroma not to return documents/embeddings — confirm.
    known_ids = set(store.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(known_ids)}")

    pending = [c for c in tagged_chunks if c.metadata["id"] not in known_ids]

    if not pending:
        print("✅ No new documents to add")
        return

    print(f"👉 Adding new documents: {len(pending)}")
    pending_ids = [c.metadata["id"] for c in pending]
    store.add_documents(pending, ids=pending_ids)
    store.persist()

def calculate_chunk_id(chunks):
    """Tag every chunk with an ID of the form ``"<source>:<page>:<index>"``.

    ``<source>`` and ``<page>`` come from the chunk's metadata (a missing key
    is rendered as ``None``), and ``<index>`` counts the chunks seen so far
    for that same source/page pair, starting at 0.

    The index is tracked per page rather than by comparing with the
    immediately preceding chunk, so chunks of one page that are not adjacent
    in ``chunks`` still receive distinct IDs. For input where each page's
    chunks are contiguous the IDs are identical to a simple running counter.

    Args:
        chunks: Iterable of objects exposing a mutable ``metadata`` dict.

    Returns:
        The same ``chunks`` object that was passed in; each element's
        ``metadata["id"]`` has been set in place.
    """
    # Next free index for every "<source>:<page>" key encountered so far.
    next_index_by_page = {}

    for ch in chunks:
        source = ch.metadata.get("source")
        page = ch.metadata.get("page")
        current_page_id = f"{source}:{page}"

        current_chunk_idx = next_index_by_page.get(current_page_id, 0)
        next_index_by_page[current_page_id] = current_chunk_idx + 1

        ch.metadata["id"] = f"{current_page_id}:{current_chunk_idx}"

    return chunks

def clear_database():
    """Delete the ``CHROMA_PATH`` directory tree; do nothing if it is absent."""
    if not os.path.exists(CHROMA_PATH):
        return
    shutil.rmtree(CHROMA_PATH)

In [4]:
# Load the PDFs, then split them into chunks for indexing.
docs = load_docs()
text_chunks = split_docs_chunks(docs)

In [5]:
# Debug dump: metadata and full text of every loaded document.
# NOTE(review): this prints the complete content of every document;
# consider previewing a slice to keep the notebook output readable.
for doc in docs:
    print(f"doc_metadata: {doc.metadata}")
    print(f"doc_content: {doc.page_content}\n")

doc_metadata: {'producer': 'Adobe Acrobat 7.0 Paper Capture Plug-in', 'creator': 'Adobe Acrobat 7.0', 'creationdate': '2007-05-03T12:38:10-04:00', 'moddate': '2007-05-03T12:52:41-04:00', 'source': 'data/monopoly.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}
doc_content: MONOPOLY 
Property Trading Game from Parker Brothers" 
AGES 8+ 
2 to 8 Players 
Contents: Gameboard, 3 dice, tokens, 32 houses, I2 hotels, Chance 
and Community Chest cards, Title Deed cards, play money and a Banker's tray. 
Now there's a faster way to play MONOPOLY. Choose to play by 
the classic rules for buying, renting and selling properties or use the 
Speed Die to get into the action faster. If you've never played the classic 
MONOPOLY game, refer to the Classic Rules beginning on the next page. 
If you already know how to play and want to use the Speed Die, just 
read the section below for the additional Speed Die rules. 
SPEED DIE RULES 
Learnins how to Play with the S~eed Die IS as 
/ 
fast as playing w

In [6]:
# Inspect each chunk's metadata; its "source" and "page" entries are what
# `calculate_chunk_id` builds chunk IDs from.
for ch in text_chunks:
    print(ch.metadata)

{'producer': 'Adobe Acrobat 7.0 Paper Capture Plug-in', 'creator': 'Adobe Acrobat 7.0', 'creationdate': '2007-05-03T12:38:10-04:00', 'moddate': '2007-05-03T12:52:41-04:00', 'source': 'data/monopoly.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}
{'producer': 'Adobe Acrobat 7.0 Paper Capture Plug-in', 'creator': 'Adobe Acrobat 7.0', 'creationdate': '2007-05-03T12:38:10-04:00', 'moddate': '2007-05-03T12:52:41-04:00', 'source': 'data/monopoly.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}
{'producer': 'Adobe Acrobat 7.0 Paper Capture Plug-in', 'creator': 'Adobe Acrobat 7.0', 'creationdate': '2007-05-03T12:38:10-04:00', 'moddate': '2007-05-03T12:52:41-04:00', 'source': 'data/monopoly.pdf', 'total_pages': 8, 'page': 1, 'page_label': '2'}
{'producer': 'Adobe Acrobat 7.0 Paper Capture Plug-in', 'creator': 'Adobe Acrobat 7.0', 'creationdate': '2007-05-03T12:38:10-04:00', 'moddate': '2007-05-03T12:52:41-04:00', 'source': 'data/monopoly.pdf', 'total_pages': 8, 'page': 1, 'page_label

In [7]:
# Smoke test: send a trivial prompt to check that the model responds.
res = model.invoke("Hi")
print(res)

<think>
Okay, the user just said “Hi” – pretty simple and neutral. 

Hmm, they might be testing the waters or just greeting casually. Since it's a cold start, keeping it friendly but not overly enthusiastic is probably safest. No history to reference yet, so I'll default to general tone: warm smile emoji + open-ended question to nudge conversation in a productive direction.

The lack of specific context makes me think they're either:
A) Taking time to decide what to ask (common human behavior)
B) Just checking if the assistant is responsive
C) Possibly feeling hesitant or unsure how to phrase requests

I should position myself as approachable but not pushy. The smile emoji softens the tone while “how can I help you” gives them an easy out – they don't have to commit to anything specific yet. 

Wondering if this is a logged-in vs unlogged-in interaction difference, but probably irrelevant for now unless they mention it later.
</think>
Hi there! 😊 How can I help you today?


In [8]:
# Index the chunks; chunks whose ID is already in the store are skipped.
add_to_chroma(text_chunks)

  emb = OllamaEmbeddings(model='nomic-embed-text')
  db = Chroma(


Number of existing documents in DB: 35
✅ No new documents to add


In [9]:
# Re-open the persisted store and fetch k=5 matches together with their scores.
# NOTE(review): the query says "intership" — presumably a typo for
# "internship"; it is a runtime string, so it is left unchanged here.
db = Chroma(persist_directory = CHROMA_PATH, embedding_function = get_emb())
results = db.similarity_search_with_score("what is the task of intership", k=5)

In [10]:
# Each entry in `results` is a (Document, score) tuple.
for info in results:
    print(info)

(Document(metadata={'producer': 'Skia/PDF m138 Google Docs Renderer', 'source': "data/Wasserstoff Gen-AI Internship Task.pdf; filename_=ISO-8859-1''Wasserstoff_20Gen-AI_20Internship_20Task.pdf", 'title': 'Wasserstoff Gen-AI Internship Task', 'total_pages': 8, 'id': "data/Wasserstoff Gen-AI Internship Task.pdf; filename_=ISO-8859-1''Wasserstoff_20Gen-AI_20Internship_20Task.pdf:0:0", 'page': 0, 'page_label': '1', 'creationdate': '', 'creator': 'PyPDF'}, page_content='Wasserstoff  –  AI  Software  Intern  Task  \n Role:  AI  Intern  (Generative  AI)  –  6  Month  Full-Time  Internship  \n \nLocation:\n \nRemote/Onsite\n \n \nCompany:\n \nWasserstoff\n \n \nContact:\n \nDivyansh\n \nSharma\n \n–\n \ndivyansh.sharma@thewasserstoff.com\n \n \nOverview  \nWelcome  to  Wasserstoff!  \nAs  an  AI  Intern  for  6  months  (full-time),  you  will  engage  in  research-driven  development  \nof\n \nGenerative\n \nAI\n \napplications.\n \nThe\n \ninternship\n \nemphasizes\n \nboth\n \nacademic\n \n

In [11]:
# Prompt skeleton with two placeholders: {context} (the retrieved chunk text)
# and {question} (the user's query).
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [12]:
# Join the retrieved chunk texts (scores are discarded) with a "---" divider,
# then fill the template with that context and the question.
# NOTE(review): the question literal repeats the retrieval query verbatim;
# keep the two in sync if either changes.
context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question = "what is the task of intership")

In [13]:
# Show the template object, the rendered prompt, and the rendered prompt's length.
print(prompt_template)
print(prompt)
print(len(prompt))

input_variables=['context', 'question'] input_types={} partial_variables={} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='\nAnswer the question based only on the following context:\n\n{context}\n\n---\n\nAnswer the question based on the above context: {question}\n'), additional_kwargs={})]
Human: 
Answer the question based only on the following context:

Wasserstoff  –  AI  Software  Intern  Task  
 Role:  AI  Intern  (Generative  AI)  –  6  Month  Full-Time  Internship  
 
Location:
 
Remote/Onsite
 
 
Company:
 
Wasserstoff
 
 
Contact:
 
Divyansh
 
Sharma
 
–
 
divyansh.sharma@thewasserstoff.com
 
 
Overview  
Welcome  to  Wasserstoff!  
As  an  AI  Intern  for  6  months  (full-time),  you  will  engage  in  research-driven  development  
of
 
Generative
 
AI
 
applications.
 
The
 
internship
 
emphasizes
 
both
 
academic
 
research
 
and
 
hands-on
 
implementation,
 
contributi

In [14]:
# Ask the model to answer the question from the retrieved context.
response = model.invoke(prompt)
print(response)

<think>
Okay, so the user wants to know about the internship task based on the provided context. Let me start by reading through the given information carefully.

The title says "Wasserstoff – AI Software Intern Task" and specifies a Generative AI role for 6 months. The location is remote or onsite, contact info is Divyansh Sharma, and there's an overview section. Then there's a checklist with several components including document processing, storage, search functionality, extracting answers, theme identification, code quality, and a demo video.

Wait, the user question is asking what the task of the internship is. The main points from the context are that as an AI intern, you'll develop a Generative AI application. Specific tasks include handling 75+ documents with various formats (PDF, text, images using OCR), storing them for fast search, allowing natural language queries, extracting and citing answers, identifying common themes across the documents, creating a web interface to mana

In [15]:
@tool
def multiply(a: int, b: int) -> int:
    """Multiply two numbers."""
    # The docstring above is what `multiply.description` reports, so its
    # wording is part of the tool's behavior and is kept exactly as written.
    product = a * b
    return product

In [None]:
# Call the tool with its arguments passed as a dict keyed by parameter name.
multiply.invoke({'a': 5, 'b':6})

30

In [14]:
# Inspect what the `@tool` decorator exposes: name, description and argument schema.
print(multiply.name)
print(multiply.description)
print(multiply.args)

multiply
Multiply two numbers.
{'a': {'title': 'A', 'type': 'integer'}, 'b': {'title': 'B', 'type': 'integer'}}
