In [2]:
import numpy as np

from DataIngestion.configurated_website_loaders import news_article_loader
from Preprocessing.text_extraction import SimpleBS4TextExtractor
from Preprocessing.text_spitting import RecursiveTextSplitter
from Preprocessing.image_loaders import RequestsImageLoader
from Preprocessing.image_describer import BLIPImageDescriber
from Embedding.text_embedding import SentenceTransformerTextEmbedding
from VectorStore.chroma_vector_store import ChromaVectorStore
from Internals.adapters import ChromaTextEmbeddingAdapter
from LLM.rag_llm import OllamaRAGLLM

BASE_URL = "https://www.deeplearning.ai/the-batch"
NEW_URL = "https://www.deeplearning.ai/the-batch/issue-284/"

  model: Ollama = pydantic.Field(default=Ollama(model='llama3.2'))


In [3]:
loaded_data = news_article_loader.load(NEW_URL)
loaded_data

ParsedData(url=https://www.deeplearning.ai/the-batch/issue-284/, parsed_tags=dict_keys(['Title', 'Author', 'Published_date', 'Content', 'Summary', 'Tags', 'Paragraph', 'Image']))

In [6]:
text_extractor = SimpleBS4TextExtractor()
image_loader = RequestsImageLoader()

text = text_extractor.extract_text_from_elements(loaded_data.get_all())
images = []
for img in loaded_data.image:
    loaded_img = image_loader.load(img['src'])
    if loaded_img is not None:
        images.append(loaded_img)

In [7]:
text

"issue 284\nNews\nDeepSeek Ups the Open Weights Ante\nU.S. Moves to Expand AI Export Restrictions\nAI Supercomputer on Your Desk\nCalibrating Contrast\n\n\nExplore Courses\nAI Newsletter\n\nCommunity\n\nResources\nCompany\n\n\n\n\n\n\n\n✨ New course! Enroll in Reinforcement Fine-Tuning LLMs with GRPO Explore Courses AI Newsletter The Batch Andrew's Letter Data Points ML Research Blog Community Forum Events Ambassadors Ambassador Spotlight Resources Company About Careers Contact Start Learning Weekly Issues Andrew's Letters Data Points ML Research Business Science Culture Hardware AI Careers About Subscribe The Batch Weekly Issues issue 284 Published Jan 15, 2025 Reading time 13 min read Published Jan 15, 2025 Reading time 13 min read Share Dear friends, Writing software, especially prototypes, is becoming cheaper. This will lead to increased demand for people who can decide what to build. AI Product Management has a bright future! Software is often written by teams that comprise Produc

In [8]:
images

[LoadedImage(url=https://dl-staging-website.ghost.io/content/images/2025/01/AIProductManager-2_1200px-1.jpg),
 LoadedImage(url=https://dl-staging-website.ghost.io/content/images/2025/01/The-Batch-ads-and-exclusive-banners---2024-12-16T174314.640--1-.png),
 LoadedImage(url=https://dl-staging-website.ghost.io/content/images/2025/01/unnamed--45-.png),
 LoadedImage(url=https://dl-staging-website.ghost.io/content/images/2025/01/BIDENCHIPS-10_1200px.jpg),
 LoadedImage(url=https://dl-staging-website.ghost.io/content/images/2025/01/unnamed--47-.jpg),
 LoadedImage(url=https://dl-staging-website.ghost.io/content/images/2025/01/unnamed--44-.gif)]

In [9]:
text_splitter = RecursiveTextSplitter()
image_describer = BLIPImageDescriber()

text_docs = text_splitter.split(text, loaded_data.url)
image_docs = [image_describer.describe(image) for image in images]
docs = text_docs + image_docs
docs

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


[TextDocument(id='b0c2a28ce65fadc891e9f1f7388f8766c580740bbedcd365f2ad3429e2dc03c4-3f135ed6-7ab3-470e-bb23-606cc8d9bb93', content='issue 284\nNews\nDeepSeek Ups the Open Weights Ante\nU.S. Moves to Expand AI Export Restrictions\nAI Supercomputer on Your Desk\nCalibrating Contrast\n\n\nExplore Courses\nAI Newsletter\n\nCommunity\n\nResources\nCompany', source_url='https://www.deeplearning.ai/the-batch/issue-284/'),
 TextDocument(id='223c5a21f701afa70a873faff243339d9b591ef583a3e35e76ea897519c34a57-39883cc8-d053-458b-8b35-5da8e70f610b', content="✨ New course! Enroll in Reinforcement Fine-Tuning LLMs with GRPO Explore Courses AI Newsletter The Batch Andrew's Letter Data Points ML Research Blog Community Forum Events Ambassadors Ambassador Spotlight Resources Company About Careers Contact Start Learning Weekly Issues Andrew's Letters Data Points ML Research Business Science Culture Hardware AI Careers About Subscribe The Batch Weekly Issues issue 284 Published Jan 15, 2025 Reading time 13 m

In [10]:
embedding_func = SentenceTransformerTextEmbedding()
docs_embeddings = np.vstack([embedding_func.encode([doc.content]) for doc in docs])
docs_embeddings

[2025-05-25 02:02:51,149 | sentence_transformers.SentenceTransformer | INFO] -> Use pytorch device_name: cpu
[2025-05-25 02:02:51,149 | sentence_transformers.SentenceTransformer | INFO] -> Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
Batches: 100%|██████████| 1/1 [00:00<00:00, 57.84it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 40.02it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.06it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.54it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 42.63it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.58it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 41.22it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 38.88it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 40.00it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 41.51it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 37.63it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.09it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 26.93it/s]
Batches: 100%|██████

array([[-0.07752761, -0.07712671, -0.00187728, ..., -0.00018483,
        -0.04130648,  0.03334473],
       [-0.07596719, -0.04356242,  0.04169593, ..., -0.04087131,
        -0.01327076,  0.02455314],
       [-0.00362282, -0.09913214,  0.05776122, ..., -0.01191531,
         0.0412215 ,  0.01541616],
       ...,
       [ 0.09326685,  0.0327084 , -0.00491121, ..., -0.02058907,
        -0.0873061 , -0.03950423],
       [ 0.02394519,  0.0708113 , -0.04790877, ...,  0.0354294 ,
        -0.01772317, -0.02503958],
       [-0.06404288,  0.10308387, -0.06203021, ...,  0.03419654,
         0.03382072,  0.08427884]], shape=(101, 384), dtype=float32)

In [15]:
adapted_embedding_func = ChromaTextEmbeddingAdapter(embedding_function=SentenceTransformerTextEmbedding())
vectorstore = ChromaVectorStore(collection_name='new_collection', 
                                 embedding_function=adapted_embedding_func)

[2025-05-25 02:03:26,345 | sentence_transformers.SentenceTransformer | INFO] -> Use pytorch device_name: cpu
[2025-05-25 02:03:26,346 | sentence_transformers.SentenceTransformer | INFO] -> Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [17]:
vectorstore.add_documents(documents=docs,
                          embeddings=docs_embeddings)

In [18]:
query = "What is computer?"
vectorstore.similarity_search(query, k=5)

Batches: 100%|██████████| 1/1 [00:00<00:00, 81.25it/s]


[ImageDocument(id='848f8ad5d57320b6a79a9ff5ec1244061baa479fac35136662280df95e1cd4eb-87a34771-e727-4071-987c-bed37bb19d14', content='a cartoon depicting two men in front of a computer', image=None, source_url='https://dl-staging-website.ghost.io/content/images/2025/01/AIProductManager-2_1200px-1.jpg', image_url='https://dl-staging-website.ghost.io/content/images/2025/01/AIProductManager-2_1200px-1.jpg'),
 TextDocument(id='3606b3a086420ec6180eff7a8613d6e277c849638e3fa4668e562f5799a5e5fd-59614c7f-8d02-4c14-ae5e-e56b0cd90081', content='can be connected to run models such as Meta’s Llama 3.1 405B. Complete specifications are not yet available. Project Digits runs Nvidia’s DGX operating system, a flavor of Ubuntu Linux. The system is based on a GB10 system-on-a-chip that combines the Nvidia Blackwell GPU architecture (which serves as the basis for its latest B100 GPUs) and Grace CPU architecture (designed to manage AI workloads in data centers), connected via high-bandwidth NVLink interconne

In [19]:
from langchain.prompts import PromptTemplate

News_article_prompt_template = PromptTemplate(
    input_variables=["context", "user_query"],
    template="""
You are an advanced AI assistant designed to provide comprehensive and well-reasoned answers to user queries based on relevant news articles. You have access to the following context, which may include text from news articles and associated images (described in text form).

Context:
{context}

Instructions:
- Carefully read the context above, which contains multiple news articles relevant to the user's question.
- If any images are described (e.g., captions, OCR, extracted text), incorporate them into your analysis.
- Summarize and synthesize the information to answer the user's question.
- Ensure your answer is accurate, relevant, and concise.

User query:
{user_query}

Your answer:
"""
)

In [20]:
rag_llm = OllamaRAGLLM(vectorstore=vectorstore, prompt_template=News_article_prompt_template)

In [25]:
user_query = 'computer'
res = rag_llm.query(user_query, k=1)

Batches: 100%|██████████| 1/1 [00:00<00:00, 134.67it/s]


In [26]:
res

RAGLLMResponse(user_query='computer')

In [27]:
print(res.llm_resopnse)

Based on the context provided, I'll provide a comprehensive answer to the user's query "computer".

From the given context, it appears that computers have become an integral part of our daily lives. The image depicts two men sitting in front of a computer, suggesting that they are engaging with this technology for various purposes.

News articles and relevant information:

1. **Computing Industry Trends**: According to recent news articles, the computing industry has seen significant advancements in the field of artificial intelligence (AI) and machine learning (ML). These technologies have enabled computers to become more intelligent and efficient, making them an essential tool for businesses, individuals, and organizations.
2. **Computer Security Concerns**: Another article highlights the growing concern about computer security threats, such as cyber attacks and data breaches. This emphasizes the importance of using secure computers and protecting personal data from malicious online 

In [28]:
res.relevant_docs

[ImageDocument(id='848f8ad5d57320b6a79a9ff5ec1244061baa479fac35136662280df95e1cd4eb-87a34771-e727-4071-987c-bed37bb19d14', content='a cartoon depicting two men in front of a computer', image=None, source_url='https://dl-staging-website.ghost.io/content/images/2025/01/AIProductManager-2_1200px-1.jpg', image_url='https://dl-staging-website.ghost.io/content/images/2025/01/AIProductManager-2_1200px-1.jpg')]