# Text Search


In [1]:
import pandas as pd
from dotenv import load_dotenv
import os
import json

load_dotenv()


True

In [2]:
import google.generativeai as genai

genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

In [3]:
pd.set_option('display.max_colwidth', 100)

In [5]:
input_filename = 'data/all_articles.csv'
raw_df = pd.read_csv(input_filename)

In [6]:
df = raw_df.copy()
df.shape

(3549, 3)

In [7]:
df

Unnamed: 0,article_url,text,image
0,https://www.deeplearning.ai/the-batch/the-robots-are-winning/,Two prominent economists cast doubt on rosy predictions that automation will create more jobs th...,
1,https://www.deeplearning.ai/the-batch/automatic-annotation/,A new tool promises to speed up the laborious process of annotating computer-vision training dat...,
2,https://www.deeplearning.ai/the-batch/drones-go-commercial/,"Alphabet spin-out Wing launched its consumer drone delivery service, opening doors for specialis...",
3,https://www.deeplearning.ai/the-batch/vcs-bet-on-nlp/,Two startups specializing in NLP reported new financing in the past week as the field heats up.W...,
4,https://www.deeplearning.ai/the-batch/europe-tightens-the-screws/,"The European Commission pulled ahead of the geopolitical pack, issuing guidelines for ethical de...",
...,...,...,...
3544,https://www.deeplearning.ai/the-batch/issue-287/,"Dear friends, A “10x engineer” — a widely accepted concept in tech — purportedly has 10 times th...",['https://dl-staging-website.ghost.io/content/images/2025/02/10x_1200px_6-1.jpg']
3545,https://www.deeplearning.ai/the-batch/issue-287/,A MESSAGE FROM DEEPLEARNING.AI Learn in detail how transformer-based large language models work ...,['https://dl-staging-website.ghost.io/content/images/2025/02/The-Batch-ads-and-exclusive-banners...
3546,https://www.deeplearning.ai/the-batch/issue-287/,"Training for Computer Use As Anthropic, Google, OpenAI, and others roll out agents that are capa...",['https://dl-staging-website.ghost.io/content/images/2025/02/UITARS.png']
3547,https://www.deeplearning.ai/the-batch/issue-287/,Gemini Thinks Faster Google updated the December-vintage reasoning model Gemini 2.0 Flash Thinki...,['https://dl-staging-website.ghost.io/content/images/2025/02/FLASH2THINKING.png']


# Indexing

## Gemini Text Embeddings

In [8]:
doc_embeddings = genai.embed_content(
    model="models/text-embedding-004", content=df.text
)["embedding"]

In [None]:
with open('data/article_embeddings.json', 'w') as file:
    json.dump(doc_embeddings)

## Milvus Configuration

In [10]:
from pymilvus import MilvusClient

milvus_client = MilvusClient(uri="./the_batch.db")

collection_name = "the_batch_text_rag"

In [26]:
# if milvus_client.has_collection(collection_name):
#     milvus_client.drop_collection(collection_name)

In [None]:
milvus_client.create_collection(
    collection_name=collection_name,
    dimension=768,  # "models/text-embedding-004" dimension
    vector_field_name= "vector",
    auto_id=True,
    metric_type="IP",
    consistency_level="Strong",
)

### Indexing data

In [None]:
data = []

for index, row in df.iterrows():
    data.append({
        "vector": doc_embeddings[index],
        "text": row.text,
        "article_url": row.article_url, 
        "image_url": row.image
        })

In [13]:
milvus_client.insert(collection_name=collection_name, data=data)

{'insert_count': 3549, 'ids': [455967111708934144, 455967111708934145, 455967111708934146, 455967111708934147, 455967111708934148, 455967111708934149, 455967111708934150, 455967111708934151, 455967111708934152, 455967111708934153, 455967111708934154, 455967111708934155, 455967111708934156, 455967111708934157, 455967111708934158, 455967111708934159, 455967111708934160, 455967111708934161, 455967111708934162, 455967111708934163, 455967111708934164, 455967111708934165, 455967111708934166, 455967111708934167, 455967111708934168, 455967111708934169, 455967111708934170, 455967111708934171, 455967111708934172, 455967111708934173, 455967111708934174, 455967111708934175, 455967111708934176, 455967111708934177, 455967111708934178, 455967111708934179, 455967111708934180, 455967111708934181, 455967111708934182, 455967111708934183, 455967111708934184, 455967111708934185, 455967111708934186, 455967111708934187, 455967111708934188, 455967111708934189, 455967111708934190, 455967111708934191, 455967111

# Index Search

In [14]:
question = "What can you tell me about deepseek?"

In [15]:
question_embedding = genai.embed_content(
    model="models/text-embedding-004", content=question
)["embedding"]

In [23]:
search_res = milvus_client.search(
    collection_name=collection_name,
    data=[question_embedding],
    limit=1,
    search_params={"metric_type": "IP", "params": {}},
    output_fields=["text", "article_url", "image_url"],
)

In [25]:
retrieved_lines_with_distances = [
    (
        res["distance"],
        res["entity"]["article_url"],
        res["entity"]["image_url"],
        res["entity"]["text"].replace(u'\u2019', u'\'')
    ) for res in search_res[0]
]
print(json.dumps(retrieved_lines_with_distances, indent=4))

[
    [
        0.6463066339492798,
        "https://www.deeplearning.ai/the-batch/issue-285/",
        "['https://dl-staging-website.ghost.io/content/images/2025/01/The-Batch-ads-and-exclusive-banners--5-.png', 'https://dl-staging-website.ghost.io/content/images/2025/01/unnamed--47-.png']",
        "A MESSAGE FROM\u00a0DEEPLEARNING.AI Explore Computer Use, which enables AI assistants to navigate, use, and accomplish tasks on computers. Taught by Colt Steele, this free course covers Anthropic's model family, its approach to AI research, and\u00a0 capabilities like multimodal prompts and prompt caching. Sign up for free News DeepSeek Sharpens Its Reasoning A new open model rivals OpenAI's o1, and it's free to use or modify. What's new: DeepSeek released DeepSeek-R1 , a large language model that executes long lines of reasoning before producing output. The code and weights are licensed freely for commercial and personal use, including training new models on R1 outputs. The paper provides

# LLM Answering

In [26]:
context = "\n".join(
    [line_with_distance[3] for line_with_distance in retrieved_lines_with_distances]
)
print(context)

A MESSAGE FROM DEEPLEARNING.AI Explore Computer Use, which enables AI assistants to navigate, use, and accomplish tasks on computers. Taught by Colt Steele, this free course covers Anthropic's model family, its approach to AI research, and  capabilities like multimodal prompts and prompt caching. Sign up for free News DeepSeek Sharpens Its Reasoning A new open model rivals OpenAI's o1, and it's free to use or modify. What's new: DeepSeek released DeepSeek-R1 , a large language model that executes long lines of reasoning before producing output. The code and weights are licensed freely for commercial and personal use, including training new models on R1 outputs. The paper provides an up-close look at the training of a high-performance model that implements a chain of thought without explicit prompting. ( DeepSeek-R1-lite-preview came out in November with fewer parameters and a different base model.) Mixture of experts (MoE) basics: The MoE architecture uses different subsets of its para

In [27]:
SYSTEM_PROMPT = """
Human: You are an AI assistant. You are able to find answers to the questions from the articles provided.
"""
USER_PROMPT = f"""
Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>
"""

In [28]:
gemini_model = genai.GenerativeModel(
    "gemini-2.0-flash-lite-preview-02-05", system_instruction=SYSTEM_PROMPT
)


In [29]:
response = gemini_model.generate_content(USER_PROMPT)
print(response.text)

DeepSeek-R1 is a large language model that executes long lines of reasoning before producing output. The code and weights are licensed freely for commercial and personal use, including training new models on R1 outputs. DeepSeek-R1 is a version of DeepSeek-V3-Base that was fine-tuned over four stages to enhance its ability to process a chain of thought (CoT). It's a mixture-of-experts transformer with 671 billion total parameters, 37 billion of which are active at any given time, and it processes 128,000 tokens of input context. Access to the model via DeepSeek's API costs $0.55 per million input tokens ($0.14 for cached inputs) and $2.19 per million output tokens. (In comparison, o1 costs $15 per million input tokens, $7.50 for cached inputs, and $60 per million output tokens.)



In [None]:
# ULEPSZENIA: chunkowanie dokumentów