# Text Search


In [3]:
import pandas as pd
from dotenv import load_dotenv
import os
import json

load_dotenv()


True

In [4]:
import google.generativeai as genai

genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

In [5]:
pd.set_option('display.max_colwidth', 100)

In [48]:
# input_filename = 'data/single_articles_cleaned.csv'
input_filename = 'data/weekly_articles_cleaned.csv'
raw_df = pd.read_csv(input_filename)

In [49]:
df = raw_df.copy()
df.shape

(1802, 3)

In [51]:
df.tail(5)

Unnamed: 0,article_url,text,images
1797,https://www.deeplearning.ai/the-batch/issue-287/,"Dear friends, A “10x engineer” — a widely accepted concept in tech — purportedly has 10 times th...",['https://dl-staging-website.ghost.io/content/images/2025/02/10x_1200px_6-1.jpg']
1798,https://www.deeplearning.ai/the-batch/issue-287/,A MESSAGE FROM DEEPLEARNING.AI Learn in detail how transformer-based large language models work ...,['https://dl-staging-website.ghost.io/content/images/2025/02/The-Batch-ads-and-exclusive-banners...
1799,https://www.deeplearning.ai/the-batch/issue-287/,"Training for Computer Use As Anthropic, Google, OpenAI, and others roll out agents that are capa...",['https://dl-staging-website.ghost.io/content/images/2025/02/UITARS.png']
1800,https://www.deeplearning.ai/the-batch/issue-287/,Gemini Thinks Faster Google updated the December-vintage reasoning model Gemini 2.0 Flash Thinki...,['https://dl-staging-website.ghost.io/content/images/2025/02/FLASH2THINKING.png']
1801,https://www.deeplearning.ai/the-batch/issue-287/,"Okay, But Please Don’t Stop Talking Even cutting-edge, end-to-end, speech-to-speech systems like...",['https://dl-staging-website.ghost.io/content/images/2025/02/MOSHI.gif']


# Indexing

## Gemini Text Embeddings

In [52]:
doc_embeddings = genai.embed_content(
    model="models/text-embedding-004", content=df.text
)["embedding"]

In [53]:
with open('embeddings_weekly.txt', 'w') as file:
    file.write(str(doc_embeddings))

In [54]:
with open('embeddings_weekly.txt', 'r') as file:
    doc_embeddings = json.load(file)

## Milvus Configuration

In [2]:
from pymilvus import MilvusClient

milvus_client = MilvusClient(uri="./thebatch_text.db")

collection_name = "the_batch_text_rag"

2025-02-11 13:47:11,221 [ERROR][_create_connection]: Failed to create new connection using: f2c541d872c44e7db56f354dce2fe2d3 (milvus_client.py:920)


MilvusException: <MilvusException: (code=2, message=Fail connecting to server on unix:/var/folders/wz/btzptp0d0ksbw62y6b78hpy40000gn/T/tmp22rprcll_thebatch_text.db.sock, illegal connection params or server unavailable)>

In [26]:
# if milvus_client.has_collection(collection_name):
#     milvus_client.drop_collection(collection_name)

In [None]:
milvus_client.create_collection(
    collection_name=collection_name,
    dimension=768,  # "models/text-embedding-004" dimension
    vector_field_name= "text_vector",
    auto_id=True,
    metric_type="IP",
    consistency_level="Strong",
)

### Indexing data

In [None]:
data = []

for index, row in df.iterrows():
    data.append({
        "text_vector": doc_embeddings[index],
        "text": row.text,
        "article_url": row.article_url, 
        "image_url": row.images # .image_header_cleaned
        })

In [58]:
milvus_client.insert(collection_name=collection_name, data=data)

{'insert_count': 1802, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215,

# Index Search

In [59]:
question = "What can you tell me about deepseek?"

In [60]:
question_embedding = genai.embed_content(
    model="models/text-embedding-004", content=question
)["embedding"]

In [None]:
search_res = milvus_client.search(
    collection_name=collection_name,
    data=[question_embedding],
    limit=1,
    search_params={"metric_type": "IP", "params": {}},
    output_fields=["text", "article_url", "image_url"],
)

In [65]:
retrieved_lines_with_distances = [
    (
        res["distance"],
        res["entity"]["article_url"],
        res["entity"]["image_url"],
        res["entity"]["text"].replace(u'\u2019', u'\'')
    ) for res in search_res[0]
]
print(json.dumps(retrieved_lines_with_distances, indent=4))

[
    [
        0.6463066339492798,
        "https://www.deeplearning.ai/the-batch/issue-285/",
        "['https://dl-staging-website.ghost.io/content/images/2025/01/The-Batch-ads-and-exclusive-banners--5-.png', 'https://dl-staging-website.ghost.io/content/images/2025/01/unnamed--47-.png']",
        "A MESSAGE FROM\u00a0DEEPLEARNING.AI Explore Computer Use, which enables AI assistants to navigate, use, and accomplish tasks on computers. Taught by Colt Steele, this free course covers Anthropic's model family, its approach to AI research, and\u00a0 capabilities like multimodal prompts and prompt caching. Sign up for free News DeepSeek Sharpens Its Reasoning A new open model rivals OpenAI's o1, and it's free to use or modify. What's new: DeepSeek released DeepSeek-R1 , a large language model that executes long lines of reasoning before producing output. The code and weights are licensed freely for commercial and personal use, including training new models on R1 outputs. The paper provides

# LLM Answering

In [38]:
context = "\n".join(
    [line_with_distance[3] for line_with_distance in retrieved_lines_with_distances]
)
print(context)

A new model from Hangzhou upstart DeepSeek delivers outstanding performance and may change the equation for training costs.What's new: DeepSeek-V3 is an open large language model that outperforms Llama 3.1 405B and GPT-4o on key benchmarks and achieves exceptional scores in coding and math. The weights are open except for applications that involve military uses, harming minors, generating false information, and similar restrictions. You can download them here.Mixture of experts (MoE) basics: The MoE architecture uses different subsets of its parameters to process different inputs. Each MoE layer contains a group of neural networks, or experts, preceded by a gating module that learns to choose which one(s) to use based on the input. In this way, different experts learn to specialize in different types of examples. Because not all parameters are used to produce any given output, the network uses less energy and runs faster than models of similar size that use all parameters to process ev

In [39]:
SYSTEM_PROMPT = """
Human: You are an AI assistant. You are able to find answers to the questions from the articles provided.
"""
USER_PROMPT = f"""
Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>
"""

In [40]:
gemini_model = genai.GenerativeModel(
    "gemini-2.0-flash-lite-preview-02-05", system_instruction=SYSTEM_PROMPT
)


In [41]:
gemini_model

genai.GenerativeModel(
    model_name='models/gemini-2.0-flash-lite-preview-02-05',
    generation_config={},
    safety_settings={},
    tools=None,
    system_instruction='\nHuman: You are an AI assistant. You are able to find answers to the questions from the articles provided.\n',
    cached_content=None
)

In [178]:
response = gemini_model.generate_content(USER_PROMPT)
print(response.text)

DeepSeek-V3 is an open large language model that outperforms Llama 3.1 405B and GPT-4o on key benchmarks, especially in coding and math tasks. It uses a mixture-of-experts (MoE) architecture. This model uses different subsets of its parameters to process different inputs. It is also a mixture-of-experts (MoE) transformer that comprises 671 billion parameters, of which 37 billion are active at any moment. The training cost was $5.6 million to train and was trained on roughly 15 trillion tokens.



In [None]:
# ULEPSZENIA: chunkowanie dokumentów