# Pip

In [31]:
# 1. Force upgrade the critical libraries
%pip install -U langchain langchain-core langchain-openai langchain-community pydantic

# 2. IMPORTANT: You must restart the kernel after running this!
# In VS Code/Jupyter: Click "Restart" or "Restart Kernel" in the top toolbar.

I0000 00:00:1767004252.089963 11876611 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers


Collecting langchain-openai
  Using cached langchain_openai-1.1.6-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-openai
  Using cached langchain_openai-1.1.6-py3-none-any.whl.metadata (2.6 kB)
Using cached langchain_openai-1.1.6-py3-none-any.whl (84 kB)
Using cached langchain_openai-1.1.6-py3-none-any.whl (84 kB)
[0mInstalling collected packages: langchain-openai
  Attempting uninstall: langchain-openai
[0m    Found existing installation: langchain-openai 1.1.3
    Uninstalling langchain-openai-1.1.3:
      Successfully uninstalled langchain-openai-1.1.3
Installing collected packages: langchain-openai
  Attempting uninstall: langchain-openai
[0m    Found existing installation: langchain-openai 1.1.3
    Uninstalling langchain-openai-1.1.3:
      Successfully uninstalled langchain-openai-1.1.3
[0mSuccessfully installed langchain-openai-1.1.6
[0mSuccessfully installed langchain-openai-1.1.6
[0mNote: you may need to restart the kernel to use updated packages.
Note: you may 

In [None]:
%pip install sentence-transformers gensim datasets

In [None]:
%pip install --upgrade --force-reinstall datasets sentence-transformers

In [None]:
%pip install --upgrade --force-reinstall gensim numpy

In [None]:
%pip install -U sentence-transformers transformers flash-attn

In [None]:
# Install the missing local server engine
%pip install "pymilvus[milvus_lite]"

# CRITICAL: Restart your kernel again after this!

In [None]:
%pip install pymilvus

In [None]:
%pip install -U pymilvus milvus-lite

In [None]:
# 1. Remove the libraries causing the binary conflict
# (These are optional speed-boosters for Pandas, not required for functionality)
%pip uninstall -y bottleneck numexpr

# 2. Force install a compatible version of Pandas and PyArrow
# This ensures your Pandas matches your current NumPy version
%pip install --upgrade pandas pyarrow numpy>=2.0

# 3. CRITICAL: Restart your kernel now!
# Click "Kernel" -> "Restart Kernel" in the menu.

In [None]:
# 1. Downgrade NumPy to the 1.x version (most compatible)
%pip install "numpy<2.0"

# 2. You MUST restart your kernel after this!
# In VS Code/Jupyter: Click "Restart" or "Restart Kernel" in the top toolbar.

In [None]:
%pip install langchain-milvus langchain-community

In [None]:
%pip install fastapi uvicorn nest-asyncio

# Simple scraping agent

In [None]:
import os
import requests
import pandas as pd
from io import StringIO
from pydantic import BaseModel, Field
from langchain_classic.agents import AgentExecutor, create_react_agent
from langchain_openai import ChatOpenAI
from langchain_core.tools import tool
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate

In [None]:
# Connected to LLM running locally
llm = ChatOpenAI(
    base_url="http://127.0.0.1:1234/v1",
    api_key="lm-studio",
    model="local-model",
    temperature=0,
    streaming=True
)

# Define the Tool
@tool
def fetch_csv_dataset(url: str) -> str:
    """
    Downloads a CSV dataset from a URL and returns a summary.
    Input should be the full URL string.
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        
        # Parse CSV
        content = response.content.decode('utf-8')
        df = pd.read_csv(StringIO(content), on_bad_lines='skip')
        
        return (
            f"SUCCESS: Downloaded data from {url}\n"
            f"Shape: {df.shape}\n"
            f"Columns: {list(df.columns)}\n"
            f"First 5 rows:\n{df.head().to_string()}"
        )
    except Exception as e:
        return f"ERROR: {str(e)}"

tools = [fetch_csv_dataset]

# Define the ReAct Prompt (Hardcoded for stability)
# This teaches the model explicitly how to think and act.
template = '''Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought:{agent_scratchpad}'''

prompt = PromptTemplate.from_template(template)

# 4. Create the ReAct Agent
# This uses simple text generation, avoiding the Pydantic/Tool Binding error completely.
agent = create_react_agent(llm, tools, prompt)

# 5. Create the Executor
agent_executor = AgentExecutor(
    agent=agent, 
    tools=tools, 
    verbose=True, 
    handle_parsing_errors=True # IMPORTANT for local models
)

print("‚úÖ ReAct Agent built successfully.")

In [None]:
test_url = "https://raw.githubusercontent.com/gramener/datasets/refs/heads/main/card_transactions.csv"
query = f"Download the dataset from {test_url} and tell me the columns."

response = agent_executor.invoke({"input": query})
print("\n--- FINAL ANSWER ---")
print(response['output'])

# Vector embedding of 2022-2024 news

### Load and filter data

In [None]:
import pandas as pd
df = pd.read_csv("dataset/guardian_climate_news_corpus.csv")

df['date'] = pd.to_datetime(df['date'], errors="coerce", utc=True).dt.tz_convert(None)
df = df[df['date'].dt.year >= 2022].copy()

df = df[df['label'] != 'UNRELATED_TO_CLIMATE'].copy()

df.reset_index(drop=True, inplace=True)
df

In [None]:
df["label"].value_counts()

In [None]:
df.dtypes

### Making vector embeddings

In [None]:
import pandas as pd
import numpy as np
import ast
from langchain_openai import OpenAIEmbeddings

# 1. SETUP: Load your data
# ------------------------------------------------------------------
# df = pd.read_csv("your_data.csv") # Uncomment to load your real file
# Ensure date is datetime
df['date'] = pd.to_datetime(df['date'])

# 2. CREATE UNIFIED TEXT REPRESENTATION
# ------------------------------------------------------------------
# Instead of training a separate Word2Vec model, we will format the metadata 
# into a structured string that the 8B model can "read" and understand semantically.
# This technique is often called "Text Serialization".

def serialize_row_for_embedding(row):
    # Parse tags safely
    try:
        tags = ast.literal_eval(row['tags']) if isinstance(row['tags'], str) else row['tags']
        tags_str = ", ".join(tags)
    except:
        tags_str = "None"
        
    # Create a rich text block that describes the entire data point
    # We put the most important semantic info (Category, Tags, Date) at the start or end.
    combined_text = (
        f"Category: {row['category']}. "
        f"Tags: {tags_str}. "
        f"Date: {row['date'].strftime('%Y-%m-%d')}. "
        f"Title: {row['title']}\n"
        f"Content: {row['body']}"
    )
    return combined_text

# Apply the function
df['serialized_text'] = df.apply(serialize_row_for_embedding, axis=1)

# 3. EMBED WITH LOCAL LLAMA MODEL (via OpenAI Compatible API)
# ------------------------------------------------------------------
# Assuming you are running the model in LM Studio / Ollama on port 1234
# Check your local server settings for the exact URL.

embedding_model = OpenAIEmbeddings(
    base_url="http://127.0.0.1:1234/v1", # Point to your local server
    api_key="lm-studio",                 # Arbitrary key
    model="Qwen3-Embedding-4B-GGUF",     # The specific model name loaded in your server
    check_embedding_ctx_length=False     # Important for long texts
)

print("Starting embedding process... (This may take time depending on GPU)")

# We process in batches to be safe with memory
batch_size = 32
all_embeddings = []

for i in range(0, len(df), batch_size):
    batch_texts = df['serialized_text'].iloc[i:i+batch_size].tolist()
    
    # Generate embeddings for the batch
    # embed_documents returns a list of lists (vectors)
    batch_embeddings = embedding_model.embed_documents(batch_texts)
    all_embeddings.extend(batch_embeddings)
    
    print(f"Processed rows {i} to {min(i+batch_size, len(df))}")

# 4. STORE RESULTS
# ------------------------------------------------------------------
# Convert to numpy array for use in classifiers or Vector DB
final_features = np.array(all_embeddings)

print(f"Final Feature Matrix Shape: {final_features.shape}")

# Optional: Add back to DataFrame
df['embedding_vector'] = list(final_features)

In [None]:
import numpy as np

# 1. Save the DataFrame (Contains text, metadata, and vectors)
# Pickle is better than CSV because it preserves lists/arrays perfectly.
df.to_pickle("climate_news_data.pkl")

# 2. Save the Raw Numpy Array (Just in case)
# This is the safest way to store the pure mathematical vectors.
np.save("climate_vectors.npy", final_features)

print("Saved 'climate_news_data.pkl' and 'climate_vectors.npy' to disk.")

### Storing embeddings with Milvus

In [None]:
import pandas as pd
import numpy as np

print("üöÄ Attempting to rescue data...")

# 1. Load the Pickle
# Since we are on NumPy 2.x (installed above), this will read the file correctly.
df = pd.read_pickle("climate_news_data.pkl")
print(f"‚úÖ Pickle loaded successfully. Shape: {df.shape}")

# 2. Save as Parquet
# We drop the vector column if it exists to keep the file light (we have the .npy file)
if 'embedding_vector' in df.columns:
    df = df.drop(columns=['embedding_vector'])

df.to_parquet("climate_news_data.parquet")
print("‚úÖ SUCCESS: Data saved to 'climate_news_data.parquet'")

# 3. Verify Vector File
# This usually loads fine regardless of version, but let's check.
vectors = np.load("climate_vectors.npy")
print(f"‚úÖ SUCCESS: Vectors verified. Shape: {vectors.shape}")

In [None]:
import pandas as pd
import numpy as np
from pymilvus import MilvusClient, DataType

# 1. RELOAD YOUR SAVED DATA
# ------------------------------------------------------------------
print("üîÑ Reloading rescued data...")
df = pd.read_parquet("climate_news_data.parquet")
final_features = np.load("climate_vectors.npy")
print(f"‚úÖ Data Loaded. Articles: {len(df)} | Vector Dim: {final_features.shape[1]}")

# 2. SETUP MILVUS LITE
# ------------------------------------------------------------------
client = MilvusClient("./climate_news.db")
COLLECTION_NAME = "climate_articles"

# 3. DEFINE THE SCHEMA
# ------------------------------------------------------------------
if client.has_collection(COLLECTION_NAME):
    client.drop_collection(COLLECTION_NAME)

schema = client.create_schema(auto_id=True, enable_dynamic_field=True)

# Add Fields
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=2560) # Matches your Qwen size
schema.add_field(field_name="category", datatype=DataType.VARCHAR, max_length=512)
schema.add_field(field_name="date", datatype=DataType.VARCHAR, max_length=50)
schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=65535)

# 4. DEFINE INDEX
# ------------------------------------------------------------------
index_params = client.prepare_index_params()
index_params.add_index(
    field_name="vector", 
    index_type="AUTOINDEX", 
    metric_type="COSINE"
)

# 5. CREATE COLLECTION
# ------------------------------------------------------------------
client.create_collection(
    collection_name=COLLECTION_NAME,
    schema=schema,
    index_params=index_params
)
print(f"‚úÖ Collection '{COLLECTION_NAME}' created.")

# 6. INSERT DATA
# ------------------------------------------------------------------
data_to_insert = []
print("Preparing data for insertion...")

for idx, row in df.iterrows():
    vector_list = final_features[idx].tolist()
    date_str = row['date'].strftime('%Y-%m-%d') if pd.notnull(row['date']) else ""
    
    entry = {
        "vector": vector_list,
        "text": str(row['body']),
        "title": str(row['title']),
        "category": str(row['category']),
        "date": date_str,
        "tags": str(row['tags'])
    }
    data_to_insert.append(entry)

# Insert in batches
batch_size = 100
total_inserted = 0

for i in range(0, len(data_to_insert), batch_size):
    batch = data_to_insert[i:i+batch_size]
    res = client.insert(collection_name=COLLECTION_NAME, data=batch)
    total_inserted += res['insert_count']
    print(f"Inserted batch {i} to {i+len(batch)}...")

print(f"‚úÖ SUCCESS! Stored {total_inserted} articles in 'climate_news.db'")

In [None]:
import pandas as pd
import numpy as np
from pymilvus import MilvusClient, DataType
import os
import shutil

# 0. CLEANUP OLD CORRUPTED DB (Optional but recommended)
# ------------------------------------------------------------------
db_path = "./climate_news.db"
if os.path.exists(db_path):
    print(f"‚ö†Ô∏è Found existing database at {db_path}. Removing it...")
    try:
        if os.path.isdir(db_path):
            shutil.rmtree(db_path)
        else:
            os.remove(db_path)
        print("‚úÖ Old database removed.")
    except Exception as e:
        print(f"Could not remove old db: {e}")

# 1. RELOAD YOUR SAVED DATA
# ------------------------------------------------------------------
print("üîÑ Reloading rescued data...")
df = pd.read_parquet("climate_news_data.parquet")
final_features = np.load("climate_vectors.npy")
print(f"‚úÖ Data Loaded. Articles: {len(df)} | Vector Dim: {final_features.shape[1]}")

# 2. SETUP MILVUS LITE (Fresh instance)
# ------------------------------------------------------------------
# Create a fresh client pointing to a clean database file
client = MilvusClient(uri=db_path)
COLLECTION_NAME = "climate_articles"

# 3. DEFINE THE SCHEMA
# ------------------------------------------------------------------
if client.has_collection(COLLECTION_NAME):
    client.drop_collection(COLLECTION_NAME)
    print(f"Dropped existing collection '{COLLECTION_NAME}'")

schema = client.create_schema(auto_id=True, enable_dynamic_field=True)

# Add Fields
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=2560) # Matches your Qwen size
schema.add_field(field_name="category", datatype=DataType.VARCHAR, max_length=512)
schema.add_field(field_name="date", datatype=DataType.VARCHAR, max_length=50)
schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=65535)

# 4. DEFINE INDEX
# ------------------------------------------------------------------
index_params = client.prepare_index_params()
index_params.add_index(
    field_name="vector", 
    index_type="AUTOINDEX", 
    metric_type="COSINE"
)

# 5. CREATE COLLECTION
# ------------------------------------------------------------------
client.create_collection(
    collection_name=COLLECTION_NAME,
    schema=schema,
    index_params=index_params
)
print(f"‚úÖ Collection '{COLLECTION_NAME}' created.")

# 6. INSERT DATA
# ------------------------------------------------------------------
data_to_insert = []
print("Preparing data for insertion...")

for idx, row in df.iterrows():
    vector_list = final_features[idx].tolist()
    date_str = row['date'].strftime('%Y-%m-%d') if pd.notnull(row['date']) else ""
    
    entry = {
        "vector": vector_list,
        "text": str(row['body']),
        "title": str(row['title']),
        "category": str(row['category']),
        "date": date_str,
        "tags": str(row['tags'])
    }
    data_to_insert.append(entry)

# Insert in batches
batch_size = 100
total_inserted = 0

for i in range(0, len(data_to_insert), batch_size):
    batch = data_to_insert[i:i+batch_size]
    res = client.insert(collection_name=COLLECTION_NAME, data=batch)
    total_inserted += res['insert_count']
    print(f"Inserted batch {i} to {i+len(batch)}...")

print(f"‚úÖ SUCCESS! Stored {total_inserted} articles in '{db_path}'")

# RAG Framework

In [None]:
from pymilvus import MilvusClient

client = MilvusClient(uri="./climate_news.db")

In [2]:
import os
from langchain_milvus import Milvus
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# 1. SETUP MODELS
embeddings = OpenAIEmbeddings(
    base_url="http://127.0.0.1:1234/v1",
    api_key="lm-studio",
    model="Qwen3-Embedding-4B-GGUF",
    check_embedding_ctx_length=False
)

llm = ChatOpenAI(
    base_url="http://127.0.0.1:8000/v1", # Verify this is your Ministral server port
    api_key="local-key",
    model="mistralai/Ministral-3-14B-Reasoning-2512",
    temperature=0.1
)

# 2. CONNECT TO VECTOR STORE
# Note: Ensure climate_news.db is NOT open in any other software
vector_store = Milvus(
    embedding_function=embeddings,
    connection_args={"uri": "./climate_news.db"},
    collection_name="climate_articles",
    text_field="text",
    auto_id=True
)

retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# 3. DEFINE RAG LOGIC
template = """You are a specialized Climate News Assistant.
Use the context below to answer. If unsure, say you can't find it.

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:"""

prompt = PromptTemplate.from_template(template)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
print("‚úÖ RAG Chain initialized.")

‚úÖ RAG Chain initialized.


In [3]:
# Test the system
query = "Tell me any news about tiny snails on Atlantic Island"
response = rag_chain.invoke(query)

In [4]:
print(f"‚ùì Question: {query}\n")
print("ü§ñ Agent Answer:")
print(response)

‚ùì Question: Tell me any news about tiny snails on Atlantic Island

ü§ñ Agent Answer:
More than 1,300 tiny, critically endangered snails have been set free to roam on an island off the coast of Morocco after a breeding programme rescued two obscure species from the brink of extinction. The Desertas Island land snails had not been recorded for more than 100 years and were believed to have disappeared from their natural habitat on the windswept, mountainous island of Deserta Grande, close to Portugal-owned Madeira. Experts at the Instituto das Florestas e Conserva√ß√£o da Natureza (IFCN) rediscovered minute populations of two species of the snail, each consisting of fewer than 200 survivors, in conservation expeditions between 2012 and 2017 amid fears that invasive predators might have eaten the pea-sized molluscs into oblivion. The snails were taken to zoos in the UK and France, with 60 flown to Chester zoo, where the conservation science team liaised with experts in Madeira and const

# Embedding PDFs

In [14]:
from pymilvus import MilvusClient
import os
import shutil

DB_PATH = "./climate_news.db"
COLLECTION_NAME = "research_papers"

client = MilvusClient(uri=DB_PATH)

if client.has_collection(COLLECTION_NAME):
    print(f"‚ö†Ô∏è Dropping old collection '{COLLECTION_NAME}'")
    client.drop_collection(COLLECTION_NAME)

print("‚úÖ Clean slate ready.")

‚ö†Ô∏è Dropping old collection 'research_papers'
‚úÖ Clean slate ready.


In [15]:
import requests
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

PAPER_FOLDER = "./research_paper"
EMBED_URL = "http://127.0.0.1:1234/v1/embeddings"
MODEL_NAME = "text-embedding-qwen3-embedding-4b"

loader = PyPDFDirectoryLoader(PAPER_FOLDER)
raw_docs = loader.load()

splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=80)
docs = splitter.split_documents(raw_docs)

def embed_batch(texts):
    resp = requests.post(
        EMBED_URL,
        json={"model": MODEL_NAME, "input": texts},
        timeout=60
    )
    resp.raise_for_status()
    return [x["embedding"] for x in resp.json()["data"]]

vectors = []
texts = []
metas = []

batch_size = 20

for i in range(0, len(docs), batch_size):
    batch = docs[i:i+batch_size]
    batch_texts = [d.page_content.strip() for d in batch]

    emb = embed_batch(batch_texts)

    vectors.extend(emb)
    texts.extend(batch_texts)

    for d in batch:
        metas.append({
            "source": d.metadata.get("source", ""),
            "page": int(d.metadata.get("page", -1))
        })

    print(f"‚úÖ Embedded {i+len(batch)} / {len(docs)}")


‚úÖ Embedded 20 / 411
‚úÖ Embedded 40 / 411
‚úÖ Embedded 40 / 411
‚úÖ Embedded 60 / 411
‚úÖ Embedded 60 / 411
‚úÖ Embedded 80 / 411
‚úÖ Embedded 80 / 411
‚úÖ Embedded 100 / 411
‚úÖ Embedded 100 / 411
‚úÖ Embedded 120 / 411
‚úÖ Embedded 120 / 411
‚úÖ Embedded 140 / 411
‚úÖ Embedded 140 / 411
‚úÖ Embedded 160 / 411
‚úÖ Embedded 160 / 411
‚úÖ Embedded 180 / 411
‚úÖ Embedded 180 / 411
‚úÖ Embedded 200 / 411
‚úÖ Embedded 200 / 411
‚úÖ Embedded 220 / 411
‚úÖ Embedded 220 / 411
‚úÖ Embedded 240 / 411
‚úÖ Embedded 240 / 411
‚úÖ Embedded 260 / 411
‚úÖ Embedded 260 / 411
‚úÖ Embedded 280 / 411
‚úÖ Embedded 280 / 411
‚úÖ Embedded 300 / 411
‚úÖ Embedded 300 / 411
‚úÖ Embedded 320 / 411
‚úÖ Embedded 320 / 411
‚úÖ Embedded 340 / 411
‚úÖ Embedded 340 / 411
‚úÖ Embedded 360 / 411
‚úÖ Embedded 360 / 411
‚úÖ Embedded 380 / 411
‚úÖ Embedded 380 / 411
‚úÖ Embedded 400 / 411
‚úÖ Embedded 400 / 411
‚úÖ Embedded 411 / 411
‚úÖ Embedded 411 / 411


In [16]:
from pymilvus import MilvusClient, DataType

DIM = len(vectors[0])
client = MilvusClient(uri=DB_PATH)

schema = client.create_schema(auto_id=True, enable_dynamic_field=False)

schema.add_field("id", DataType.INT64, is_primary=True)
schema.add_field("vector", DataType.FLOAT_VECTOR, dim=DIM)
schema.add_field("text", DataType.VARCHAR, max_length=65535)
schema.add_field("source", DataType.VARCHAR, max_length=1024)
schema.add_field("page", DataType.INT64)

index_params = client.prepare_index_params()
index_params.add_index(
    field_name="vector",
    index_type="AUTOINDEX",
    metric_type="COSINE"
)

client.create_collection(
    collection_name="research_papers",
    schema=schema,
    index_params=index_params
)

print("‚úÖ Collection created.")


‚úÖ Collection created.


In [17]:
data = []

for i in range(len(vectors)):
    data.append({
        "vector": vectors[i],
        "text": texts[i],
        "source": metas[i]["source"],
        "page": metas[i]["page"]
    })

batch_size = 100
for i in range(0, len(data), batch_size):
    client.insert(
        collection_name="research_papers",
        data=data[i:i+batch_size]
    )
    print(f"Inserted {i+batch_size} / {len(data)}")

print("üöÄ SUCCESS: research_papers indexed.")


Inserted 100 / 411
Inserted 200 / 411
Inserted 300 / 411
Inserted 400 / 411
Inserted 500 / 411
üöÄ SUCCESS: research_papers indexed.


In [19]:
query = "What does the paper say about climate mitigation strategies?"

query_emb = embed_batch([query])[0]

res = client.search(
    collection_name="research_papers",
    data=[query_emb],
    anns_field="vector",
    limit=3,
    output_fields=["text", "source", "page"]
)

for hit in res[0]:
    print("\n---")
    print("Score:", hit["distance"])
    print("Source:", hit.get("source", "N/A"))
    print("Page:", hit.get("page", "N/A"))
    print(hit["text"][:500])




---
Score: 0.5711402297019958
Source: research_paper/2401.09646v1.pdf
Page: 46
tions on climate change.\n Cite the documents provided in
the context.
StackExchange You‚Äôre an AI assistant generating answers to questions on
the website stackexchange on the topic {source}.
AppTek General You‚Äôre a helpful and harmless AI assistant.
OASST-1 You‚Äôre Open Assistant, an AI language model, developed
by Laion AI together with an open source community and
trained using crowdsourced data.
Dolly You‚Äôre an AI language model trained on data generated by
employees of databricks.
Llama-2 Sa

---
Score: 0.5260053277015686
Source: research_paper/2505.18653v1.pdf
Page: 7
Task 0-Shot 5-Shot Diff.
CDP-QA-Cities .55 (.10) .63 (.06) .07
CDP-QA-Corp. .53 (.11) .62 (.05) .09
CDP-QA-States .56 (.11) .64 (.06) .07
CDP-Topic-Cities .32 (.02) .35 (.01) .03
Climate Commit. .61 (.09) .67 (.04) .06
Climate Detection .57 (.15) .71 (.06) .14
Climate Eng .50 (.08) .54 (.05) .04
Climate NER .13 (.06) .20 (.05) .08

In [29]:
import os
from langchain_milvus import Milvus
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.embeddings import Embeddings

# ------------------------------------------------------------------
# 1. FAKE EMBEDDINGS (IMPORTANT)
# We already embedded research papers manually.
# LangChain must NOT try to re-embed stored documents.
# ------------------------------------------------------------------
class FakeEmbeddings(Embeddings):
    def embed_documents(self, texts): 
        return []

    def embed_query(self, text): 
        return []

# ------------------------------------------------------------------
# 2. REAL QUERY EMBEDDINGS (for search only)
# ------------------------------------------------------------------
query_embeddings = OpenAIEmbeddings(
    base_url="http://127.0.0.1:1234/v1",
    api_key="lm-studio",
    model="text-embedding-qwen3-embedding-4b",
    check_embedding_ctx_length=False
)

# ------------------------------------------------------------------
# 3. LLM
# ------------------------------------------------------------------
llm = ChatOpenAI(
    base_url="http://127.0.0.1:1234/v1",  # Ministral server
    api_key="local-key",
    model="nvidia/nemotron-3-nano",
    temperature=0.1
)

# ------------------------------------------------------------------
# 4. CONNECT TO RESEARCH PAPERS VECTOR STORE
# ------------------------------------------------------------------
vector_store = Milvus(
    embedding_function=FakeEmbeddings(),   # ‚¨ÖÔ∏è CRITICAL
    connection_args={"uri": "./climate_news.db"},
    collection_name="research_papers",
    vector_field="vector",                 # ‚¨ÖÔ∏è matches schema
    text_field="text",
    auto_id=True
)

def retrieve_docs(query: str, k: int = 3):
    # 1. Embed query explicitly (DENSE)
    query_vector = query_embeddings.embed_query(query)

    # 2. Search using vector directly
    docs = vector_store.similarity_search_by_vector(
        embedding=query_vector,
        k=k
    )
    return docs


# ------------------------------------------------------------------
# 5. PROMPT (paper-aware, citation-friendly)
# ------------------------------------------------------------------
template = """You are an academic research assistant.

You are given excerpts from research papers.
Use ONLY the provided context to answer when possible.
If the answer is not found, say you could not find this
information in the provided research papers.

CONTEXT:
{context}

QUESTION:
{question}

ANSWER (cite sources if relevant):"""

prompt = PromptTemplate.from_template(template)

def format_docs(docs):
    blocks = []
    for d in docs:
        source = d.metadata.get("source", "unknown")
        page = d.metadata.get("page", "N/A")
        blocks.append(
            f"[Source: {source}, Page: {page}]\n{d.page_content}"
        )
    return "\n\n".join(blocks)

# ------------------------------------------------------------------
# 6. RAG CHAIN
# ------------------------------------------------------------------
rag_chain = (
    {
        "context": retrieve_docs,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)

print("‚úÖ Research Paper RAG Chain initialized.")


‚úÖ Research Paper RAG Chain initialized.


In [26]:
query = "How is ClimateGPT model trained, and what is the model architecture like?"
response = rag_chain.invoke(query)

In [27]:

print(f"‚ùì Question: {query}\n")
print("ü§ñ Agent Answer:")
print(response)

‚ùì Question: How is ClimateGPT model trained, and what is the model architecture like?

ü§ñ Agent Answer:

**Training**  
- ClimateGPT was pre‚Äëtrained on a compute cluster using a fork of the **Megatron‚ÄëLLM** repository‚ÄØ[Document‚ÄØ1, page‚ÄØ40].  
- After the pre‚Äëtraining phase, the model underwent **instruction fine‚Äëtuning (IFT)** to align its outputs with the expected format‚ÄØ[Document‚ÄØ1, page‚ÄØ40].

**Model Architecture**  
- ClimateGPT is built as an **auto‚Äëregressive language model** that employs an **optimized transformer architecture**‚ÄØ[Document‚ÄØ1, page‚ÄØ40].  

These details describe both the training pipeline and the structural design of the ClimateGPT models.


# Model with combined retriever

In [39]:
import json
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.documents import Document

# ------------------------------------------------------------------
# 1. LOAD COLLECTION REGISTRY (Step 1)
# ------------------------------------------------------------------
with open("db_description.json", "r") as f:
    COLLECTION_REGISTRY = json.load(f)

def format_registry():
    """Convert collection registry JSON into readable text for the LLM."""
    blocks = []
    for col in COLLECTION_REGISTRY["collections"]:
        blocks.append(
            f"""
Collection: {col['name']}
Type: {col['type']}
Domain: {col['domain']}
Granularity: {col['granularity']}
Strengths: {", ".join(col['strengths'])}
Weaknesses: {", ".join(col['weaknesses'])}
"""
        )
    return "\n".join(blocks)


# ------------------------------------------------------------------
# 2. MULTI-COLLECTION RETRIEVER (Step 2)
# ------------------------------------------------------------------
# One Milvus vector store per collection
research_store = Milvus(
    embedding_function=embeddings,
    connection_args={"uri": "./climate_news.db"},
    collection_name="research_papers",
    text_field="text",
)

news_store = Milvus(
    embedding_function=embeddings,
    connection_args={"uri": "./climate_news.db"},
    collection_name="climate_articles",
    text_field="text",
)

research_retriever = research_store.as_retriever(search_kwargs={"k": 3})
news_retriever = news_store.as_retriever(search_kwargs={"k": 3})


def retrieve_docs(query: str):
    """
    Retrieve from BOTH collections.
    Later we‚Äôll make this LLM-controlled.
    """
    docs = []
    docs.extend(research_retriever.invoke(query))
    docs.extend(news_retriever.invoke(query))
    return docs


def format_docs(docs):
    blocks = []
    for d in docs:
        source = d.metadata.get("source", "unknown")
        blocks.append(f"[SOURCE: {source}]\n{d.page_content}")
    return "\n\n".join(blocks)


# Wrap functions as Runnables ‚úÖ
retrieve_docs_runnable = RunnableLambda(retrieve_docs)
format_docs_runnable = RunnableLambda(format_docs)
registry_runnable = RunnableLambda(lambda _: format_registry())

In [40]:
import json
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# ------------------------------------------------------------------
# COLLECTION ROUTER PROMPT
# ------------------------------------------------------------------
router_template = """You are a routing assistant.

You are given a list of available data collections.
Your job is to decide which collections (if any) should be queried
to answer the user question.

Rules:
- Only select collections that are clearly relevant.
- If none are relevant, return "none".
- Return ONLY valid JSON.
- Do NOT explain your reasoning.

Available collections:
{registry}

User question:
{question}

Respond in this exact JSON format:
{{
  "collections": ["collection_name_1", "collection_name_2"]
}}

OR

{{
  "collections": "none"
}}
"""

router_prompt = PromptTemplate.from_template(router_template)

router_chain = (
    {
        "registry": registry_runnable,
        "question": RunnablePassthrough()
    }
    | router_prompt
    | llm
    | StrOutputParser()
)


def select_collections(query: str):
    """Stage 1: Decide which collections to use."""
    raw = router_chain.invoke(query)
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Hard fail-safe
        return {"collections": "none"}


In [55]:
COLLECTION_VECTORSTORES = {
    "research_papers": research_store,
    "climate_articles": news_store,
}

def retrieve_from_collections(query: str, collections):
    """Retrieve documents from selected collections using direct vector search."""
    docs = []

    # Embed the query once using the real embedding model
    query_vector = query_embeddings.embed_query(query)

    for name in collections:
        store = COLLECTION_VECTORSTORES.get(name)
        if store:
            # Use similarity_search_by_vector instead of similarity_search
            # to avoid re-embedding with FakeEmbeddings
            docs.extend(store.similarity_search_by_vector(embedding=query_vector, k=3))

    return docs

In [None]:
final_template = """You are a careful climate research assistant.

Use ONLY the provided context to answer the question.
If the context does not contain the answer, say so clearly.

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

final_prompt = PromptTemplate.from_template(final_template)

final_chain = (
    {
        "context": RunnablePassthrough(),
        "question": RunnablePassthrough()
    }
    | final_prompt
    | llm
    | StrOutputParser()
)


In [49]:
def answer_query(query: str):
    """
    Full pipeline:
    1. Decide collections
    2. Retrieve context if needed
    3. Answer with or without RAG
    """

    routing = select_collections(query)
    collections = routing.get("collections")

    # --------------------------------------------------
    # CASE 1: No retrieval needed
    # --------------------------------------------------
    if collections == "none":
        return llm.invoke(
            f"The following question is not answered using the provided datasets. "
            f"Answer from general knowledge and say so explicitly if question is technical.\n\nQuestion: {query}"
        )
    
    if isinstance(collections, str):
        collections = [collections]

    # --------------------------------------------------
    # CASE 2: Targeted RAG
    # --------------------------------------------------
    docs = retrieve_from_collections(query, collections)
    context = format_docs(docs)

    if not context.strip():
        return "The selected datasets do not contain relevant information."

    return final_chain.invoke(
        {
            "context": context,
            "question": query
        }
    )


In [56]:
print(answer_query("How is ClimateGPT model trained, and what is the model architecture like?"))
print("\n\n\n")
print(answer_query("Tell me any news about tiny snails on Atlantic Island"))


**Training**  
- ClimateGPT was pre‚Äëtrained on a large corpus using a cluster provided by **MLFoundry**.  
- The pre‚Äëtraining was carried out with a fork of the **Megatron‚ÄëLLM** repository (EPFL LLM Team).  
- After the base pre‚Äëtraining, the models underwent **instruction fine‚Äëtuning (IFT)** to align them with the expected output format.

**Model Architecture**  
- ClimateGPT is an **auto‚Äëregressive language model** that employs an **optimized transformer architecture**.  
- It comes in several sizes (7‚ÄØB, 13‚ÄØB, and 70‚ÄØB parameters), with two distinct 7‚ÄØB variants trained from scratch.  
- The model accepts **text‚Äëonly input** and produces **text‚Äëonly output**.  

These details are drawn directly from the provided research paper excerpts.





The news reports that more than‚ÄØ1,300 tiny, critically‚Äëendangered snails have been released onto a wild refuge on Bugio‚ÄØ‚Äî‚ÄØa small island in the Madeira archipelago off the coast of Morocco. The snails belong to