# Pip

In [None]:
# 1. Force upgrade the critical libraries
%pip install -U langchain langchain-core langchain-openai langchain-community pydantic

# 2. IMPORTANT: You must restart the kernel after running this!
# In VS Code/Jupyter: Click "Restart" or "Restart Kernel" in the top toolbar.

In [None]:
%pip install sentence-transformers gensim datasets

In [None]:
%pip install --upgrade --force-reinstall datasets sentence-transformers

In [None]:
%pip install --upgrade --force-reinstall gensim numpy

In [None]:
%pip install -U sentence-transformers transformers flash-attn

In [None]:
# Install the missing local server engine
%pip install "pymilvus[milvus_lite]"

# CRITICAL: Restart your kernel again after this!

In [None]:
%pip install pymilvus

In [None]:
# 1. Remove the libraries causing the binary conflict
# (These are optional speed-boosters for Pandas, not required for functionality)
%pip uninstall -y bottleneck numexpr

# 2. Force install a compatible version of Pandas and PyArrow
# This ensures your Pandas matches your current NumPy version
%pip install --upgrade pandas pyarrow numpy>=2.0

# 3. CRITICAL: Restart your kernel now!
# Click "Kernel" -> "Restart Kernel" in the menu.

In [None]:
# 1. Downgrade NumPy to the 1.x version (most compatible)
%pip install "numpy<2.0"

# 2. You MUST restart your kernel after this!
# In VS Code/Jupyter: Click "Restart" or "Restart Kernel" in the top toolbar.

In [None]:
%pip install langchain-milvus langchain-community

# Simple scraping agent

In [None]:
import os
import requests
import pandas as pd
from io import StringIO
from pydantic import BaseModel, Field
from langchain_classic.agents import AgentExecutor, create_react_agent
from langchain_openai import ChatOpenAI
from langchain_core.tools import tool
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate

In [None]:
# Chat model served by a local OpenAI-compatible endpoint on 127.0.0.1:1234.
# The scraping agent built in this cell uses this `llm` instance.
llm = ChatOpenAI(
    base_url="http://127.0.0.1:1234/v1",
    api_key="lm-studio",  # placeholder value; presumably not validated by the local server — verify
    model="local-model",
    temperature=0,
    streaming=True
)

# Define the Tool
# NOTE: @tool uses the function's docstring as the tool description shown to
# the model, so the docstring text below is part of the agent's prompt and is
# kept verbatim.
@tool
def fetch_csv_dataset(url: str) -> str:
    """
    Downloads a CSV dataset from a URL and returns a summary.
    Input should be the full URL string.
    """
    # Every failure is reported back as an "ERROR: ..." string instead of being
    # raised, so the agent receives the problem as an observation.
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        # Parse CSV. "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but
        # also strips a leading byte-order mark, which would otherwise end up
        # glued to the first column name as "\ufeff".
        content = response.content.decode('utf-8-sig')
        df = pd.read_csv(StringIO(content), on_bad_lines='skip')

        return (
            f"SUCCESS: Downloaded data from {url}\n"
            f"Shape: {df.shape}\n"
            f"Columns: {list(df.columns)}\n"
            f"First 5 rows:\n{df.head().to_string()}"
        )
    except Exception as e:
        return f"ERROR: {str(e)}"

# Tools the agent is allowed to call.
tools = [fetch_csv_dataset]

# Define the ReAct Prompt (Hardcoded for stability)
# This teaches the model explicitly how to think and act.
# Placeholders used below: {tools}, {tool_names}, {input}, {agent_scratchpad}.
# The literal is runtime text sent to the model; do not reformat it.
template = '''Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought:{agent_scratchpad}'''

prompt = PromptTemplate.from_template(template)

# Create the ReAct Agent
# This uses simple text generation, avoiding the Pydantic/Tool Binding error completely.
agent = create_react_agent(llm, tools, prompt)

# Create the Executor that runs the agent together with its tools.
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True,
    handle_parsing_errors=True # IMPORTANT for local models
)

print("‚úÖ ReAct Agent built successfully.")

In [None]:
# Smoke test: ask the agent to download a public CSV and report its columns.
test_url = "https://raw.githubusercontent.com/gramener/datasets/refs/heads/main/card_transactions.csv"
query = f"Download the dataset from {test_url} and tell me the columns."

# The executor is invoked with the question under the "input" key and the
# final answer is read back from the "output" key of the result.
response = agent_executor.invoke({"input": query})
print("\n--- FINAL ANSWER ---")
print(response['output'])

# Vector embedding of 2022-2024 news

### Load and filter data

In [None]:
import pandas as pd
# Load the Guardian climate-news corpus from the local dataset folder.
df = pd.read_csv("dataset/guardian_climate_news_corpus.csv")

# Parse dates as UTC and then drop the timezone; unparseable values become NaT.
df['date'] = pd.to_datetime(df['date'], errors="coerce", utc=True).dt.tz_convert(None)

# Keep articles dated 2022 or later (NaT rows fail the comparison and drop out)
# whose label is anything other than UNRELATED_TO_CLIMATE.
# NOTE(review): the section title says 2022-2024 but no upper bound is applied
# to the year here — confirm this is intended.
is_recent = df['date'].dt.year >= 2022
is_climate = df['label'] != 'UNRELATED_TO_CLIMATE'

# Renumber the surviving rows 0..n-1 so positions and index labels coincide.
df = df.loc[is_recent & is_climate].reset_index(drop=True)
df

In [None]:
# Number of remaining articles per label.
df["label"].value_counts()

In [None]:
# Column dtypes of the filtered frame.
df.dtypes

### Making vector embeddings

In [None]:
import pandas as pd
import numpy as np
import ast
from langchain_openai import OpenAIEmbeddings

# 1. SETUP: Load your data
# ------------------------------------------------------------------
# This cell does not load anything itself: it reuses the `df` already defined
# by an earlier cell. Uncomment the next line to load a file instead.
# df = pd.read_csv("your_data.csv") # Uncomment to load your real file
# Ensure date is datetime (the serializer below calls .strftime on it)
df['date'] = pd.to_datetime(df['date'])

# 2. CREATE UNIFIED TEXT REPRESENTATION
# ------------------------------------------------------------------
# Instead of training a separate Word2Vec model, the metadata is formatted
# into a structured string that the embedding model reads together with the
# article text. This technique is often called "Text Serialization".

def serialize_row_for_embedding(row):
    """Build the single text block that gets embedded for one article.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            keys 'tags', 'category', 'date', 'title' and 'body'.

    Returns:
        A string of the form
        "Category: <c>. Tags: <t1, t2>. Date: <YYYY-MM-DD>. Title: <title>"
        followed by a newline and "Content: <body>". Tags fall back to "None"
        when they are missing or cannot be parsed; the date falls back to
        "Unknown" when it is null.
    """
    # Tags may arrive as a Python-literal string (e.g. "['a', 'b']") or as an
    # already-parsed sequence.
    try:
        tags = row['tags']
        if isinstance(tags, str):
            tags = ast.literal_eval(tags)
        if isinstance(tags, str):
            # A lone string literal is a single tag; joining it directly would
            # split it into individual characters.
            tags = [tags]
        # str() keeps non-string tags instead of discarding the whole list.
        tags_str = ", ".join(str(tag) for tag in tags)
    except (KeyError, TypeError, ValueError, SyntaxError, MemoryError, RecursionError):
        # Missing column, non-iterable value (NaN/None) or a malformed literal:
        # these are the errors row lookup, iteration and ast.literal_eval raise.
        tags_str = "None"

    # A null date (NaT/None) has no usable strftime; use a placeholder rather
    # than aborting the whole serialization.
    date_value = row['date']
    date_str = date_value.strftime('%Y-%m-%d') if pd.notnull(date_value) else "Unknown"

    # Create a rich text block that describes the entire data point.
    # The metadata (Category, Tags, Date) is placed ahead of the title and body.
    combined_text = (
        f"Category: {row['category']}. "
        f"Tags: {tags_str}. "
        f"Date: {date_str}. "
        f"Title: {row['title']}\n"
        f"Content: {row['body']}"
    )
    return combined_text

# Serialize every article into the text that will be embedded.
df['serialized_text'] = df.apply(serialize_row_for_embedding, axis=1)

# 3. EMBED WITH THE LOCAL EMBEDDING MODEL (via OpenAI Compatible API)
# ------------------------------------------------------------------
# The model is expected to be served by LM Studio / Ollama on port 1234.
# Check your local server settings for the exact URL and model name.
embedding_model = OpenAIEmbeddings(
    model="Qwen3-Embedding-4B-GGUF",      # name of the model loaded in the local server
    base_url="http://127.0.0.1:1234/v1",  # local OpenAI-compatible endpoint
    api_key="lm-studio",                  # arbitrary key
    check_embedding_ctx_length=False,     # skip the client-side token-length check/splitting
)

print("Starting embedding process... (This may take time depending on GPU)")

# Texts are sent in fixed-size batches to keep each request small.
batch_size = 32
all_embeddings = []
serialized_texts = df['serialized_text'].tolist()
total_rows = len(serialized_texts)

for start in range(0, total_rows, batch_size):
    stop = min(start + batch_size, total_rows)

    # embed_documents returns one vector (a list of floats) per input text.
    all_embeddings.extend(embedding_model.embed_documents(serialized_texts[start:stop]))

    print(f"Processed rows {start} to {stop}")

# 4. STORE RESULTS
# ------------------------------------------------------------------
# One row per article, ready for classifiers or a vector DB.
final_features = np.array(all_embeddings)

print(f"Final Feature Matrix Shape: {final_features.shape}")

# Optional: keep each article's vector next to its metadata.
df['embedding_vector'] = list(final_features)

In [None]:
import numpy as np

# 1. Save the DataFrame (Contains text, metadata, and vectors)
# Pickle is used instead of CSV because it preserves the list/array column.
# NOTE(review): pickles are tied to the library versions that wrote them and
# must only be loaded from trusted sources.
df.to_pickle("climate_news_data.pkl")

# 2. Save the Raw Numpy Array (Just in case)
# Stores only the embedding matrix, independent of the DataFrame.
np.save("climate_vectors.npy", final_features)

print("Saved 'climate_news_data.pkl' and 'climate_vectors.npy' to disk.")

### Storing embeddings with Milvus

In [None]:
import pandas as pd
import numpy as np

print("üöÄ Attempting to rescue data...")

# 1. Load the pickled DataFrame written by the save cell.
# NOTE(review): reading it back depends on a NumPy version compatible with the
# one that wrote it. The install cells pin both "numpy>=2.0" and "numpy<2.0"
# at different points — confirm which one is active in this kernel.
df = pd.read_pickle("climate_news_data.pkl")
print(f"‚úÖ Pickle loaded successfully. Shape: {df.shape}")

# 2. Re-save as Parquet without the vector column (the vectors live in the
# .npy file). errors='ignore' makes the drop a no-op when the column is absent.
df = df.drop(columns=['embedding_vector'], errors='ignore')
df.to_parquet("climate_news_data.parquet")
print("‚úÖ SUCCESS: Data saved to 'climate_news_data.parquet'")

# 3. Confirm the vector file can still be read and report its shape.
vectors = np.load("climate_vectors.npy")
print(f"‚úÖ SUCCESS: Vectors verified. Shape: {vectors.shape}")

In [None]:
import pandas as pd
import numpy as np
from pymilvus import MilvusClient, DataType

# 1. RELOAD YOUR SAVED DATA
# ------------------------------------------------------------------
# Metadata comes from the Parquet file and vectors from the .npy file; the two
# are matched row by row further down, so they must describe the same articles
# in the same order.
print("üîÑ Reloading rescued data...")
df = pd.read_parquet("climate_news_data.parquet")
final_features = np.load("climate_vectors.npy")
print(f"‚úÖ Data Loaded. Articles: {len(df)} | Vector Dim: {final_features.shape[1]}")

# 2. SETUP MILVUS LITE
# ------------------------------------------------------------------
# A local file path as the URI selects the file-backed Milvus Lite database.
client = MilvusClient("./climate_news.db")
COLLECTION_NAME = "climate_articles"

# 3. DEFINE THE SCHEMA
# ------------------------------------------------------------------
# Start from a clean slate: an existing collection with this name is dropped
# (its contents are lost) before the new one is defined.
if client.has_collection(COLLECTION_NAME):
    client.drop_collection(COLLECTION_NAME)

# auto_id: Milvus generates the primary key. Dynamic fields allow inserted
# rows to carry keys that are not declared below.
schema = client.create_schema(auto_id=True, enable_dynamic_field=True)

# Add Fields
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
# The vector width is read from the loaded matrix instead of being hard-coded,
# so the schema stays valid if a different embedding model produced the vectors.
schema.add_field(
    field_name="vector",
    datatype=DataType.FLOAT_VECTOR,
    dim=int(final_features.shape[1]),
)
schema.add_field(field_name="category", datatype=DataType.VARCHAR, max_length=512)
schema.add_field(field_name="date", datatype=DataType.VARCHAR, max_length=50)
# NOTE(review): article bodies longer than max_length would presumably be
# rejected on insert — confirm against the longest body in the corpus.
schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=65535)

# 4. DEFINE INDEX
# ------------------------------------------------------------------
# Index the "vector" field with cosine similarity; AUTOINDEX leaves the choice
# of index structure to Milvus.
index_params = client.prepare_index_params()
index_params.add_index(
    field_name="vector",
    index_type="AUTOINDEX",
    metric_type="COSINE"
)

# 5. CREATE COLLECTION
# ------------------------------------------------------------------
# Schema and index parameters are supplied together at creation time.
client.create_collection(
    collection_name=COLLECTION_NAME,
    schema=schema,
    index_params=index_params
)
print(f"‚úÖ Collection '{COLLECTION_NAME}' created.")

# 6. INSERT DATA
# ------------------------------------------------------------------
data_to_insert = []
print("Preparing data for insertion...")

# Articles and vectors are matched purely by position, so a length mismatch
# would attach embeddings to the wrong articles. Fail early instead.
if len(df) != len(final_features):
    raise ValueError(
        f"Row/vector count mismatch: {len(df)} articles vs {len(final_features)} vectors"
    )

# Pair by position rather than by index label: labels only equal positions
# while the DataFrame index happens to be 0..n-1.
for (_, row), vector in zip(df.iterrows(), final_features):
    # Missing dates are stored as an empty string.
    date_str = row['date'].strftime('%Y-%m-%d') if pd.notnull(row['date']) else ""

    # "title" and "tags" are not declared in the schema, so they rely on the
    # collection having dynamic fields enabled.
    entry = {
        "vector": vector.tolist(),
        "text": str(row['body']),
        "title": str(row['title']),
        "category": str(row['category']),
        "date": date_str,
        "tags": str(row['tags']),
    }
    data_to_insert.append(entry)

# Insert in batches to keep each request small.
batch_size = 100
total_inserted = 0

for i in range(0, len(data_to_insert), batch_size):
    batch = data_to_insert[i:i+batch_size]
    res = client.insert(collection_name=COLLECTION_NAME, data=batch)
    # The insert result reports how many rows were written.
    total_inserted += res['insert_count']
    print(f"Inserted batch {i} to {i+len(batch)}...")

print(f"‚úÖ SUCCESS! Stored {total_inserted} articles in 'climate_news.db'")

# RAG Framework

In [None]:
import os
from langchain_milvus import Milvus
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# --- 1. MODELS ---

# A. The Embedding Model (Must be the exact same as the one used to create embeddings)
# Queries are embedded with this model and compared against the stored vectors.
embeddings = OpenAIEmbeddings(
    base_url="http://127.0.0.1:1234/v1",
    api_key="lm-studio",
    model="Qwen3-Embedding-4B-GGUF",
    check_embedding_ctx_length=False
)

# B. The LLM that writes the answer.
# NOTE: this rebinds the name `llm` used earlier by the scraping agent cell.
llm = ChatOpenAI(
    base_url="http://127.0.0.1:1234/v1",
    api_key="lm-studio",
    model="mistralai/ministral-3-14b-reasoning",
    temperature=0.1
)

# --- 2. CONNECT TO YOUR EXISTING DATABASE ---

# Connect to the "climate_news.db" file
vector_store = Milvus(
    embedding_function=embeddings,
    connection_args={"uri": "./climate_news.db"}, # Connects to Milvus Lite file
    collection_name="climate_articles",           # Must match the name you used
    text_field="text",                            # Tell LangChain which column contains the readable content
    auto_id=True
)

# Create the Retriever
# k is the number of articles fetched per question; adjust it to what the
# model's context window can hold.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# PROMPT TEMPLATE

# Tune the LLM responses
# {context} receives the retrieved article text and {question} the user's
# query. The literal is runtime text sent to the model; do not reformat it.
# NOTE: this rebinds `template` and `prompt` from the scraping agent cell.
template = """You are a specialized Climate News Assistant.
Use the following pieces of retrieved context to answer the question.
If the answer is not in the context, say "I cannot find this information in the local news database."

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

prompt = PromptTemplate.from_template(template)

# BUILDING THE CHAIN

def format_docs(docs):
    """Return the page_content of every document, separated by blank lines."""
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)

# The RAG Chain:
# 1. Take question -> 2. Retrieve docs -> 3. Format them -> 4. Pass to Prompt -> 5. Pass to LLM
# The dict feeds the same input to both branches: "context" runs the retriever
# and joins the results, "question" passes the input through unchanged.
# StrOutputParser turns the model's reply into a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Test the system
# The chain takes the raw question string as its input.
query = "Tell me any news about tiny snails on Atlantic Island"
response = rag_chain.invoke(query)

In [14]:
# Show the question followed by the answer produced by the RAG chain.
print(f"‚ùì Question: {query}\n")
print("ü§ñ Agent Answer:")
print(response)

‚ùì Question: Tell me any news about tiny snails on Atlantic Island

ü§ñ Agent Answer:
More than 1,300 critically endangered Desertas Island land snails have been released onto Bugio island, part of the Madeira archipelago in the Atlantic Ocean. These tiny snails had not been recorded for over 100 years and were believed to be extinct until their rediscovery in small populations during conservation expeditions between 2012 and 2017.

A breeding program involving zoos in the UK and France, including Chester zoo, successfully boosted their numbers. The snails were then released into a protected refuge on Bugio island, which has been off-limits to humans since 1990 to protect its fragile ecosystem from invasive predators like rats and mice.

Each reintroduced snail was individually marked for monitoring, and if successful, many more will be released in the future to further bolster their population. This conservation effort is significant because these snails are found only on the Desert