# Pip

In [None]:
# 1. Force upgrade the critical libraries
%pip install -U langchain langchain-core langchain-openai langchain-community pydantic

# 2. IMPORTANT: You must restart the kernel after running this!
# In VS Code/Jupyter: Click "Restart" or "Restart Kernel" in the top toolbar.

In [None]:
%pip install sentence-transformers gensim datasets

In [None]:
%pip install --upgrade --force-reinstall datasets sentence-transformers

In [None]:
%pip install --upgrade --force-reinstall gensim numpy

In [None]:
%pip install -U sentence-transformers transformers flash-attn

In [None]:
%pip install pymilvus

In [2]:
# 1. Downgrade NumPy to the 1.x version (most compatible)
%pip install "numpy<2.0"

# 2. You MUST restart your kernel after this!
# In VS Code/Jupyter: Click "Restart" or "Restart Kernel" in the top toolbar.

[0mCollecting numpy<2.0
Collecting numpy<2.0
  Downloading numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (61 kB)
Downloading numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl (13.7 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/13.7 MB[0m [31m?[0m eta [36m-:--:--[0m  Downloading numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (61 kB)
Downloading numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl (13.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.7/13.7 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.7/13.7 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: numpy
  Attempting uninstall: numpy
[0m    Found existing installation: numpy 2.3.5
Installing collected packages: numpy
  Attempting uninstall: numpy
[0m    Found existing installation: numpy 2.3.5
    Uninstalling numpy-2.3.5:
      Successfully uninstal

# Simple scraping agent

In [None]:
import os
import requests
import pandas as pd
from io import StringIO
from pydantic import BaseModel, Field
from langchain_classic.agents import AgentExecutor, create_react_agent
from langchain_openai import ChatOpenAI
from langchain_core.tools import tool
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate

In [None]:
# Connected to LLM running locally
llm = ChatOpenAI(
    base_url="http://127.0.0.1:1234/v1",
    api_key="lm-studio",
    model="local-model",
    temperature=0,
    streaming=True
)

# Define the Tool
@tool
def fetch_csv_dataset(url: str) -> str:
    """
    Downloads a CSV dataset from a URL and returns a summary.
    Input should be the full URL string.
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        
        # Parse CSV
        content = response.content.decode('utf-8')
        df = pd.read_csv(StringIO(content), on_bad_lines='skip')
        
        return (
            f"SUCCESS: Downloaded data from {url}\n"
            f"Shape: {df.shape}\n"
            f"Columns: {list(df.columns)}\n"
            f"First 5 rows:\n{df.head().to_string()}"
        )
    except Exception as e:
        return f"ERROR: {str(e)}"

tools = [fetch_csv_dataset]

# Define the ReAct Prompt (Hardcoded for stability)
# This teaches the model explicitly how to think and act.
template = '''Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought:{agent_scratchpad}'''

prompt = PromptTemplate.from_template(template)

# 4. Create the ReAct Agent
# This uses simple text generation, avoiding the Pydantic/Tool Binding error completely.
agent = create_react_agent(llm, tools, prompt)

# 5. Create the Executor
agent_executor = AgentExecutor(
    agent=agent, 
    tools=tools, 
    verbose=True, 
    handle_parsing_errors=True # IMPORTANT for local models
)

print("✅ ReAct Agent built successfully.")

In [None]:
test_url = "https://raw.githubusercontent.com/gramener/datasets/refs/heads/main/card_transactions.csv"
query = f"Download the dataset from {test_url} and tell me the columns."

response = agent_executor.invoke({"input": query})
print("\n--- FINAL ANSWER ---")
print(response['output'])

# Vector embedding of 2022-2024 news

### Load and filter data

In [None]:
import pandas as pd
df = pd.read_csv("dataset/guardian_climate_news_corpus.csv")

df['date'] = pd.to_datetime(df['date'], errors="coerce", utc=True).dt.tz_convert(None)
df = df[df['date'].dt.year >= 2022].copy()

df = df[df['label'] != 'UNRELATED_TO_CLIMATE'].copy()

df.reset_index(drop=True, inplace=True)
df

In [None]:
df["label"].value_counts()

In [None]:
df.dtypes

### Making vector embeddings

In [None]:
import pandas as pd
import numpy as np
import ast
from langchain_openai import OpenAIEmbeddings

# 1. SETUP: Load your data
# ------------------------------------------------------------------
# df = pd.read_csv("your_data.csv") # Uncomment to load your real file
# Ensure date is datetime
df['date'] = pd.to_datetime(df['date'])

# 2. CREATE UNIFIED TEXT REPRESENTATION
# ------------------------------------------------------------------
# Instead of training a separate Word2Vec model, we will format the metadata 
# into a structured string that the 8B model can "read" and understand semantically.
# This technique is often called "Text Serialization".

def serialize_row_for_embedding(row):
    # Parse tags safely
    try:
        tags = ast.literal_eval(row['tags']) if isinstance(row['tags'], str) else row['tags']
        tags_str = ", ".join(tags)
    except:
        tags_str = "None"
        
    # Create a rich text block that describes the entire data point
    # We put the most important semantic info (Category, Tags, Date) at the start or end.
    combined_text = (
        f"Category: {row['category']}. "
        f"Tags: {tags_str}. "
        f"Date: {row['date'].strftime('%Y-%m-%d')}. "
        f"Title: {row['title']}\n"
        f"Content: {row['body']}"
    )
    return combined_text

# Apply the function
df['serialized_text'] = df.apply(serialize_row_for_embedding, axis=1)

# 3. EMBED WITH LOCAL LLAMA MODEL (via OpenAI Compatible API)
# ------------------------------------------------------------------
# Assuming you are running the model in LM Studio / Ollama on port 1234
# Check your local server settings for the exact URL.

embedding_model = OpenAIEmbeddings(
    base_url="http://127.0.0.1:1234/v1", # Point to your local server
    api_key="lm-studio",                 # Arbitrary key
    model="Qwen3-Embedding-4B-GGUF",     # The specific model name loaded in your server
    check_embedding_ctx_length=False     # Important for long texts
)

print("Starting embedding process... (This may take time depending on GPU)")

# We process in batches to be safe with memory
batch_size = 32
all_embeddings = []

for i in range(0, len(df), batch_size):
    batch_texts = df['serialized_text'].iloc[i:i+batch_size].tolist()
    
    # Generate embeddings for the batch
    # embed_documents returns a list of lists (vectors)
    batch_embeddings = embedding_model.embed_documents(batch_texts)
    all_embeddings.extend(batch_embeddings)
    
    print(f"Processed rows {i} to {min(i+batch_size, len(df))}")

# 4. STORE RESULTS
# ------------------------------------------------------------------
# Convert to numpy array for use in classifiers or Vector DB
final_features = np.array(all_embeddings)

print(f"Final Feature Matrix Shape: {final_features.shape}")

# Optional: Add back to DataFrame
df['embedding_vector'] = list(final_features)

In [None]:
import numpy as np

# 1. Save the DataFrame (Contains text, metadata, and vectors)
# Pickle is better than CSV because it preserves lists/arrays perfectly.
df.to_pickle("climate_news_data.pkl")

# 2. Save the Raw Numpy Array (Just in case)
# This is the safest way to store the pure mathematical vectors.
np.save("climate_vectors.npy", final_features)

print("Saved 'climate_news_data.pkl' and 'climate_vectors.npy' to disk.")

### Storing embeddings with Milvus

In [1]:
import pandas as pd
import numpy as np

# 1. Load the Data
df = pd.read_pickle("climate_news_data.pkl")

# 2. Load the Vectors
final_features = np.load("climate_vectors.npy")

print(f"✅ Loaded Data. Shape: {df.shape}")
print(f"✅ Loaded Vectors. Shape: {final_features.shape}")

# Now you can proceed directly to the Milvus code!

ModuleNotFoundError: No module named 'numpy._core.numeric'

In [None]:
from pymilvus import MilvusClient, DataType
import numpy as np

# 1. SETUP MILVUS LITE
# ------------------------------------------------------------------
# This creates a local file named "climate_news.db" in your current folder.
# No server needed!
client = MilvusClient("./climate_news.db")

COLLECTION_NAME = "climate_articles"

# 2. DEFINE THE SCHEMA
# ------------------------------------------------------------------
# We need to tell Milvus exactly what our data looks like.
if client.has_collection(COLLECTION_NAME):
    client.drop_collection(COLLECTION_NAME) # Reset if running multiple times

# Create schema
schema = client.create_schema(
    auto_id=True, # Milvus will create a unique ID for each article
    enable_dynamic_field=True # Allows storing extra columns without strict definitions
)

# Add the Primary Key
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)

# Add the Vector Field (CRITICAL: Dim must match your Qwen output: 2560)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=2560)

# Add Metadata Fields (Optimized for filtering)
schema.add_field(field_name="category", datatype=DataType.VARCHAR, max_length=512)
schema.add_field(field_name="date", datatype=DataType.VARCHAR, max_length=50) # Storing as string YYYY-MM-DD is easier for basic filtering
schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=65535) # The actual article content for RAG

# 3. DEFINE INDEX (The "Search Engine")
# ------------------------------------------------------------------
index_params = client.prepare_index_params()

index_params.add_index(
    field_name="vector", 
    index_type="AUTOINDEX", # Milvus Lite optimizes this automatically
    metric_type="COSINE"    # Best for semantic similarity
)

# 4. CREATE THE COLLECTION
# ------------------------------------------------------------------
client.create_collection(
    collection_name=COLLECTION_NAME,
    schema=schema,
    index_params=index_params
)

print(f"Collection '{COLLECTION_NAME}' created successfully.")

# 5. PREPARE DATA FOR INSERTION
# ------------------------------------------------------------------
# We need to convert your DataFrame + Numpy Array into a list of dictionaries
data_to_insert = []

print("Preparing data for insertion...")

for idx, row in df.iterrows():
    # Convert numpy vector to standard list for JSON serialization
    vector_list = final_features[idx].tolist()
    
    # Format date safely
    date_str = row['date'].strftime('%Y-%m-%d') if pd.notnull(row['date']) else ""
    
    entry = {
        "vector": vector_list,
        "text": str(row['body']),      # The main content the Agent will read
        "title": str(row['title']),    # Useful context
        "category": str(row['category']), # For filtering
        "date": date_str,              # For filtering
        "tags": str(row['tags'])       # Storing tags as string for simple retrieval
    }
    data_to_insert.append(entry)

# 6. INSERT DATA
# ------------------------------------------------------------------
# Insert in batches to be safe with memory
batch_size = 100
total_inserted = 0

for i in range(0, len(data_to_insert), batch_size):
    batch = data_to_insert[i:i+batch_size]
    res = client.insert(
        collection_name=COLLECTION_NAME,
        data=batch
    )
    total_inserted += res['insert_count']
    print(f"Inserted batch {i} to {i+len(batch)}...")

print(f"✅ DONE! Successfully stored {total_inserted} articles in 'climate_news.db'")