In [38]:
# 1. Force upgrade the critical libraries
%pip install -U langchain langchain-core langchain-openai langchain-community pydantic

# 2. IMPORTANT: You must restart the kernel after running this!
# In VS Code/Jupyter: Click "Restart" or "Restart Kernel" in the top toolbar.

Collecting langchain-core
  Downloading langchain_core-1.2.0-py3-none-any.whl.metadata (3.7 kB)
Collecting langchain-core
  Downloading langchain_core-1.2.0-py3-none-any.whl.metadata (3.7 kB)
Collecting langchain-openai
  Downloading langchain_openai-1.1.3-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-openai
  Downloading langchain_openai-1.1.3-py3-none-any.whl.metadata (2.6 kB)
Downloading langchain_core-1.2.0-py3-none-any.whl (475 kB)
Downloading langchain_core-1.2.0-py3-none-any.whl (475 kB)
Downloading langchain_openai-1.1.3-py3-none-any.whl (84 kB)
Downloading langchain_openai-1.1.3-py3-none-any.whl (84 kB)
Installing collected packages: langchain-core, langchain-openai
  Attempting uninstall: langchain-core
    Found existing installation: langchain-core 1.1.3
    Uninstalling langchain-core-1.1.3:
      Successfully uninstalled langchain-core-1.1.3
Installing collected packages: langchain-core, langchain-openai
  Attempting uninstall: langchain-core
    Found existing 

In [83]:
# Provides SentenceTransformer, which the embedding cell further down imports.
# NOTE(review): that cell also imports gensim (Word2Vec), which no visible cell
# installs — confirm it is already present in the kernel's environment.
%pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.2.0-py3-none-any.whl.metadata (16 kB)
  Downloading sentence_transformers-5.2.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<6.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting transformers<6.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<6.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<6.0.0,>=4.41.0->sentence-transformer

In [51]:
import os
import requests
import pandas as pd
from io import StringIO
from pydantic import BaseModel, Field
from langchain_classic.agents import AgentExecutor, create_react_agent
from langchain_openai import ChatOpenAI
from langchain_core.tools import tool
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate

# Chat model client for an OpenAI-compatible endpoint served on localhost.
# NOTE(review): the api_key looks like a placeholder rather than a real secret,
# and "local-model" presumably resolves to whatever the server has loaded — confirm.
llm = ChatOpenAI(
    model="local-model",
    base_url="http://127.0.0.1:1234/v1",
    api_key="lm-studio",
    streaming=True,
    temperature=0,
)

# Define the Tool
# NOTE: @tool uses the function docstring as the tool description shown to the
# model, so the docstring text is effectively part of the prompt.
@tool
def fetch_csv_dataset(url: str) -> str:
    """
    Downloads a CSV dataset from a URL and returns a summary.
    Input should be the full URL string.
    """
    # Returns a plain-text summary (shape, column names, first five rows) on
    # success, or a string starting with "ERROR: " on any failure. Failures are
    # returned rather than raised so the agent can read them as an observation.
    try:
        # The argument is written by the model, which may wrap it in quotes or
        # leave stray whitespace/newlines around it.
        url = url.strip().strip('"\'').strip()
        if not url.lower().startswith(('http://', 'https://')):
            return f"ERROR: expected an http(s) URL, got {url!r}"

        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        # Parse CSV. Try UTF-8 first; if the bytes are not valid UTF-8, fall
        # back to the charset reported on the response (or latin-1) so the
        # caller still gets a summary instead of a decode error.
        try:
            content = response.content.decode('utf-8')
        except UnicodeDecodeError:
            content = response.content.decode(response.encoding or 'latin-1', errors='replace')
        df = pd.read_csv(StringIO(content), on_bad_lines='skip')

        return (
            f"SUCCESS: Downloaded data from {url}\n"
            f"Shape: {df.shape}\n"
            f"Columns: {list(df.columns)}\n"
            f"First 5 rows:\n{df.head().to_string()}"
        )
    except Exception as e:
        # Deliberately broad: every failure is reported back to the agent as text.
        return f"ERROR: {str(e)}"

# Tools exposed to the agent. The same list is passed to both
# create_react_agent and AgentExecutor further down in this cell.
tools = [fetch_csv_dataset]

# Define the ReAct Prompt (Hardcoded for stability)
# This teaches the model explicitly how to think and act.
# Placeholders in the text below:
#   {input}            - supplied at invoke time via the "input" key
#   {tools}, {tool_names}, {agent_scratchpad}
#                      - presumably filled in by create_react_agent / the
#                        executor at run time; verify against the LangChain docs
# The Thought/Action/Action Input/Observation wording is the text format the
# model is instructed to follow, so treat the string as runtime data rather
# than as a comment.
template = '''Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought:{agent_scratchpad}'''

# Build the prompt object from the template text above.
prompt = PromptTemplate.from_template(template)

# Build the ReAct agent from the model, the tool list and the prompt.
# The agent works through plain text generation, which sidesteps the
# Pydantic/tool-binding error mentioned for this setup.
agent = create_react_agent(llm=llm, tools=tools, prompt=prompt)

# Wrap the agent in an executor that drives it with the same tools.
agent_executor = AgentExecutor(
    tools=tools,
    agent=agent,
    handle_parsing_errors=True,  # IMPORTANT for local models
    verbose=True,
)

print("✅ ReAct Agent built successfully.")

✅ ReAct Agent built successfully.


In [46]:
# Smoke test: ask the agent to download a public CSV and report its columns.
test_url = "https://raw.githubusercontent.com/gramener/datasets/refs/heads/main/card_transactions.csv"
query = f"Download the dataset from {test_url} and tell me the columns."

response = agent_executor.invoke({"input": query})

# One print call; sep="\n" reproduces the two separate lines of output.
print("\n--- FINAL ANSWER ---", response['output'], sep="\n")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction: fetch_csv_dataset
Action Input: "https://raw.githubusercontent.com/gramener/datasets/refs/heads/main/card_transactions.csv"[0m[32;1m[1;3mAction: fetch_csv_dataset
Action Input: "https://raw.githubusercontent.com/gramener/datasets/refs/heads/main/card_transactions.csv"[0m[36;1m[1;3mSUCCESS: Downloaded data from https://raw.githubusercontent.com/gramener/datasets/refs/heads/main/card_transactions.csv
Shape: (10000, 21)
Columns: ['ID', 'Transaction Time', 'Transaction Type', 'Channel', 'Is 3DS', 'Is Token', 'Decision', 'Decline Reason Code', 'Issuer Name', 'Issuer Country', 'Issuer Region', 'Funding source', 'Acquirer Name', 'Acquirer Country', 'Acquirer Region', 'Jurisdiction', 'Is Dispute', 'Is Fraud', 'Dispute Type', 'Amount', 'Industry Segment']
First 5 rows:
                ID          Transaction Time Transaction Type Channel Is 3DS Is Token  Decision Decline Reason Code                             Issuer Nam

# Vector embedding of 2022-2024 news

In [79]:
# Load the corpus, normalise `date` to timezone-naive datetimes (unparseable
# values become NaT), then keep rows dated 2022 or later whose label is not
# UNRELATED_TO_CLIMATE. NaT rows fail the year comparison and are dropped too.
df = (
    pd.read_csv("datasets/guardian_climate_news_corpus.csv")
    .assign(
        date=lambda frame: pd.to_datetime(
            frame['date'], errors="coerce", utc=True
        ).dt.tz_convert(None)
    )
    .loc[
        lambda frame: (frame['date'].dt.year >= 2022)
        & (frame['label'] != 'UNRELATED_TO_CLIMATE')
    ]
    .reset_index(drop=True)
)
df

  df['date'] = pd.to_datetime(df['date'], errors="coerce", utc=True).dt.tz_convert(None)


Unnamed: 0,id,title,body,tags,extracted_from_tag,category,date,use,label
0,environment/2024/dec/31/country-diary-a-hogman...,Country diary: A Hogmanay fire to see in the n...,It’s a few hundred metres from where the crab ...,"['environment/series/country-diary', 'environm...",environment/forests,BIODIVERSITY,2024-12-31,True,BIODIVERSITY
1,us-news/2024/dec/31/arbor-day-foundation-trees...,10m trees to be planted in US to replace ones ...,Some costs of the recently ended supercharged ...,"['us-news/us-news', 'environment/forests', 'us...",environment/forests,BIODIVERSITY,2024-12-31,True,BIODIVERSITY
2,environment/commentisfree/2024/dec/31/cicada-c...,A cicada: ‘What cicadas leave behind is a kind...,"Of all the languages’ words for cicada, Croati...","['environment/series/the-nature-of', 'commenti...",environment/wildlife,BIODIVERSITY,2024-12-30,True,BIODIVERSITY
3,environment/2024/dec/30/im-obsessed-with-broke...,I’m obsessed with broken shells: they are mark...,I have collected hundreds and hundreds of brok...,"['environment/series/why-i-m-obsessed-with', '...",environment/series/seascape-the-state-of-our-o...,BIODIVERSITY,2024-12-30,True,BIODIVERSITY
4,environment/2024/dec/30/2024s-most-costly-clim...,"2024’s most costly climate disasters killed 2,...",The world’s 10 most costly climate disasters o...,"['environment/climate-crisis', 'world/world', ...",us-news/hurricane-helene,EXTREME_CLIMATE_IMPACTS,2024-12-30,True,EXTREME_CLIMATE_IMPACTS
...,...,...,...,...,...,...,...,...,...
7757,environment/2022/jan/03/country-diary-on-this-...,"Country diary: On this hazy morning, the dista...","Ahead of me at the far end of the path, just w...","['environment/series/country-diary', 'uk/rural...",environment/forests,BIODIVERSITY,2022-01-03,True,BIODIVERSITY
7758,commentisfree/2022/jan/03/as-i-bum-shuffled-my...,As I bum-shuffled my way down the scree at Ava...,Nothing beats the New Zealand bush. The writer...,"['commentisfree/series/my-wild-place', 'world/...",environment/forests,BIODIVERSITY,2022-01-02,True,BIODIVERSITY
7759,culture/2022/jan/02/villagers-fight-to-keep-bb...,Villagers fight to keep BBC Victorian Farm in ...,Of the handful of historic working farms in th...,"['culture/museums', 'environment/farming', 'ed...",environment/farming,BIODIVERSITY,2022-01-02,True,BIODIVERSITY
7760,us-news/2022/jan/01/colorado-wildfires-destroy...,‘We lost everything’: Colorado wildfire destro...,Just as the flakes of the season’s first winte...,"['us-news/colorado', 'world/wildfires', 'us-ne...",world/wildfires,EXTREME_CLIMATE_IMPACTS,2022-01-01,True,EXTREME_CLIMATE_IMPACTS


In [80]:
# Number of rows per label in the filtered frame (shows the class imbalance).
df["label"].value_counts()

label
BIODIVERSITY               2575
EXTREME_CLIMATE_IMPACTS    1541
ENERGY                     1536
POLLUTION_AND_WASTE         979
EMISSIONS                   525
CLIMATE_ACTIVISM            284
CLIMATE_POLICY              269
GLOBAL_CRISIS                38
CLIMATE_DENIAL               15
Name: count, dtype: int64

In [82]:
# Column dtypes of the filtered frame, e.g. to confirm `date` is a datetime column.
df.dtypes

id                            object
title                         object
body                          object
tags                          object
extracted_from_tag            object
category                      object
date                  datetime64[ns]
use                             bool
label                         object
dtype: object

In [None]:
import numpy as np
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer

# ==========================================
# PART 1: Word2Vec on Metadata (Tags, Date, Category)
# ==========================================

# A. Preprocess: Create a "sentence" for each row
# We treat the date as a string token so Word2Vec learns its relationship to other words.
def create_metadata_sentence(row):
    """Turn one row's metadata into a list of tokens for Word2Vec.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with keys ``'date'``
            (an object exposing ``strftime``), ``'category'`` and ``'tags'``.
            ``'tags'`` may be the string form of a Python list
            (``"['a/b', 'c/d']"``), a comma-separated string, a list/tuple,
            or anything else (treated as "no tags").

    Returns:
        ``[date_token, category_token, *tag_tokens]`` where the date token is
        ``DATE_YYYY-MM-DD``, the category token is ``CAT_<category>`` and
        spaces inside any token are replaced by underscores.
    """
    import ast  # local import so the function works without touching the import cell

    # 1. Convert date to string token (e.g., "DATE_2023-01-15")
    date_token = f"DATE_{row['date'].strftime('%Y-%m-%d')}"

    # 2. Clean category (e.g., "CAT_Environment")
    cat_token = f"CAT_{str(row['category']).replace(' ', '_')}"

    # 3. Process tags. When the column comes back from CSV it holds the *text*
    #    of a list, so a plain comma split would leave brackets and quotes
    #    glued to the tokens ("['a/b'", "'c/d']"). Parse the literal instead
    #    and only fall back to comma-splitting for ordinary strings.
    raw_tags = row['tags']
    if isinstance(raw_tags, str):
        text = raw_tags.strip()
        parsed = None
        if text.startswith('[') and text.endswith(']'):
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError):
                parsed = None  # not a valid literal; use the comma split below
        if isinstance(parsed, (list, tuple)):
            candidates = [str(tag) for tag in parsed]
        else:
            candidates = raw_tags.split(',')
    elif isinstance(raw_tags, (list, tuple)):
        candidates = [str(tag) for tag in raw_tags]
    else:
        # Missing value (None / NaN) or an unexpected type: no tag tokens.
        candidates = []

    # Normalise and drop empty tokens so "" never enters the vocabulary.
    tag_tokens = [tag.strip().replace(' ', '_') for tag in candidates]
    tag_tokens = [tag for tag in tag_tokens if tag]

    # Combine all into one list of tokens
    return [date_token, cat_token] + tag_tokens

# Build one token list per row; these lists are the Word2Vec training sentences.
df['metadata_tokens'] = df.apply(create_metadata_sentence, axis=1)

# B. Train Word2Vec
# min_count=1 keeps tokens that occur only once (unique dates/tags), which
# matters for a small dataset; vector_size=50 is the metadata embedding width
# (use 100+ for very large datasets).
w2v_model = Word2Vec(
    sentences=df['metadata_tokens'],
    min_count=1,
    vector_size=50,
    window=5,
    workers=4,
)

# C. Generate the Vectors
# We average the vectors of all tokens in the row to get a single vector per row
def get_mean_w2v(tokens, model):
    """Average the vectors of the tokens that ``model`` knows.

    Tokens not present in ``model.wv`` are ignored. If none of the tokens are
    present, a zero vector of length ``model.vector_size`` is returned.
    """
    known_vectors = []
    for token in tokens:
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if known_vectors:
        return np.mean(known_vectors, axis=0)
    return np.zeros(model.vector_size)

# One averaged Word2Vec vector per row, computed from that row's metadata tokens.
df['w2v_features'] = df['metadata_tokens'].apply(lambda x: get_mean_w2v(x, w2v_model))


# ==========================================
# PART 2: BERT on Text (Title + Body)
# ==========================================

# A. Load a pre-trained model
# 'all-MiniLM-L6-v2' is a fast, high-quality model for semantic similarity
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# B. Combine Title and Body
# Title and body are embedded together as one text per row.
# Missing values are replaced with '' first: astype(str) alone would turn NaN
# into the literal text "nan", which would then be embedded as if it were content.
# NOTE(review): sentence-transformers models truncate inputs beyond their
# maximum sequence length, so long bodies are presumably only partly
# represented — confirm this is acceptable.
df['combined_text'] = (
    "Title: " + df['title'].fillna('').astype(str)
    + " \nBody: " + df['body'].fillna('').astype(str)
)

# C. Encode
# encode() is given one string per row; the result is stored row by row below
# (the final cell expects 384 values per row for this model).
print("Encoding text with BERT... this may take a moment.")
embeddings = bert_model.encode(df['combined_text'].tolist(), show_progress_bar=True)

# Store in DataFrame: one embedding vector per row
df['bert_features'] = list(embeddings)


# ==========================================
# PART 3: Final Output
# ==========================================

# Each vector column holds one 1-D array per row; stack each column into a
# 2-D matrix with one row per article.
X_w2v = np.vstack(df['w2v_features'])
X_bert = np.vstack(df['bert_features'])

# Join the two matrices side by side (column-wise) into one feature matrix.
X_final = np.concatenate((X_w2v, X_bert), axis=1)

print(f"Final Feature Matrix Shape: {X_final.shape}")
# Expected: (rows, 50 + 384) -> (rows, 434)