In [1]:
# Cell 0 – Install required libraries (run once)

%pip install requests python-dotenv pandas openai py2neo

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


### Import

In [1]:
# Cell 1 – Imports & load environment variables

import os
import json
import requests
import pandas as pd
from datetime import datetime, timedelta

from dotenv import load_dotenv
from py2neo import Graph, Node, Relationship

from openai import OpenAI

# Load keys from .env
load_dotenv()

NEWSAPI_KEY = os.getenv("NEWSAPI_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

client = OpenAI(api_key=OPENAI_API_KEY)

print("NewsAPI key loaded:", NEWSAPI_KEY is not None)
print("OpenAI key loaded:", OPENAI_API_KEY is not None)
print("Neo4j URI:", NEO4J_URI)

NewsAPI key loaded: True
OpenAI key loaded: True
Neo4j URI: bolt://localhost:7687


### Download

In [2]:
# Cell 2 – Fetch financial news using NewsAPI

def fetch_financial_news(query="Amazon stock", page_size=20, days_back=3):
    base_url = "https://newsapi.org/v2/everything"
    from_date = (datetime.utcnow() - timedelta(days=days_back)).strftime("%Y-%m-%d")
    
    params = {
        "q": query,
        "language": "en",
        "from": from_date,
        "sortBy": "relevancy",
        "pageSize": page_size,
        "apiKey": NEWSAPI_KEY,
    }
    
    resp = requests.get(base_url, params=params)
    data = resp.json()
    
    if data.get("status") != "ok":
        print("Error:", data)
        return pd.DataFrame()
    
    articles = data.get("articles", [])
    rows = []
    for i, art in enumerate(articles):
        rows.append({
            "id": i,
            "source": art["source"]["name"],
            "author": art.get("author"),
            "title": art.get("title"),
            "description": art.get("description"),
            "content": art.get("content"),
            "url": art.get("url"),
            "published_at": art.get("publishedAt"),
        })
    
    return pd.DataFrame(rows)

df_news = fetch_financial_news(query="Amazon finance", page_size=15, days_back=7)
df_news.head()

  from_date = (datetime.utcnow() - timedelta(days=days_back)).strftime("%Y-%m-%d")


Unnamed: 0,id,source,author,title,description,content,url,published_at
0,0,Yahoo Entertainment,By Brad Haynes,Exclusive-Google deal for Amazon reforestation...,"BELEM, Brazil (Reuters) -Google has struck its...","By Brad Haynes\r\nBELEM, Brazil (Reuters) -Goo...",https://finance.yahoo.com/news/exclusive-googl...,2025-11-06T10:01:59Z
1,1,Gizmodo.com,Rhett Jones,$120 Million Exploit Has Chilling Effect on En...,The incident casts a dark shadow on the trustw...,"Balancer, which is a decentralized finance (De...",https://gizmodo.com/120-million-exploit-has-ch...,2025-11-03T20:00:33Z
2,2,The Verge,Nilay Patel,Lyft CEO David Risher on paying drivers more a...,"Today, I’m talking with David Risher, who is t...",<ul><li></li><li></li><li></li></ul>\r\nRisher...,https://www.theverge.com/podcast/811532/lyft-u...,2025-11-03T15:02:30Z
3,3,BBC News,,COP30: World leaders take aim at Trump for cli...,World leaders address COP30 climate summit in ...,"Esme Stallard,Climate and science reporter, BB...",https://www.bbc.com/news/articles/cn4j8dgnj1wo,2025-11-06T18:49:14Z
4,4,New Scientist,Luke Taylor,COP30: Can Brazil summit get climate negotiati...,Expectations are low for the UN climate confer...,"A preparatory ministerial meeting in Brasilia,...",https://www.newscientist.com/article/2502430-c...,2025-11-04T14:00:40Z


### Text cleaning

In [3]:
# Cell 3 – Basic preprocessing: combine text and clean

import re

def clean_text(text):
    if text is None:
        return ""
    text = re.sub(r"\s+", " ", text)      # collapse whitespace
    text = text.replace("…", "...")
    return text.strip()

df_news["raw_text"] = (
    df_news["title"].fillna("") + ". " +
    df_news["description"].fillna("") + ". " +
    df_news["content"].fillna("")
)

df_news["clean_text"] = df_news["raw_text"].apply(clean_text)

print("Number of articles:", len(df_news))
df_news[["id", "source", "title", "clean_text"]].head()

Number of articles: 15


Unnamed: 0,id,source,title,clean_text
0,0,Yahoo Entertainment,Exclusive-Google deal for Amazon reforestation...,Exclusive-Google deal for Amazon reforestation...
1,1,Gizmodo.com,$120 Million Exploit Has Chilling Effect on En...,$120 Million Exploit Has Chilling Effect on En...
2,2,The Verge,Lyft CEO David Risher on paying drivers more a...,Lyft CEO David Risher on paying drivers more a...
3,3,BBC News,COP30: World leaders take aim at Trump for cli...,COP30: World leaders take aim at Trump for cli...
4,4,New Scientist,COP30: Can Brazil summit get climate negotiati...,COP30: Can Brazil summit get climate negotiati...


### Prompt

In [4]:
# Cell 4 – LLM helper to extract entities and relationships

SYSTEM_PROMPT = """
You are an information extraction assistant specialized in financial news.
Given a piece of text, extract key entities and relationships as triplets.

Only output valid JSON with the following format:
[
  {
    "head": "Amazon",
    "head_type": "Company",
    "relation": "acquires",
    "tail": "Whole Foods",
    "tail_type": "Company",
    "evidence": "short quote from the text"
  },
  ...
]

Entity types can include: Company, Person, Product, Market, Index, Currency, Metric, Date, Event, Other.
Relation examples: acquires, invests_in, partners_with, sues, fined_by, appoints, reports_result, impacts, located_in, etc.
"""

def extract_triples_llm(text, max_chars=2000):
    # optionally truncate long text
    text = text[:max_chars]
    
    user_prompt = f"""
Extract financial knowledge graph triplets from the following news text.

Text:
\"\"\"{text}\"\"\"

Remember: only output a JSON array of objects with keys
[head, head_type, relation, tail, tail_type, evidence].
"""

    resp = client.responses.create(
        model="gpt-4.1",  # or another model you have
        input=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt}
        ]
    )
    
    content = resp.output[0].content[0].text
    # Try to parse JSON
    try:
        triples = json.loads(content)
    except json.JSONDecodeError:
        print("JSON parse error, raw content:\n", content)
        triples = []
    return triples

### Run extraction

In [5]:
# Cell 5 – Run LLM extraction on first N articles

N = min(5, len(df_news))   # change as you like
all_triples = []

for _, row in df_news.head(N).iterrows():
    article_id = row["id"]
    text = row["clean_text"]
    print(f"\n=== Extracting from article {article_id}: {row['title']} ===")
    
    triples = extract_triples_llm(text)
    for t in triples:
        t["article_id"] = int(article_id)
        t["source"] = row["source"]
        t["published_at"] = row["published_at"]
        all_triples.append(t)

len(all_triples), all_triples[:3]


=== Extracting from article 0: Exclusive-Google deal for Amazon reforestation makes Brazilian startup its top carbon credit supplier ===

=== Extracting from article 1: $120 Million Exploit Has Chilling Effect on Entire Crypto Ecosystem ===

=== Extracting from article 2: Lyft CEO David Risher on paying drivers more and the shift to robotaxis ===

=== Extracting from article 3: COP30: World leaders take aim at Trump for climate inaction ===

=== Extracting from article 4: COP30: Can Brazil summit get climate negotiations back on track? ===


(17,
 [{'head': 'Google',
   'head_type': 'Company',
   'relation': 'finance',
   'tail': 'restoration of the Amazon rainforest',
   'tail_type': 'Event',
   'evidence': 'Google has struck its biggest carbon removal deal, agreeing to finance restoration of the Amazon rainforest',
   'article_id': 0,
   'source': 'Yahoo Entertainment',
   'published_at': '2025-11-06T10:01:59Z'},
  {'head': 'Google',
   'head_type': 'Company',
   'relation': 'partners_with',
   'tail': 'Mombak',
   'tail_type': 'Company',
   'evidence': 'Google has struck its biggest carbon removal deal, agreeing to finance restoration of the Amazon rainforest with Brazilian startup Mombak',
   'article_id': 0,
   'source': 'Yahoo Entertainment',
   'published_at': '2025-11-06T10:01:59Z'},
  {'head': 'Mombak',
   'head_type': 'Company',
   'relation': 'supplies',
   'tail': 'carbon credits',
   'tail_type': 'Product',
   'evidence': 'Brazilian startup its top carbon credit supplier',
   'article_id': 0,
   'source': 'Yah

### Data cleaning and Triplet Normalization

In [6]:
# Cell 6 – Clean and standardize triplets

triples_df = pd.DataFrame(all_triples)

def normalize_name(x):
    if not isinstance(x, str):
        return ""
    x = x.strip()
    # simple normalization – you can add more
    return x

for col in ["head", "tail", "relation", "head_type", "tail_type"]:
    triples_df[col] = triples_df[col].astype(str).apply(normalize_name)

# Remove empty head or tail
triples_df = triples_df[(triples_df["head"] != "") & (triples_df["tail"] != "")]

# Drop duplicates
triples_df = triples_df.drop_duplicates(
    subset=["head", "relation", "tail", "article_id"]
).reset_index(drop=True)

print("Cleaned triples:", len(triples_df))
triples_df.head()

Cleaned triples: 17


Unnamed: 0,head,head_type,relation,tail,tail_type,evidence,article_id,source,published_at
0,Google,Company,finance,restoration of the Amazon rainforest,Event,Google has struck its biggest carbon removal d...,0,Yahoo Entertainment,2025-11-06T10:01:59Z
1,Google,Company,partners_with,Mombak,Company,Google has struck its biggest carbon removal d...,0,Yahoo Entertainment,2025-11-06T10:01:59Z
2,Mombak,Company,supplies,carbon credits,Product,Brazilian startup its top carbon credit supplier,0,Yahoo Entertainment,2025-11-06T10:01:59Z
3,Balancer,Company,exploited_in,$120 Million Exploit,Event,"Balancer, which is a decentralized finance (De...",1,Gizmodo.com,2025-11-03T20:00:33Z
4,$120 Million Exploit,Event,impacts,crypto ecosystem,Market,$120 Million Exploit Has Chilling Effect on En...,1,Gizmodo.com,2025-11-03T20:00:33Z


### Connect to Neo4j

In [8]:
# Cell 7 – Connect to Neo4j

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Optional: test query
graph.run("RETURN 'Neo4j connection OK' AS msg").to_table()

msg
Neo4j connection OK


### Graph construct

In [9]:
# Cell 8 – Create nodes and relationships in Neo4j

def upsert_triplet_to_neo4j(row):
    head_name = row["head"]
    tail_name = row["tail"]
    rel = row["relation"]
    head_type = row.get("head_type", "Entity")
    tail_type = row.get("tail_type", "Entity")
    evidence = row.get("evidence", "")
    source = row.get("source", "")
    published_at = row.get("published_at", "")
    
    cypher = """
    MERGE (h:Entity {name: $head_name})
      ON CREATE SET h.type = $head_type
      ON MATCH SET  h.type = coalesce(h.type, $head_type)
    MERGE (t:Entity {name: $tail_name})
      ON CREATE SET t.type = $tail_type
      ON MATCH SET  t.type = coalesce(t.type, $tail_type)
    MERGE (h)-[r:RELATION {name: $rel}]->(t)
      ON CREATE SET r.evidence = $evidence,
                    r.source = $source,
                    r.published_at = $published_at
    """
    
    graph.run(
        cypher,
        head_name=head_name,
        tail_name=tail_name,
        rel=rel,
        head_type=head_type,
        tail_type=tail_type,
        evidence=evidence,
        source=source,
        published_at=published_at,
    )

for _, r in triples_df.iterrows():
    upsert_triplet_to_neo4j(r)

print("Inserted into Neo4j!")

Inserted into Neo4j!


In [10]:
# Cell 9 – Natural Language to Cypher query

NL2CYPHER_SYSTEM = """
You are an assistant that converts natural language questions
about a financial knowledge graph into Cypher queries for Neo4j.

The graph schema:
- Nodes: (:Entity {name, type})
- Relationships: (:Entity)-[r:RELATION {name, evidence, source, published_at}]->(:Entity)

Return only the Cypher query as plain text.
"""

def nl_to_cypher(question: str) -> str:
    user_prompt = f"""
User question:
\"\"\"{question}\"\"\"

Convert this to a Cypher query for Neo4j. Only output the Cypher query.
"""
    resp = client.responses.create(
        model="gpt-4.1-mini",
        input=[
            {"role": "system", "content": NL2CYPHER_SYSTEM},
            {"role": "user", "content": user_prompt},
        ],
    )
    cypher = resp.output[0].content[0].text.strip()
    return cypher

def ask_graph(question: str):
    cypher = nl_to_cypher(question)
    print("Generated Cypher:\n", cypher)
    try:
        result = graph.run(cypher)
        return result.to_data_frame()
    except Exception as e:
        print("Error running Cypher:", e)
        return None

# Example:
df_result = ask_graph("Show all companies that Amazon has acquired or invested in.")
df_result

Generated Cypher:
 MATCH (amazon:Entity {name: "Amazon", type: "Company"})-[r:RELATION]->(company:Entity)
WHERE r.name IN ["acquired", "invested in"] AND company.type = "Company"
RETURN company.name, r.name, r.evidence, r.source, r.published_at
