# Turning Text into a Knowledge Graph with Python
This exercise shows how to transform raw text into a simple **Knowledge Graph** using Python, an LLM (e.g., OpenAI GPT), and visualization tools.

Steps:
1. Install dependencies  
2. Import libraries  
3. Provide text input  
4. Extract entities & relationships with an LLM  
5. Build the graph  
6. Visualize the graph  

In [1]:
#!pip install --upgrade langchain langchain-experimental langchain-openai python-dotenv pyvis
!pip install spacy networkx pyvis jinja2
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - -------------------------------------- 0.5/12.8 MB 1.5 MB/s eta 0:00:09
     - -------------------------------------- 0.5/12.8 MB 1.5 MB/s eta 0:00:09
     - -------------------------------------- 0.5/12.8 MB 1.5 MB/s eta 0:00:09
     - -------------------------------------- 0.5/12.8 MB 1.5 MB/s eta 0:00:09
     - -------------------------------------- 0.5/12.8 MB 1.5 MB/s eta 0:00:09
     - -------------------------------------- 0.5/12.8 MB 1.5 MB/s eta 0:00:09
     - -------------------------------------- 0.5/12.8 MB 1.5 

In [15]:
# Install required libraries
import spacy
import networkx as nx
import json
import os

In [16]:

from pyvis.network import Network
from IPython.display import display, HTML, IFrame
from IPython.display import IFrame

In [17]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def aconvert_to_graph_documents(text):
    """
    Convert a text into a graph document (entities + relations).
    Returns:
      - graph (NetworkX DiGraph)
      - triples (list of extracted (subject, relation, object))
    """
    doc = nlp(text)
    graph = nx.DiGraph()
    triples = []

    for sent in doc.sents:
        subj = None
        obj = None
        rel = None

        for token in sent:
            # Identify subject
            if "subj" in token.dep_:
                subj = token.text
                rel = token.head.lemma_  # use the verb governing the subject

            # Identify object
            if "obj" in token.dep_:
                obj = token.text
                rel = token.head.lemma_  # use the verb governing the object

        if subj and obj and rel:
            triples.append((subj, rel, obj))
            graph.add_node(subj)
            graph.add_node(obj)
            graph.add_edge(subj, obj, relation=rel)

    return graph, triples


In [18]:
# Build the graph from paragraph
text = """ Albert Einstein was a German-born theoretical physicist who developed the theory of relativity, one of the two pillars of modern physics. He also made significant contributions to the development of quantum mechanics, particularly through his explanation of the photoelectric effect. Later in life, Einstein became a professor at Princeton University and was affiliated with the Institute for Advanced Study in Princeton. His work has influenced countless scientists and shaped our understanding of space, time, and energy."""
graph, triples = aconvert_to_graph_documents(text)
# 5. Process text with spaCy
# -----------------------------
doc = nlp(text)

In [19]:
# Extract subject-relation-object triples OPTION #2
# -----------------------------
triples = []
for sent in doc.sents:
    subject = None
    obj = None
    relation = None

    # Use noun chunks for multi-word subjects/objects
    for chunk in sent.noun_chunks:
        if "subj" in chunk.root.dep_:
            subject = chunk.text
        elif "obj" in chunk.root.dep_:
            obj = chunk.text

    # Find main verb as relation
    for token in sent:
        if token.pos_ == "VERB":
            relation = token.lemma_

    if subject and obj and relation:
        triples.append((subject, relation, obj))

In [20]:

# Verify extracted triples
# -----------------------------
print("Extracted triples:")
for t in triples:
    print(t)
    
# Verify nodes and edges
# -----------------------------
print("\nGraph nodes:", list(graph.nodes()))
print("Graph edges with relations:")
for u, v, d in graph.edges(data=True):
    print(f"{u} -[{d['relation']}]-> {v}")


Extracted triples:
('who', 'develop', 'modern physics')
('He', 'make', 'the photoelectric effect')
('Einstein', 'affiliate', 'Princeton')
('His work', 'shape', 'space')

Graph nodes: ['who', 'physics', 'He', 'effect', 'Einstein', 'Princeton', 'work', 'space']
Graph edges with relations:
who -[of]-> physics
He -[of]-> effect
Einstein -[in]-> Princeton
work -[of]-> space


In [21]:
# Build directed graph OPTION #2
# -----------------------------
G = nx.DiGraph()
for subj, rel, obj in triples:
    G.add_node(subj)
    G.add_node(obj)
    G.add_edge(subj, obj, label=rel)

In [22]:
# Create PyVis interactive network OPTION #2
# -----------------------------
net = Network(notebook=True, height="600px", width="1000px", directed=True, cdn_resources='in_line')
for node in G.nodes:
    net.add_node(node, label=node, title=node)
for u, v, d in G.edges(data=True):
    net.add_edge(u, v, label=d['label'], title=d['label'])


In [23]:
# Add nodes from the graph
for node in graph.nodes:
    net.add_node(node, label=node, title=node)

# Add edges from the graph with proper labels
for u, v, d in graph.edges(data=True):
    # Use 'relation' key if present, else fallback to 'label'
    edge_label = d.get('relation', d.get('label', ''))
    net.add_edge(u, v, label=edge_label, title=edge_label)

In [24]:
# Render interactive graph inline
# -----------------------------
html_file = "knowledge_graph.html"
with open(html_file, "w", encoding="utf-8") as f:
    f.write(net.generate_html())

display(IFrame(html_file, width=900, height=600))

In [14]:
# Print triples for reference
# -----------------------------
print("Extracted triples:")
for t in triples:
    print(t)

Extracted triples:
('who', 'develop', 'modern physics')
('He', 'make', 'the photoelectric effect')
('Einstein', 'affiliate', 'Princeton')
('His work', 'shape', 'space')
