In [15]:
!pip install gradio spacy rdflib PyPDF2 pytesseract fitz
!python -m spacy download en_core_web_sm
!pip install gradio spacy rdflib PyMuPDF networkx matplotlib
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m66.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation success

In [17]:
import gradio as gr
import fitz  # PyMuPDF
import spacy
import networkx as nx
import matplotlib.pyplot as plt
from rdflib import Graph, URIRef, Literal, Namespace, RDF, RDFS

# Load the small English spaCy pipeline once, at cell execution time, so that
# extract_entities_and_relationships can reuse the same module-level `nlp` object.
nlp = spacy.load("en_core_web_sm")

# Function for reading PDF and extracting text
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    The document is opened with PyMuPDF (``fitz``), the text of each page is
    concatenated in page order, and leading/trailing whitespace is stripped
    from the combined string before it is returned.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts).strip()

# Function for extracting entities and relationships
def extract_entities_and_relationships(text):
    """Run the spaCy pipeline over ``text`` and collect entities and verb links.

    Returns a 2-tuple ``(entities, relationships)``:

    * ``entities`` -- list of ``(entity_text, entity_label)`` pairs, one per
      named entity found in the parsed document.
    * ``relationships`` -- list of ``(head_text, token_text)`` pairs, one for
      each token whose dependency label is ``nsubj``, ``dobj`` or ``prep``
      and whose syntactic head is tagged as a verb.
    """
    parsed = nlp(text)

    named_entities = [(span.text, span.label_) for span in parsed.ents]

    linking_deps = ["nsubj", "dobj", "prep"]
    verb_links = [
        (tok.head.text, tok.text)
        for tok in parsed
        if tok.dep_ in linking_deps and tok.head.pos_ == "VERB"
    ]

    return named_entities, verb_links

# Function to create a knowledge graph
def create_knowledge_graph(entities, relationships):
    """Build an rdflib ``Graph`` from extracted entities and relationships.

    Args:
        entities: iterable of ``(entity_text, entity_label)`` pairs.
        relationships: iterable of ``(subject_text, object_text)`` pairs.

    Returns:
        An rdflib ``Graph`` in which every entity is a resource under the
        ``http://example.org/`` namespace, typed as ``rdfs:Class`` and carrying
        its label as ``rdfs:label``. A relationship is added as an
        ``rdfs:seeAlso`` triple only when both of its ends are known entities.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    graph = Graph()
    EX = Namespace("http://example.org/")
    graph.bind("ex", EX)

    entity_uris = {}

    # Add entities to the graph
    for entity, label in entities:
        # Entity text comes straight from document text and may contain
        # characters (quotes, angle brackets, braces, backslashes, ...) that are
        # not legal inside a URI. rdflib cannot serialize such URIRefs, so the
        # local name is percent-encoded. Spaces still become underscores, so
        # names made only of letters, digits and "_.-~" are unchanged.
        local_name = quote(entity.replace(" ", "_"), safe="")
        entity_uri = URIRef(EX[local_name])
        entity_uris[entity] = entity_uri
        graph.add((entity_uri, RDF.type, RDFS.Class))
        graph.add((entity_uri, RDFS.label, Literal(label)))

    # Add relationships to the graph (only between known entities)
    for subj, obj in relationships:
        if subj in entity_uris and obj in entity_uris:
            graph.add((entity_uris[subj], RDFS.seeAlso, entity_uris[obj]))

    return graph

# Function to visualize the graph using NetworkX
def visualize_graph(entities, relationships, output_path="knowledge_graph.png"):
    """Draw the entities and relationships as a directed graph and save it.

    Args:
        entities: iterable of ``(entity_text, entity_label)`` pairs; each
            entity becomes a node with the label stored as a node attribute.
        relationships: iterable of ``(source_text, target_text)`` pairs; each
            becomes a directed edge (missing nodes are created by NetworkX).
        output_path: where the rendered image is written. Defaults to
            ``"knowledge_graph.png"``, the path the original code used.

    Returns:
        None. The figure is written to ``output_path`` and shown via
        ``plt.show()``.
    """
    node_size = 3000

    g = nx.DiGraph()

    # Add entities as nodes
    for entity, label in entities:
        g.add_node(entity, label=label)

    # Add relationships as edges
    for subj, obj in relationships:
        g.add_edge(subj, obj)

    # Use an explicit figure/axes pair so the figure can be closed reliably.
    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        pos = nx.spring_layout(g)
        nx.draw_networkx_nodes(
            g, pos, ax=ax, node_size=node_size, node_color="lightblue", alpha=0.8
        )
        # Passing node_size lets NetworkX stop arrows at the node boundary
        # instead of hiding the arrowheads underneath the large nodes.
        nx.draw_networkx_edges(
            g, pos, ax=ax, node_size=node_size, edge_color="gray", arrows=True
        )
        nx.draw_networkx_labels(g, pos, ax=ax, font_size=10, font_weight="bold")
        ax.set_title("Knowledge Graph Visualization", fontsize=14)
        ax.axis("off")

        # Save the visualization as an image
        fig.savefig(output_path)
        plt.show()
    finally:
        # This function runs once per processed document; without an explicit
        # close every call would leave another open figure in memory.
        plt.close(fig)

# Process document to extract and display results
def process_document(pdf_file):
    """Run the full pipeline on an uploaded PDF and return the results.

    Args:
        pdf_file: the value supplied by the ``gr.File`` input. It may be an
            object exposing the file path as ``.name`` or a plain path string;
            both are accepted. ``None`` means nothing was uploaded.

    Returns:
        A dict with the keys ``"Extracted Text"``, ``"Entities"``,
        ``"Relationships"`` and ``"Knowledge Graph (Turtle Format)"``.

    Raises:
        gr.Error: if no file was uploaded, so the UI shows a readable message
            instead of an ``AttributeError`` traceback.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file before processing.")

    # Accept either a file-like object with a `.name` path or a bare path.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    text = extract_text_from_pdf(pdf_path)
    entities, relationships = extract_entities_and_relationships(text)
    graph = create_knowledge_graph(entities, relationships)

    # Visualize the graph
    visualize_graph(entities, relationships)

    # Serialize the knowledge graph to Turtle format
    turtle_data = graph.serialize(format="turtle")
    # Older rdflib releases return bytes here, which cannot be JSON-encoded.
    if isinstance(turtle_data, bytes):
        turtle_data = turtle_data.decode("utf-8")

    return {
        "Extracted Text": text,
        "Entities": entities,
        "Relationships": relationships,
        "Knowledge Graph (Turtle Format)": turtle_data
    }

# Gradio Interface
def gradio_interface():
    """Assemble and return the Gradio Blocks UI for the graph builder.

    The layout consists of two Markdown headers, a PDF upload widget, a JSON
    results panel and a button; clicking the button feeds the uploaded file to
    ``process_document`` and renders its return value in the JSON panel.
    """
    with gr.Blocks() as demo:
        # Components are created in display order inside the Blocks context.
        gr.Markdown("# Automated Knowledge Graph Builder")
        gr.Markdown("Upload a structured document to extract entities, relationships, and generate a knowledge graph.")

        uploader = gr.File(label="Upload PDF")
        results_panel = gr.JSON(label="Results")

        run_button = gr.Button("Process Document")
        run_button.click(process_document, inputs=uploader, outputs=results_panel)

    return demo

# Build the UI and start the Gradio server. No arguments are passed to
# launch(); the recorded output below shows that in Colab this results in
# share=True being set automatically and a public URL being created.
interface = gradio_interface()
interface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://dfe2134a301357a372.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


