In [2]:
import networkx as nx
from pyvis.network import Network
import fitz  # PyMuPDF
import os
import re
import sys
import json 


def _collect_underline_rects(page, max_height=5):
    """Return the thin filled rectangles on ``page`` that may be underlines."""
    candidates = []
    for drawing in page.get_drawings():
        # Only pure fill paths (type 'f') are inspected; stroked paths are ignored.
        if drawing.get("type") != "f":
            continue
        for item in drawing.get("items", []):
            # item[0] is the drawing operator, item[1] the rectangle for "re" items.
            if item[0] == "re" and item[1].height < max_height:
                candidates.append(item[1])
    return candidates


def _is_underlined(line_bbox, underline_rects, max_gap=5):
    """Tell whether any candidate rectangle sits just below ``line_bbox``."""
    left, _top, right, bottom = line_bbox
    for rect in underline_rects:
        horizontal_overlap = min(right, rect.x1) - max(left, rect.x0)
        # The rectangle must share horizontal extent with the text and its top
        # edge must lie within ``max_gap`` units of the text's bottom edge.
        if horizontal_overlap > 0 and abs(rect.y0 - bottom) < max_gap:
            return True
    return False


def extract_structured_text_from_pdf(pdf_path):
    """
    Extract text lines from a PDF together with basic formatting information.

    Every text line found by PyMuPDF becomes one record. Underlines are not
    stored as a text attribute in the page dictionary, so they are detected
    geometrically: a thin filled rectangle that overlaps a line horizontally
    and lies right below it marks that line as underlined.

    Args:
        pdf_path: Path of the PDF file, passed unchanged to ``fitz.open``.

    Returns:
        A list of dictionaries, in reading order as reported by PyMuPDF, with
        the keys:
            "text"          - span texts of the line joined by single spaces,
                              stripped of surrounding whitespace
            "font"          - font name of the last span in the line
            "font_size"     - largest span size in the line
            "is_bold"       - True if any span's font name contains "bold"
            "is_underlined" - result of the geometric underline heuristic
            "page"          - 1-based page number
        Lines whose text is empty after stripping are omitted.

    Raises:
        Any exception raised by PyMuPDF while opening or reading the file is
        propagated unchanged.
    """
    doc = fitz.open(pdf_path)
    try:
        structured_text = []

        for page_number, page in enumerate(doc, start=1):
            # Underline candidates are per page; never reuse them across pages.
            underline_rects = _collect_underline_rects(page)

            for block in page.get_text("dict")["blocks"]:
                # Image blocks carry no "lines" entry and are skipped.
                for line in block.get("lines", []):
                    spans = line["spans"]
                    if not spans:
                        continue

                    line_text = " ".join(span["text"] for span in spans).strip()
                    if not line_text:
                        continue

                    # Union of the span bounding boxes gives the line's extent.
                    boxes = [span["bbox"] for span in spans]
                    line_bbox = (
                        min(box[0] for box in boxes),
                        min(box[1] for box in boxes),
                        max(box[2] for box in boxes),
                        max(box[3] for box in boxes),
                    )

                    structured_text.append({
                        "text": line_text,
                        "font": spans[-1]["font"],
                        "font_size": max(span["size"] for span in spans),
                        "is_bold": any("bold" in span["font"].lower() for span in spans),
                        "is_underlined": _is_underlined(line_bbox, underline_rects),
                        "page": page_number
                    })

        return structured_text
    finally:
        # Release the file handle even when extraction fails part-way.
        doc.close()

In [3]:
# Run the extractor on one PDF and keep the resulting line records in `hi`.
# NOTE(review): the path below is a hard-coded absolute Windows path, so this
# cell only works where that file exists — point it at your own PDF.
hi = extract_structured_text_from_pdf(r'C:\Users\bilas\OneDrive\Documents\GENAI\my_web\iitg.pdf')

In [4]:
# Persist the extracted line records as pretty-printed JSON (4-space indent)
# in output.json, relative to the current working directory.
serialized = json.dumps(hi, indent=4)
with open('output.json', 'w') as out_file:
    out_file.write(serialized)

In [None]:
from unstructured.partition.auto import partition
from spacy import load
import networkx as nx
import matplotlib.pyplot as plt
from typing import List, Tuple

def create_knowledge_graph(text_file: str) -> nx.Graph:
    """
    Build an entity co-occurrence graph from a document.

    The file is partitioned with Unstructured, the element texts are joined
    into one string, and spaCy's "en_core_web_sm" pipeline extracts named
    entities sentence by sentence.

    Args:
        text_file: Path of the document handed to ``partition``.

    Returns:
        An undirected graph whose nodes are entity texts (with the entity
        label stored in the ``type`` node attribute) and whose edges join each
        entity to the entity that precedes it in the same sentence.
    """
    pipeline = load("en_core_web_sm")

    # Flatten every partitioned element into a single space-separated string.
    pieces = partition(filename=text_file)
    parsed = pipeline(" ".join(map(str, pieces)))

    graph = nx.Graph()
    for sentence in parsed.sents:
        # Chain the entities of one sentence in order of appearance; the chain
        # restarts with every sentence, so no edge crosses a sentence boundary.
        previous_name = None
        for entity in sentence.ents:
            graph.add_node(entity.text, type=entity.label_)
            if previous_name is not None:
                # Two consecutive entities with identical text yield a self-loop.
                graph.add_edge(previous_name, entity.text)
            previous_name = entity.text

    return graph

def visualize_graph(G: nx.Graph, seed=42):
    """
    Draw the graph with a spring layout and show it with matplotlib.

    Args:
        G: Graph to draw; node names are used as labels.
        seed: Seed forwarded to ``nx.spring_layout`` so that the node
            positions are the same on every run. Pass ``None`` to get a
            fresh random layout each time.
    """
    plt.figure(figsize=(12, 8))
    # spring_layout starts from random positions; seeding keeps the figure
    # reproducible across kernel restarts.
    pos = nx.spring_layout(G, seed=seed)
    nx.draw(G, pos, with_labels=True, node_color='lightblue',
            node_size=1500, font_size=8)
    plt.title("Knowledge Graph")
    plt.show()

def main(file_path: str = "output.txt"):
    """
    Build the knowledge graph for one document and display it.

    Args:
        file_path: Document to process. The default is a placeholder —
            replace it with (or pass) the path of your own text file.
    """
    graph = create_knowledge_graph(file_path)
    visualize_graph(graph)

# Script-style entry point: calls main() when this code runs as the main module.
# NOTE(review): inside a notebook kernel __name__ is normally "__main__", so
# executing this cell runs main() immediately — confirm that is intended.
if __name__ == "__main__":
    main()

In [5]:
%pip install unstructured spacy networkx matplotlib

Collecting unstructured
  Downloading unstructured-0.17.2-py3-none-any.whl.metadata (24 kB)
Collecting filetype (from unstructured)
  Using cached filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Using cached python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured)
  Using cached emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured)
  Using cached python_iso639-2025.2.18-py3-none-any.whl.metadata (14 kB)
Collecting langdetect (from unstructured)
  Using cached langdetect-1.0.9.tar.gz (981 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting rapidfuzz (from unstructured)
  Downloading rapidfuzz-3.13.0-cp312-cp312-win_amd64.whl.metadata (12 kB)
Collecting backoff (from unstructured)
  Using cached backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting unstructured-client (from unstructured)
  D