In [None]:
!pip install pymupdf spacy
!python -m spacy download en_core_web_sm


Collecting pymupdf
  Downloading PyMuPDF-1.24.3-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.3 pymupdf-1.24.3
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Res

In [1]:
import fitz  # PyMuPDF
import spacy
from nltk.corpus import wordnet

# Fetch the WordNet corpus: is_transitive() below queries verb synsets through
# nltk.corpus.wordnet. This call runs each time the cell is executed.
import nltk
nltk.download('wordnet')

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` result, in page order.
        Empty string when the document yields no pages.
    """
    # Open the document as a context manager so it is closed (and its file
    # handle released) even if extracting a page raises.
    with fitz.open(pdf_path) as doc:
        # A single join avoids rebuilding the string once per page.
        return "".join(page.get_text() for page in doc)

# Function to split text into sentences
def split_text_into_sentences(text):
    """Segment ``text`` into sentences with the ``en_core_web_sm`` pipeline.

    Args:
        text: The string to segment.

    Returns:
        A list with the ``.text`` of each sentence span, in document order.
    """
    pipeline = spacy.load("en_core_web_sm")
    parsed = pipeline(text)
    pieces = []
    for span in parsed.sents:
        pieces.append(span.text)
    return pieces

# Function to analyze sentence for conditional ambiguity
def analyze_conditional_ambiguity(sentence):
    """Report conditional clauses in ``sentence`` that may be ambiguous.

    A conditional clause is located through a token whose dependency label is
    ``"mark"`` and whose lower-cased text is one of the conditional
    conjunctions below; the clause is the subtree of that token's head.

    Two patterns are reported per clause:

    * a disjunctive word (``or``, ``either``, ...) inside the clause;
    * another conditional marker inside the clause (a nested conditional).

    Args:
        sentence: Text of a single sentence.

    Returns:
        A list of message strings, each ending with the clause text (tokens
        joined by single spaces). Empty when nothing is flagged.
    """
    # Loading the spaCy pipeline is expensive and this function runs once per
    # sentence, so the pipeline is loaded on first use and kept on the
    # function object for later calls.
    nlp = getattr(analyze_conditional_ambiguity, "_nlp", None)
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
        analyze_conditional_ambiguity._nlp = nlp
    doc = nlp(sentence)

    # NOTE(review): matching is done per token, so the multi-word entries
    # ("as long as", "in case", ...) presumably never match on their own;
    # "even if" / "only if" are still caught through their "if" token.
    conditional_conjunctions = {"if", "when", "unless", "provided", "as long as", "in case", "even if", "only if", "assuming"}
    disjunctive_conjunctions = {"or", "either", "nor", "neither", "alternatively"}
    ambiguities = []

    def is_conditional_marker(tok):
        # Same criterion for the clause's own marker and for nested ones.
        return tok.dep_ == "mark" and tok.text.lower() in conditional_conjunctions

    for token in doc:
        if not is_conditional_marker(token):
            continue
        condition_clause = list(token.head.subtree)
        condition_text = " ".join(t.text for t in condition_clause)
        if any(t.text.lower() in disjunctive_conjunctions for t in condition_clause):
            ambiguities.append("Disjunctive conjunction in condition: " + condition_text)
        # The subtree contains the clause's own marker; it must be skipped,
        # otherwise every conditional clause would be reported as nested.
        if any(t.i != token.i and is_conditional_marker(t) for t in condition_clause):
            ambiguities.append("Nested conditional: " + condition_text)

    return ambiguities

# Function to check if a verb is transitive
def is_transitive(verb):
    """Return True when WordNet lists at least one transitive use of ``verb``.

    WordNet lexnames for verbs are topical categories such as ``verb.motion``
    and never contain the word "transitive", so transitivity is read from the
    verb sentence frames instead: a frame such as "Somebody ----s something"
    places a noun-phrase object directly after the verb.

    Args:
        verb: The verb to look up (callers in this file pass a lemma).

    Returns:
        True if any verb synset found for ``verb`` has a lemma carrying one of
        the object-taking frames, otherwise False (including when WordNet has
        no verb synset for the word). Verbs with both transitive and
        intransitive senses therefore count as transitive.
    """
    # WordNet verb-frame numbers whose template has "something"/"somebody"
    # immediately after the verb (e.g. 8 "Somebody ----s something",
    # 14 "Somebody ----s somebody something").
    transitive_frames = {5, 8, 9, 10, 11, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 30, 31}
    # WordNet lemma names use underscores for multi-word entries.
    target = verb.lower().replace(" ", "_")

    for synset in wordnet.synsets(verb, pos=wordnet.VERB):
        lemmas = synset.lemmas()
        # Frames can be specific to one lemma of a synset, so prefer the lemma
        # that is the verb itself. If the lookup matched through a different
        # surface form, fall back to every lemma of the synset.
        own = [lemma for lemma in lemmas if lemma.name().lower() == target]
        for lemma in own or lemmas:
            if transitive_frames.intersection(lemma.frame_ids()):
                return True
    return False

# Function to analyze sentence for scope ambiguity
def analyze_scope_ambiguity(sentence):
    """Report transitive verbs in ``sentence`` that have no object attached.

    Every token tagged ``VERB`` whose lemma ``is_transitive`` accepts is
    checked for a child in the dependency parse that carries an object label.

    Args:
        sentence: Text of a single sentence.

    Returns:
        A list with one "Missing object for transitive verb: <verb>" message
        per verb lacking an object child; empty when nothing is flagged.
    """
    # Loading the spaCy pipeline is expensive and this function runs once per
    # sentence, so the pipeline is loaded on first use and kept on the
    # function object for later calls.
    nlp = getattr(analyze_scope_ambiguity, "_nlp", None)
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
        analyze_scope_ambiguity._nlp = nlp
    doc = nlp(sentence)

    # Direct and indirect object labels. "dative" and "obj" are accepted in
    # addition to the original "dobj"/"iobj" because label names vary between
    # annotation schemes.
    # NOTE(review): a passive verb keeps its logical object as a subject, so
    # it is still reported here — confirm whether that is wanted.
    object_deps = {"dobj", "obj", "iobj", "dative"}
    ambiguities = []

    for token in doc:
        if token.pos_ != "VERB" or not is_transitive(token.lemma_):
            continue
        # Look only at this verb's own dependents: an object belonging to a
        # different verb in the sentence must not satisfy this one.
        if not any(child.dep_ in object_deps for child in token.children):
            ambiguities.append("Missing object for transitive verb: " + token.text)

    return ambiguities

# Main function to process PDF and identify ambiguities
def process_pdf_for_ambiguities(pdf_path):
    """Run both ambiguity analyses over every sentence of a PDF.

    The PDF text is extracted, split into sentences, and each sentence is
    passed to the conditional and the scope analysis in that order.

    Args:
        pdf_path: Location of the PDF, forwarded to ``extract_text_from_pdf``.

    Returns:
        A list of dicts, one per sentence for which at least one analysis
        returned something, with the keys ``"sentence"``,
        ``"conditional_ambiguities"`` and ``"scope_ambiguities"``.
    """
    findings = []
    for sent in split_text_into_sentences(extract_text_from_pdf(pdf_path)):
        conditional = analyze_conditional_ambiguity(sent)
        scope = analyze_scope_ambiguity(sent)
        # Sentences for which neither analysis reported anything are dropped.
        if not (conditional or scope):
            continue
        record = {
            "sentence": sent,
            "conditional_ambiguities": conditional,
            "scope_ambiguities": scope
        }
        findings.append(record)
    return findings

# Example usage
# Replace the placeholder below with the location of the PDF to analyse.
pdf_path = "your_pdf.pdf"  # Path to your PDF file
ambiguities = process_pdf_for_ambiguities(pdf_path)

# One report per flagged sentence: three labelled lines and a blank separator.
for item in ambiguities:
    print(
        f"Sentence: {item['sentence']}",
        f"Conditional Ambiguities: {item['conditional_ambiguities']}",
        f"Scope Ambiguities: {item['scope_ambiguities']}",
        "",
        sep="\n",
    )


ModuleNotFoundError: No module named 'fitz'