In [8]:
import os
import re

import fitz  # PyMuPDF
import pandas as pd

def extract_text_between_keywords(pdf_path, start_keyword, end_keyword, margin=50):
    """Return the text found between two keywords in a PDF.

    Every page is read as text blocks, ordered top to bottom. Blocks that
    start within ``margin`` units of the top of the page, or end within
    ``margin`` units of the bottom, are skipped as header/footer content.
    The remaining block texts are joined, one per line, and searched
    case-insensitively for the first ``start_keyword`` and then for the
    first ``end_keyword`` that follows it.

    Args:
        pdf_path: Path of the PDF file to read.
        start_keyword: Literal text marking the start of the section.
        end_keyword: Literal text marking the end of the section.
        margin: Height of the top/bottom band to ignore on each page,
            in page coordinate units. Defaults to 50.

    Returns:
        The stripped text between the two keywords, or ``None`` when the
        file cannot be read or either keyword is not found.
    """
    try:
        with fitz.open(pdf_path) as doc:
            parts = []
            for page in doc:
                bottom_limit = page.rect.height - margin
                # Sort top to bottom by the block's upper y coordinate.
                blocks = sorted(page.get_text("blocks"), key=lambda b: b[1])
                for block in blocks:
                    _x0, y0, _x1, y1, text, *_ = block
                    if y0 < margin or y1 > bottom_limit:
                        continue  # skip header/footer
                    parts.append(text.strip() + "\n")
    except Exception as e:
        # Deliberately best-effort: an unreadable file is reported and
        # skipped so that a batch run over many files can continue.
        print(f"Error reading {pdf_path}: {e}")
        return None

    full_text = "".join(parts)

    # Search the original text case-insensitively instead of searching a
    # lower-cased copy: str.lower() can change the length of some Unicode
    # strings, which would make offsets from the copy invalid for slicing
    # the original.
    start_match = re.search(re.escape(start_keyword), full_text, re.IGNORECASE)
    if start_match is None:
        return None

    end_pattern = re.compile(re.escape(end_keyword), re.IGNORECASE)
    # Only look for the end keyword after the end of the start keyword.
    end_match = end_pattern.search(full_text, start_match.end())
    if end_match is None:
        return None

    return full_text[start_match.end():end_match.start()].strip()


# === INPUT SECTION ===
# Folder that is scanned (non-recursively) for PDF files.
pdf_folder = "C://Users//DELL//Documents//My Tableau Repository//Workbooks//Practicum"
# Each pair is (start keyword, end keyword); the text between the two is
# extracted from every PDF.
keyword_pairs = [
    ("CHALLENGES FACED", "TEXT PREPROCESSING"),
    ("TEXT PREPROCESSING", "MODELLING"),
    ("MODELLING", "CONCLUSION"),
]

# === EXTRACTION ===
result_columns = ["File Name", "Start Keyword", "End Keyword", "Description"]
results = []
# os.listdir returns entries in arbitrary order; sorting keeps the row
# order of the report reproducible between runs.
for file in sorted(os.listdir(pdf_folder)):
    # Compare the extension case-insensitively so ".PDF" files are included.
    if not file.lower().endswith(".pdf"):
        continue
    file_path = os.path.join(pdf_folder, file)
    for start_kw, end_kw in keyword_pairs:
        desc = extract_text_between_keywords(file_path, start_kw, end_kw)
        results.append({
            "File Name": file,
            "Start Keyword": start_kw,
            "End Keyword": end_kw,
            # None (unreadable file / keyword missing) and an empty
            # section are both reported as "Not Found".
            "Description": desc or "Not Found",
        })

# === OUTPUT ===
# Fixing the columns keeps the CSV header present even when no PDF was found.
df = pd.DataFrame(results, columns=result_columns)
print(df)
df.to_csv("extracted_pdf_sections.csv", index=False)


                    File Name       Start Keyword         End Keyword  \
0  final peoject document.pdf    CHALLENGES FACED  TEXT PREPROCESSING   
1  final peoject document.pdf  TEXT PREPROCESSING           MODELLING   
2  final peoject document.pdf           MODELLING          CONCLUSION   
3  final project document.pdf    CHALLENGES FACED  TEXT PREPROCESSING   
4  final project document.pdf  TEXT PREPROCESSING           MODELLING   
5  final project document.pdf           MODELLING          CONCLUSION   

                                         Description  
0  The primary challenges in the project arise fr...  
1  The unstructured nature of the data can be tra...  
2  To classify the problem description provided b...  
3  The primary challenges in the project arise fr...  
4  The unstructured nature of the data can be tra...  
5  To classify the problem description provided b...  
