<a href="https://colab.research.google.com/github/JocelynAbey/JocelynAbey/blob/main/PDFTOJSON.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import re
import json
import logging
from pathlib import Path
from typing import List, Dict, Tuple

import pandas as pd
# PDF processing libraries (with auto-install, useful in Colab)
try:
    import PyPDF2
    import pdfplumber
    import fitz  # PyMuPDF
except ImportError:
    import subprocess
    import sys

    packages = ["PyPDF2", "pdfplumber", "PyMuPDF"]
    for package in packages:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    import PyPDF2
    import pdfplumber
    import fitz

# Logging setup: timestamped INFO-level records on the root logger, plus a
# module-level logger used by the code below.
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)
class PDFToDatasetConverter:
    """
    Converts PDF documents containing Q&A pairs into chatbot training datasets.
    """

    def __init__(self):
        # Regex patterns for different Q&A formats
        self.qa_patterns = [
            # Pattern 1: Qnum: ... Anum: ...   (Q1: ... A1: ...)
            r"Q(\d+):\s*(.+?)\s*A\1:\s*(.+?)(?=Q\d+:|$)",

            # Pattern 2: Q: ... A: ...
            r"Q\s*:\s*(.+?)\s*A\s*:\s*(.+?)(?=Q\s*:|$)",

            # Pattern 3: Question: ... Answer: ...
            r"Question\s*:\s*(.+?)\s*Answer\s*:\s*(.+?)(?=Question\s*:|$)",

            # Pattern 4: Simple Q&A without prefixes (any sentence ending with ? and following text)
            r"([^.!?]*\?)\s*([^?]+?)(?=[^.!?]*\?|$)",
        ]

        # Category keywords for automatic categorization
        self.category_keywords = {
            "departments": [
                "department", "dept", "faculty", "school", "division",
                "cse", "ece", "it", "civil", "ere"
            ],
            "fees": [
                "fee", "cost", "tuition", "payment", "charge", "price",
                "amount", "money", "rupees", "₹"
            ],
            "facilities": [
                "library", "canteen", "hostel", "parking", "gym", "lab",
                "toilet", "restroom", "wifi"
            ],
            "contact": [
                "contact", "phone", "email", "address", "office",
                "reception", "call", "reach"
            ],
            "admissions": [
                "admission", "application", "entrance", "exam", "keam",
                "apply", "eligibility"
            ],
            "programs": [
                "program", "course", "degree", "btech", "mtech", "diploma",
                "engineering"
            ],
            "location": [
                "where", "located", "address", "place", "building",
                "floor", "room"
            ],
            "timing": [
                "time", "hours", "schedule", "timing", "open", "close", "when"
            ],
            "leadership": [
                "principal", "hod", "head", "director", "dean",
                "faculty", "professor"
            ],
            "placements": [
                "placement", "job", "career", "company", "recruiter",
                "employment"
            ],
            "activities": [
                "club", "association", "society", "event", "activity", "sports"
            ],
        }

    # ---------------- PDF TEXT EXTRACTION ---------------- #

    def extract_text_pypdf2(self, pdf_path: str) -> str:
        """Extract text using PyPDF2."""
        try:
            with open(pdf_path, "rb") as file:
                reader = PyPDF2.PdfReader(file)
                text = ""
                for page in reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
                return text
        except Exception as e:
            logger.error(f"PyPDF2 extraction failed: {e}")
            return ""

    def extract_text_pdfplumber(self, pdf_path: str) -> str:
        """Extract text using pdfplumber (better for complex layouts)."""
        try:
            text = ""
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
            return text
        except Exception as e:
            logger.error(f"pdfplumber extraction failed: {e}")
            return ""

    def extract_text_pymupdf(self, pdf_path: str) -> str:
        """Extract text using PyMuPDF (fast and accurate)."""
        try:
            doc = fitz.open(pdf_path)
            text = ""
            for page in doc:
                text += page.get_text() + "\n"
            doc.close()
            return text
        except Exception as e:
            logger.error(f"PyMuPDF extraction failed: {e}")
            return ""

    def extract_text_from_pdf(self, pdf_path: str, method: str = "auto") -> str:
        """
        Extract text from PDF using specified method.

        method: 'auto', 'pymupdf', 'pdfplumber', or 'pypdf2'
        """
        logger.info(f"Extracting text from {pdf_path} using method: {method}")

        if method == "auto":
            methods = [
                ("pymupdf", self.extract_text_pymupdf),
                ("pdfplumber", self.extract_text_pdfplumber),
                ("pypdf2", self.extract_text_pypdf2),
            ]
            for method_name, func in methods:
                text = func(pdf_path)
                if text and text.strip():
                    logger.info(f"Successfully extracted text using {method_name}")
                    return text
            logger.error("All extraction methods failed")
            return ""

        elif method == "pymupdf":
            return self.extract_text_pymupdf(pdf_path)
        elif method == "pdfplumber":
            return self.extract_text_pdfplumber(pdf_path)
        elif method == "pypdf2":
            return self.extract_text_pypdf2(pdf_path)
        else:
            raise ValueError(f"Unknown extraction method: {method}")

    # ---------------- TEXT CLEANING & Q&A EXTRACTION ---------------- #

    def clean_text(self, text: str) -> str:
        """Clean and normalize extracted text."""
        # Collapse multiple spaces/newlines into single space
        text = re.sub(r"\s+", " ", text)

        # (Optional) further cleaning can be added here if needed

        return text.strip()

    def clean_qa_text(self, text: str) -> str:
        """Clean individual question or answer text."""
        text = text.strip()

        prefixes = ["Q:", "A:", "Question:", "Answer:", "Ans:", "•", "-", "*"]
        for prefix in prefixes:
            if text.startswith(prefix):
                text = text[len(prefix):].strip()

        # Remove trailing colons
        text = text.rstrip(":").strip()

        # If there's a '?' inside but not at the end, move it to the end
        if "?" in text and not text.endswith("?"):
            text = text.replace("?", "")
            text = text.strip() + "?"

        return text

    def is_valid_qa_pair(self, question: str, answer: str) -> bool:
        """Validate if a Q&A pair is meaningful."""
        # Minimum length
        if len(question) < 5 or len(answer) < 5:
            return False

        # Maximum length (avoid gigantic chunks)
        if len(question) > 500 or len(answer) > 2000:
            return False

        # Question should look like a question
        question_indicators = [
            "what", "where", "when", "how", "who", "why", "which",
            "can", "is", "are", "do", "does", "?", "will", "shall"
        ]
        if not any(ind in question.lower() for ind in question_indicators):
            return False

        # Answer should be at least 3 words
        if len(answer.split()) < 3:
            return False

        return True

    def extract_qa_pairs(self, text: str) -> List[Tuple[str, str]]:
        """
        Extract Q&A pairs from text using multiple patterns.
        Returns a list of (question, answer) tuples.
        """
        qa_pairs: List[Tuple[str, str]] = []

        for pattern in self.qa_patterns:
            matches = re.findall(pattern, text, flags=re.IGNORECASE | re.DOTALL)
            for match in matches:
                # Handle numbered Q1/A1 pattern
                if len(match) == 3:
                    question, answer = match[1], match[2]
                elif len(match) == 2:
                    question, answer = match
                else:
                    continue

                question = self.clean_qa_text(question)
                answer = self.clean_qa_text(answer)

                if self.is_valid_qa_pair(question, answer):
                    qa_pairs.append((question, answer))

        # Remove duplicates while preserving order
        seen = set()
        unique_pairs: List[Tuple[str, str]] = []
        for q, a in qa_pairs:
            key = (q.lower().strip(), a.lower().strip())
            if key not in seen:
                seen.add(key)
                unique_pairs.append((q, a))

        logger.info(f"Extracted {len(unique_pairs)} unique Q&A pairs")
        return unique_pairs

    # ---------------- CATEGORIZATION & DATASET CREATION ---------------- #

    def categorize_qa_pair(self, question: str, answer: str) -> str:
        """Automatically categorize Q&A pair based on keywords."""
        text = (question + " " + answer).lower()

        category_scores: Dict[str, int] = {}
        for category, keywords in self.category_keywords.items():
            score = sum(1 for kw in keywords if kw in text)
            if score > 0:
                category_scores[category] = score

        if category_scores:
            # Return category with highest score
            return max(category_scores, key=category_scores.get)
        else:
            return "general"

    def create_dataset(self, qa_pairs: List[Tuple[str, str]]) -> List[Dict]:
        """Convert Q&A pairs to chatbot dataset format."""
        dataset: List[Dict] = []
        for question, answer in qa_pairs:
            category = self.categorize_qa_pair(question, answer)
            dataset.append(
                {
                    "question": question,
                    "answer": answer,
                    "category": category,
                }
            )
        return dataset

    # ---------------- DATASET ENHANCEMENT ---------------- #

    def generate_question_variations(self, question: str) -> List[str]:
        """Generate simple variations of a question."""
        variations = [question]

        transformations = [
            # Remove some initial question forms
            (r"^What is ", ""),
            (r"^Where is ", ""),
            (r"^How can I ", ""),
            (r"\?$", ""),
            # Add prefixes
            ("", "Tell me about "),
            ("", "Information about "),
            ("", "Details about "),
        ]

        for old_pattern, new_pattern in transformations:
            if old_pattern:
                new_q = re.sub(old_pattern, new_pattern, question, flags=re.IGNORECASE)
            else:
                new_q = new_pattern + question

            if new_q != question and new_q not in variations:
                variations.append(new_q)

        return variations

    def enhance_dataset(self, dataset: List[Dict]) -> List[Dict]:
        """Enhance dataset with question variations."""
        enhanced = list(dataset)  # copy
        variations_list = []

        for item in dataset:
            q = item["question"]
            a = item["answer"]
            c = item["category"]

            q_variations = self.generate_question_variations(q)
            for v in q_variations:
                if v != q:
                    variations_list.append(
                        {
                            "question": v,
                            "answer": a,
                            "category": c,
                        }
                    )

        enhanced.extend(variations_list)
        logger.info(f"Enhanced dataset from {len(dataset)} to {len(enhanced)} entries")
        return enhanced

    # ---------------- SAVE & STATS ---------------- #

    def save_dataset(self, dataset: List[Dict], output_path: str, format: str = "json"):
        """Save dataset to JSON or CSV."""
        format = format.lower()
        if format == "json":
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(dataset, f, indent=2, ensure_ascii=False)
        elif format == "csv":
            df = pd.DataFrame(dataset)
            df.to_csv(output_path, index=False, encoding="utf-8")
        else:
            raise ValueError(f"Unsupported format: {format}")

        logger.info(f"Dataset saved to {output_path} ({len(dataset)} entries)")

    def generate_statistics(self, dataset: List[Dict]) -> Dict:
        """Generate simple statistics about the dataset."""
        if not dataset:
            return {}

        categories = [item["category"] for item in dataset]
        category_counts: Dict[str, int] = {}
        for c in categories:
            category_counts[c] = category_counts.get(c, 0) + 1

        avg_q_len = sum(len(item["question"]) for item in dataset) / len(dataset)
        avg_a_len = sum(len(item["answer"]) for item in dataset) / len(dataset)

        stats = {
            "total_pairs": len(dataset),
            "categories": category_counts,
            "avg_question_length": round(avg_q_len, 2),
            "avg_answer_length": round(avg_a_len, 2),
        }
        return stats

    # ---------------- MAIN CONVERSION API ---------------- #

    def convert_pdf_to_dataset(
        self,
        pdf_path: str,
        output_path: str | None = None,
        format: str = "json",
        extraction_method: str = "auto",
        enhance: bool = True,
    ) -> List[Dict]:
        """
        High-level function: PDF -> dataset (list of dicts).
        Optionally saves to JSON/CSV if output_path is given.
        """
        logger.info("Starting PDF to dataset conversion")

        # Extract text
        text = self.extract_text_from_pdf(pdf_path, method=extraction_method)
        if not text:
            raise ValueError("Failed to extract text from PDF")

        # Clean text
        cleaned_text = self.clean_text(text)

        # Extract Q&A pairs
        qa_pairs = self.extract_qa_pairs(cleaned_text)
        if not qa_pairs:
            raise ValueError("No Q&A pairs found in PDF")

        # Convert to dataset
        dataset = self.create_dataset(qa_pairs)

        # Enhance (optional)
        if enhance:
            dataset = self.enhance_dataset(dataset)

        # Save if path provided
        if output_path:
            # If output_path has no extension, infer from format
            output_path = str(output_path)
            if "." not in Path(output_path).name:
                output_path = output_path + f".{format}"
            self.save_dataset(dataset, output_path, format=format)

        # Stats
        stats = self.generate_statistics(dataset)
        logger.info(f"Dataset statistics: {stats}")

        return dataset


In [7]:
from google.colab import files

# Opens Colab's file picker; select the PDF to convert from your computer.
# The result is keyed by the uploaded file name(s).
uploaded = files.upload()
uploaded_names = uploaded.keys()
print(uploaded_names)

Saving dataset2.pdf to dataset2.pdf
dict_keys(['dataset2.pdf'])


In [8]:
# Input PDF (must match the uploaded file name) and the output file, which is
# written relative to the working directory (/content/ on Colab).
pdf_path = "dataset2.pdf"
output_path = "chatbot_dataset.json"

# Conversion settings:
#   format  - "json", or "csv" for CSV output
#   enhance - set to False to skip the extra question variations
conversion_options = {
    "pdf_path": pdf_path,
    "output_path": output_path,
    "format": "json",
    "extraction_method": "auto",
    "enhance": True,
}

converter = PDFToDatasetConverter()
dataset = converter.convert_pdf_to_dataset(**conversion_options)

print("Done! JSON saved as:", output_path)
print("Total Q&A entries in dataset:", len(dataset))
dataset[:3]   # notebook display: first 3 entries


Done! JSON saved as: chatbot_dataset.json
Total Q&A entries in dataset: 1199


[{'question': 'What is LBS Institute of Technology for Women (LBSITW), Poojappura?',
  'answer': "LBS Institute of Technology for Women (LBSITW), Poojappura is a government women's engineering college located in Poojappura, Thiruvananthapuram, Kerala, India. Established in 2001, it is affiliated to APJ Abdul Kalam Technological University (KTU) and is the first government engineering college exclusively for women in Kerala, offering undergraduate, postgraduate, and doctoral programs in engineering disciplines.",
  'category': 'programs'},
 {'question': 'When was LBS Institute of Technology for Women (LBSITW), Poojappura established?',
  'answer': 'LBS Institute of Technology for Women (LBSITW), Poojappura was established in 2001 as a government cost-sharing institution under the LBS Centre for Science and Technology.',
  'category': 'departments'},
 {'question': 'What makes LBS Institute of Technology for Women (LBSITW), Poojappura unique in Kerala?',
  'answer': "LBSITW, Poojappura is

In [10]:
from google.colab import files

# Ask the browser to download the generated dataset file.
exported_file = "chatbot_dataset.json"
files.download(exported_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>