In [None]:
pip install PyPDF2




In [None]:
import PyPDF2

def split_pdf_on_word(input_pdf_path, output_pdf1_path, output_pdf2_path, split_word="SOLUTIONS"):
    """Split a PDF in two at the first page whose text contains ``split_word``.

    Pages before the matching page are written to ``output_pdf1_path``; the
    matching page and every page after it are written to ``output_pdf2_path``.
    If the word is found on the very first page, the first output contains no
    pages. Progress and failures are reported with ``print``; nothing is
    returned and no exception propagates to the caller.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf1_path: Destination for the pages before the split page.
        output_pdf2_path: Destination for the split page and all later pages.
        split_word: Case-sensitive substring that marks the split page.
    """
    try:
        # The input stream stays open until both outputs are written, because
        # the page objects are still backed by the reader at write time.
        with open(input_pdf_path, 'rb') as input_pdf:
            pdf_reader = PyPDF2.PdfReader(input_pdf)
            pages = list(pdf_reader.pages)

            split_index = None
            for i, page in enumerate(pages):
                # Treat a page with no extractable text as empty so that a
                # None result cannot abort the whole scan with a TypeError.
                if split_word in (page.extract_text() or ""):
                    split_index = i
                    break

            if split_index is None:
                print(f"The word '{split_word}' was not found in the PDF.")
                return

            # Assemble both writers before opening the destinations, so a
            # failure while collecting pages does not leave truncated files.
            pdf_writer1 = PyPDF2.PdfWriter()
            for page in pages[:split_index]:
                pdf_writer1.add_page(page)

            pdf_writer2 = PyPDF2.PdfWriter()
            for page in pages[split_index:]:
                pdf_writer2.add_page(page)

            with open(output_pdf1_path, 'wb') as output_pdf1, open(output_pdf2_path, 'wb') as output_pdf2:
                pdf_writer1.write(output_pdf1)
                pdf_writer2.write(output_pdf2)

            print(f"PDF split successfully into '{output_pdf1_path}' and '{output_pdf2_path}'.")
    except Exception as e:
        # Best-effort notebook helper: report the failure instead of raising.
        print(f"An error occurred: {e}")

# Example usage: the source PDF and the two files the split halves are written to.
input_pdf_path = "Age.pdf"
output_pdf1_path = "output_part1.pdf"
output_pdf2_path = "output_part2.pdf"

# Keyword arguments make it explicit which path plays which role.
split_pdf_on_word(
    input_pdf_path=input_pdf_path,
    output_pdf1_path=output_pdf1_path,
    output_pdf2_path=output_pdf2_path,
)


PDF split successfully into 'output_part1.pdf' and 'output_part2.pdf'.


In [None]:
pip install pdfplumber



In [None]:
import pdfplumber
import re

def extract_ordered_list_from_pdf(pdf_path, question_pattern=r"^\d+\.", option_pattern=r"^[a-d]\."):
    """Extract numbered questions and their lettered options from a PDF.

    Every page is cut vertically in half and the left half's text is read
    before the right half's, i.e. the page is treated as two columns. Lines
    are then classified with ``re.match``:

    * a line matching ``question_pattern`` starts a new question;
    * a line matching ``option_pattern`` is added to the current question's
      options;
    * any other non-blank line is a continuation: it is appended to the most
      recent option if the question already has one, otherwise to the
      question text.

    Lines seen before the first question are ignored. A question may span a
    column or page boundary.

    Args:
        pdf_path: Path of the PDF to read.
        question_pattern: Regex matched at the start of a stripped line to
            detect a question.
        option_pattern: Regex matched at the start of a stripped line to
            detect an option.

    Returns:
        A list of ``{"question": str, "options": list[str]}`` dicts in the
        order the questions were encountered.
    """
    extracted_data = []
    current_question = None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            half_width = page.width / 2
            column_boxes = (
                (0, 0, half_width, page.height),
                (half_width, 0, page.width, page.height),
            )
            # A column without extractable text contributes an empty string.
            column_texts = [page.within_bbox(box).extract_text() or "" for box in column_boxes]

            for raw_line in "\n".join(column_texts).split("\n"):
                line = raw_line.strip()
                if not line:
                    # Blank lines carry no content; appending them would only
                    # inject stray spaces into the collected text.
                    continue

                if re.match(question_pattern, line):
                    if current_question:
                        extracted_data.append(current_question)
                    current_question = {"question": line, "options": []}
                elif current_question is None:
                    # Text before the first question (titles, headers) is skipped.
                    continue
                elif re.match(option_pattern, line):
                    current_question["options"].append(line)
                elif current_question["options"]:
                    # A wrapped option line belongs to the option it continues,
                    # not to the question text.
                    current_question["options"][-1] += " " + line
                else:
                    current_question["question"] += " " + line

    if current_question:
        extracted_data.append(current_question)

    return extracted_data


# Example usage: read the questions from the PDF that contains them.
pdf_path = "output_part1.pdf"
questions_with_options = extract_ordered_list_from_pdf(pdf_path)

# Write one block per question for manual review: a "Question N:" header,
# the question text, its options indented by two spaces, then a blank line.
with open("extracted_questions_and_options.txt", "w", encoding="utf-8") as file:
    for i, item in enumerate(questions_with_options, start=1):
        option_lines = "".join(f"  {option}\n" for option in item['options'])
        file.write(f"Question {i}:\n{item['question']}\n{option_lines}\n")

print(f"Extracted {len(questions_with_options)} questions with options. Saved to 'extracted_questions_and_options.txt'.")


Extracted 31 questions with options. Saved to 'extracted_questions_and_options.txt'.


In [None]:
import pdfplumber
import re

def extract_answers_from_pdf(pdf_path, answer_numbering_pattern=r"^\d+\.", continuation_pattern=None):
    """Collect numbered answers from a PDF read as two columns per page.

    Each page is halved vertically and the left half's text is read before the
    right half's. A stripped line matching ``answer_numbering_pattern`` (via
    ``re.match``) opens a new answer; every other line is appended, separated
    by a single space, to the answer currently open. Lines seen before the
    first numbered line are dropped. An answer may continue across columns
    and pages.

    Args:
        pdf_path: Path of the PDF to read.
        answer_numbering_pattern: Regex that identifies the start of an answer.
        continuation_pattern: Accepted but not used by this function.

    Returns:
        A dict mapping the matched numbering text (e.g. ``"1."``) to the full
        stripped answer text, numbering included. When the same numbering
        occurs more than once, the later answer replaces the earlier one.
    """
    collected = {}
    label = None      # numbering text of the answer currently being built
    fragments = []    # its lines, joined with single spaces when stored

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_width = page.width
            page_height = page.height
            midpoint = page_width / 2

            left_column = page.within_bbox((0, 0, midpoint, page_height)).extract_text() or ""
            right_column = page.within_bbox((midpoint, 0, page_width, page_height)).extract_text() or ""
            page_text = "\n".join((left_column, right_column)).strip()

            for raw in page_text.split("\n"):
                text = raw.strip()
                numbered = re.match(answer_numbering_pattern, text)

                if numbered is None:
                    # Continuation line: only meaningful once an answer is open.
                    if label is not None:
                        fragments.append(text)
                    continue

                # A new numbered line closes the answer that was in progress.
                if label is not None:
                    collected[label] = " ".join(fragments).strip()
                label = numbered.group().strip()
                fragments = [text]

    # Store whatever answer was still open when the document ended.
    if label is not None:
        collected[label] = " ".join(fragments).strip()

    return collected


# Example usage: read the answers from the PDF that contains them.
pdf_path = "output_part2.pdf"
answers = extract_answers_from_pdf(pdf_path)

# Write one block per answer for manual review: an "Answer <numbering>:"
# header, the answer text, then a blank line.
with open("extracted_answers.txt", "w", encoding="utf-8") as file:
    file.writelines(
        f"Answer {answer_number}:\n{answer_text}\n\n"
        for answer_number, answer_text in answers.items()
    )

print(f"Extracted {len(answers)} answers. Saved to 'extracted_answers.txt'.")


Extracted 31 answers. Saved to 'extracted_answers.txt'.


In [None]:
def merge_questions_and_answers(questions_file, answers_file, output_file):
    """Interleave question blocks and answer blocks into a single text file.

    Both input files are read as UTF-8 and divided into blocks on blank lines
    (``"\\n\\n"``). Blocks are paired by position: the i-th question block is
    written, then the i-th answer block, each followed by a blank line. When
    one file has fewer blocks than the other, a warning is printed and a
    "(Missing question)" / "(Missing answer)" placeholder is written for each
    absent entry.

    Args:
        questions_file: Path of the text file holding the question blocks.
        answers_file: Path of the text file holding the answer blocks.
        output_file: Path of the merged text file to create or overwrite.
    """
    def _read_blocks(path):
        # Drop empty blocks: splitting an empty file yields [""], which would
        # otherwise be counted and written as one blank entry.
        with open(path, "r", encoding="utf-8") as handle:
            return [block for block in handle.read().strip().split("\n\n") if block.strip()]

    questions = _read_blocks(questions_file)
    answers = _read_blocks(answers_file)

    if len(questions) != len(answers):
        print(f"Warning: Number of questions ({len(questions)}) does not match number of answers ({len(answers)}).")

    with open(output_file, "w", encoding="utf-8") as out_file:
        for i in range(max(len(questions), len(answers))):
            if i < len(questions):
                out_file.write(f"{questions[i]}\n\n")
            else:
                out_file.write(f"Question {i + 1}:\n(Missing question)\n\n")

            if i < len(answers):
                out_file.write(f"{answers[i]}\n\n")
            else:
                out_file.write(f"Answer {i + 1}:\n(Missing answer)\n\n")

    print(f"Merged content written to '{output_file}'.")


# Example usage: the two extracted text files and the merged file to produce.
questions_file = "extracted_questions_and_options.txt"
answers_file = "extracted_answers.txt"
output_file = "merged_questions_and_answers.txt"

# Keyword arguments make it explicit which path plays which role.
merge_questions_and_answers(
    questions_file=questions_file,
    answers_file=answers_file,
    output_file=output_file,
)


Merged content written to 'merged_questions_and_answers.txt'.


In [None]:
pip install reportlab




In [None]:
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer

def text_to_pdf_reportlab(input_text_file, output_pdf_file):
    """Render a UTF-8 text file as a letter-size PDF, one paragraph per line.

    Every line of the input (blank lines included) is stripped, turned into a
    word-wrapped paragraph in 12 pt Helvetica and followed by a 12-point
    vertical spacer. A confirmation message is printed once the PDF is built.

    Args:
        input_text_file: Path of the text file to read.
        output_pdf_file: Path of the PDF to create.
    """
    # Local import keeps this cell self-contained; stdlib only.
    from xml.sax.saxutils import escape

    doc = SimpleDocTemplate(output_pdf_file, pagesize=letter)

    styles = getSampleStyleSheet()
    normal_style = styles['Normal']
    normal_style.fontName = 'Helvetica'
    normal_style.fontSize = 12

    content = []
    with open(input_text_file, "r", encoding="utf-8") as file:
        for line in file:
            # Paragraph interprets its text as XML-like markup, so literal
            # '<', '>' and '&' from the source text must be escaped to be
            # rendered as-is instead of being parsed as tags/entities.
            content.append(Paragraph(escape(line.strip()), normal_style))
            # Vertical gap after each line; raise the second value for more space.
            content.append(Spacer(1, 12))

    doc.build(content)
    print(f"PDF created successfully: {output_pdf_file}")


# Example usage: convert the merged text file into a PDF.
input_text_file = "merged_questions_and_answers.txt"
output_pdf_file = "merged_questions_and_answers.pdf"

text_to_pdf_reportlab(
    input_text_file=input_text_file,
    output_pdf_file=output_pdf_file,
)


PDF created successfully: merged_questions_and_answers.pdf
