# Create dataset

In [1]:
import PyPDF2
import pandas as pd

In [29]:
def extract_document_outline(pdf_path):
    # Open the PDF file
    with open(pdf_path, 'rb') as file:
        # Create a PdfFileReader object
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Get the number of pages in the PDF
        num_pages = len(pdf_reader.pages)
        
        # Get the document outline
        document_outline = pdf_reader.outline
        
        # Initialize lists to store levels, titles, page numbers, and parent titles
        levels = []
        titles = []
        page_numbers = []
        parent_titles = []
        
        # Define a recursive function to traverse the document outline
        def traverse_outline(outline_items, level=0, parent_title=None):
            for item in outline_items:
                if isinstance(item, list):
                    # Handle nested outlines
                    traverse_outline(item, level + 1, parent_title)
                elif isinstance(item, dict):
                    # Extract title and page number
                    title = item.get('/Title')
                    page_number = item.get('/Page')
                    
                    if title and page_number:
                        # Append to the lists
                        levels.append(level)
                        titles.append(title)
                        page_numbers.append(pdf_reader.get_page_number(page_number) + 1)  # Adjust index
                        parent_titles.append(parent_title)
                    
                    # Update parent title for child items
                    parent_title = title
                    
        # Start traversing the document outline
        traverse_outline(document_outline)
        
        # Create a DataFrame from the extracted data
        df = pd.DataFrame({
            'Level': levels,
            'Title': titles,
            'Page Number': page_numbers,
            'Parent Title': parent_titles
        })
        
        return df

In [50]:
def extract_text_from_page(pdf_path, page_numbers):
    # Open the PDF file
    with open(pdf_path, 'rb') as file:
        # Create a PdfFileReader object
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Initialize list to store text
        all_text = []
        
        # Extract text from specified page range
        for page_num in page_numbers:
            page = pdf_reader.pages[page_num - 1]  # Adjust index
            text = page.extract_text()
            all_text.append(text)
        
        return all_text

In [51]:
def add_description(df, pdf_path):
    # Extract page numbers from the DataFrame
    page_numbers = df['Page Number'].tolist()
    
    # Extract text from each page
    all_text = extract_text_from_page(pdf_path, page_numbers)
    
    # Add description column to the DataFrame
    df['Description'] = all_text
    
    return df

In [27]:
os_book_path = './../dataset/pdf/OS_Main book.pdf'

In [45]:
document_outline_df = extract_document_outline(os_book_path)

In [46]:
document_outline_df

Unnamed: 0,Level,Title,Page Number,Parent Title
0,0,Cover,1,
1,0,Title Page,5,Cover
2,0,Copyright,6,Title Page
3,0,Preface,9,Copyright
4,0,Contents,23,Preface
...,...,...,...,...
905,2,Bibliography,1191,Further Reading
906,0,Credits,1193,PART TEN APPENDICES
907,0,Index,1195,Credits
908,0,Glossary,1237,Index


In [52]:
document_outline_df = add_description(document_outline_df, os_book_path)

In [53]:
document_outline_df

Unnamed: 0,Level,Title,Page Number,Parent Title,Description
0,0,Cover,1,,
1,0,Title Page,5,Cover,OPERATING\nSYSTEM\nCONCEPTS\nABRAHAM SILBERSCH...
2,0,Copyright,6,Title Page,Publisher Laurie Rosatone \nEditorial Direc...
3,0,Preface,9,Copyright,Preface\nOperating systems are an essential pa...
4,0,Contents,23,Preface,Contents\nPART ONE\n OVERVIEW\nChapter 1 Intro...
...,...,...,...,...,...
905,2,Bibliography,1191,Further Reading,Further Reading 25\nMach uses lightweight proc...
906,0,Credits,1193,PART TEN APPENDICES,Credits\n•Figure 1.14: From Hennesy and Patter...
907,0,Index,1195,Credits,"Index\n4-byte pages, 363, 364\n32-byte memory,..."
908,0,Glossary,1237,Index,G-150-percent rule A statistical fi nding th...


In [55]:
document_outline_df.to_csv('document_outline.csv', index=False)