# Glossary Extraction

## Step 1: 
### Recreating the outline as made in hierarchical classifier:

In [5]:
import PyPDF2
import pandas as pd
from keybert import KeyBERT

def extract_text_from_page(pdf_path, page_numbers):
    # Open the PDF file
    with open(pdf_path, 'rb') as file:
        # Create a PdfFileReader object
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Initialize list to store text
        all_text = []
        
        # Extract text from specified page range
        for page_num in page_numbers:
            page = pdf_reader.pages[page_num - 1]  # Adjust index
            text = page.extract_text()
            all_text.append(text)
        
        return all_text
    

In [6]:
def extract_document_outline(pdf_path):
    # Open the PDF file
    with open(pdf_path, 'rb') as file:
        # Create a PdfFileReader object
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Get the number of pages in the PDF
        num_pages = len(pdf_reader.pages)
        
        # Get the document outline
        document_outline = pdf_reader.outline
        
        # Initialize lists to store levels, titles, page numbers, and parent titles
        levels = []
        titles = []
        page_numbers = []
        parent_titles = []
        
        # Define a recursive function to traverse the document outline
        def traverse_outline(outline_items, level=0, parent_title=None):
            for item in outline_items:
                if isinstance(item, list):
                    # Handle nested outlines
                    traverse_outline(item, level + 1, parent_title)
                elif isinstance(item, dict):
                    # Extract title and page number
                    title = item.get('/Title')
                    page_number = item.get('/Page')
                    
                    if title and page_number:
                        # Append to the lists
                        levels.append(level)
                        titles.append(title)
                        page_numbers.append(pdf_reader.get_page_number(page_number) + 1)  # Adjust index
                        parent_titles.append(parent_title)
                    
                    # Update parent title for child items
                    parent_title = title
                    
        # Start traversing the document outline
        traverse_outline(document_outline)
        
        # Create a DataFrame from the extracted data
        df = pd.DataFrame({
            'Level': levels,
            'Title': titles,
            'Page Number': page_numbers,
            'Parent Title': parent_titles
        })
        
        return df

## Step 2:
### Using KeyBERT model to extract glossary:

In [8]:
def add_keywords_to_df(df, pdf_path):
    # Initialize KeyBERT model
    keybert_model = KeyBERT('distilbert-base-nli-mean-tokens')
    
    # Initialize list to store keywords
    keywords_list = []
    
    # Extract text from PDF based on page numbers and add keywords to DataFrame
    for index, row in df.iterrows():
        # Extract text from page
        page_text = extract_text_from_page(pdf_path, [row['Page Number']])
        
        # Paragraph extractor for text from hierarchical code:
        paragraph_text = page_text[0] if page_text else ""  # Get the text of the page, if available
        
        # Find keywords using KeyBERT
        keywords = keybert_model.extract_keywords(paragraph_text)
        
        # Append keywords to list
        keywords_list.append(keywords)
    
    # Add keywords to DataFrame
    df['Keywords'] = keywords_list
    
    return df

## Step 3.
### Filtering keywords, this is required since we get certain probabilities after training of the model and we want to obtain the ones with the reasonable soce

In [11]:
def filter_keywords(keywords_list, threshold=0.5):
    filtered_keywords = [keyword for keyword, score in keywords_list if score > threshold]
    return filtered_keywords

In [14]:
def filter_keywords_in_df(df, threshold=0.5):
    filtered_keyword = []
    for keyword_list in df['Keywords']:
        filtered_keyword_list = filter_keywords(keyword_list, threshold)
        filtered_keyword.append(filtered_keyword_list)
    df['Filtered_Keywords'] = filtered_keyword
    return  df

# For OS Book:

In [7]:
os_book_path = './../dataset/pdf/OS_Main book.pdf'

In [10]:
document_outline_df = extract_document_outline(os_book_path)
## This can take around 25-30 mins fyi:
add_keywords_to_df(document_outline_df, os_book_path)
document_outline_df

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/sentence-transformers/distilbert-base-nli-mean-tokens/resolve/main/tokenizer.json: HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out.
Trying to resume download...


tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7133c5e92830>>
Traceback (most recent call last):
  File "/home/malik/Documents/CENG/Forth Year/Second Semester/CENG416/416-Syllabus-Generator/.venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


Unnamed: 0,Level,Title,Page Number,Parent Title,Keywords
0,0,Cover,1,,[]
1,0,Title Page,5,Cover,"[(8ftunjotufs, 0.4966), (6ojwfstjuz, 0.4865), ..."
2,0,Copyright,6,Title Page,"[(9781119320913, 0.3382), (ebook, 0.3375), (co..."
3,0,Preface,9,Copyright,[(wealsoincludeanoverviewofthefundamentaldatas...
4,0,Contents,23,Preface,"[(kerneldatastructures, 0.4298), (systemdebugg..."
...,...,...,...,...,...
905,2,Bibliography,1191,Further Reading,[(machprovidesgreatflexibilityinthedesignandim...
906,0,Credits,1193,PART TEN APPENDICES,"[(circlecopyrt1993, 0.5017), (circlecopyrt2002..."
907,0,Index,1195,Credits,"[(745mandatory, 0.4708), (364, 0.4406), (549, ..."
908,0,Glossary,1237,Index,"[(150, 0.3667), (50, 0.3471), (randomization, ..."


## Note: Cover, title page, copyright, preface and contents are not really useful for the glossary required for a syllabus
### So drop them:

In [12]:
## Remove cover, title page, copyright, preface, content
indices_to_drop = [0, 1, 2, 3, 4]
document_outline_df = document_outline_df.drop(indices_to_drop)
document_outline_df

Unnamed: 0,Level,Title,Page Number,Parent Title,Keywords
5,0,PART ONE OVERVIEW,29,Contents,"[(software, 0.4914), (computer, 0.4831), (hard..."
6,1,Chapter 1 Introduction,31,PART ONE OVERVIEW,"[(personalcomputers, 0.5907), (enterprisecompu..."
7,2,1.1 What Operating Systems Do,32,Chapter 1 Introduction,"[(overallcomputersystem, 0.7), (hardwareandcoo..."
8,3,1.1.1 User View,32,1.1 What Operating Systems Do,"[(overallcomputersystem, 0.7), (hardwareandcoo..."
9,3,1.1.2 System View,33,1.1.1 User View,[(manyusersinteractwithmobiledevicessuchassmar...
...,...,...,...,...,...
905,2,Bibliography,1191,Further Reading,[(machprovidesgreatflexibilityinthedesignandim...
906,0,Credits,1193,PART TEN APPENDICES,"[(circlecopyrt1993, 0.5017), (circlecopyrt2002..."
907,0,Index,1195,Credits,"[(745mandatory, 0.4708), (364, 0.4406), (549, ..."
908,0,Glossary,1237,Index,"[(150, 0.3667), (50, 0.3471), (randomization, ..."


## Selecting the threshold: 
### By looking at the keywords from section, and viewing the book, the words that seem to be seen as keywords are the ones with higher probability that 0.45 - 0.48, so experiment is made on the lower bound i.e 0.45

In [15]:
filter_keywords_in_df(document_outline_df, 0.45)
document_outline_df

Unnamed: 0,Level,Title,Page Number,Parent Title,Keywords,Filtered_Keywords
5,0,PART ONE OVERVIEW,29,Contents,"[(software, 0.4914), (computer, 0.4831), (hard...","[software, computer, hardware]"
6,1,Chapter 1 Introduction,31,PART ONE OVERVIEW,"[(personalcomputers, 0.5907), (enterprisecompu...","[personalcomputers, enterprisecomputers, inord..."
7,2,1.1 What Operating Systems Do,32,Chapter 1 Introduction,"[(overallcomputersystem, 0.7), (hardwareandcoo...","[overallcomputersystem, hardwareandcoordinates..."
8,3,1.1.1 User View,32,1.1 What Operating Systems Do,"[(overallcomputersystem, 0.7), (hardwareandcoo...","[overallcomputersystem, hardwareandcoordinates..."
9,3,1.1.2 System View,33,1.1.1 User View,[(manyusersinteractwithmobiledevicessuchassmar...,[manyusersinteractwithmobiledevicessuchassmart...
...,...,...,...,...,...,...
905,2,Bibliography,1191,Further Reading,[(machprovidesgreatflexibilityinthedesignandim...,[machprovidesgreatflexibilityinthedesignandimp...
906,0,Credits,1193,PART TEN APPENDICES,"[(circlecopyrt1993, 0.5017), (circlecopyrt2002...","[circlecopyrt1993, circlecopyrt2002]"
907,0,Index,1195,Credits,"[(745mandatory, 0.4708), (364, 0.4406), (549, ...",[745mandatory]
908,0,Glossary,1237,Index,"[(150, 0.3667), (50, 0.3471), (randomization, ...",[]


## Finally, keep it as csv for future references

In [17]:
document_outline_df.to_csv("OS_glossary_dataframe.csv", index=False)