# Create dataset

In [1]:
import PyPDF2
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
def extract_document_outline(pdf_path):
    # Open the PDF file
    with open(pdf_path, 'rb') as file:
        # Create a PdfFileReader object
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Get the number of pages in the PDF
        num_pages = len(pdf_reader.pages)
        
        # Get the document outline
        document_outline = pdf_reader.outline
        
        # Initialize lists to store levels, titles, page numbers, and parent titles
        levels = []
        titles = []
        page_numbers = []
        parent_titles = []
        
        # Define a recursive function to traverse the document outline
        def traverse_outline(outline_items, level=0, parent_title=None):
            for item in outline_items:
                if isinstance(item, list):
                    # Handle nested outlines
                    traverse_outline(item, level + 1, parent_title)
                elif isinstance(item, dict):
                    # Extract title and page number
                    title = item.get('/Title')
                    page_number = item.get('/Page')
                    
                    if title and page_number:
                        # Append to the lists
                        levels.append(level)
                        titles.append(title)
                        page_numbers.append(pdf_reader.get_page_number(page_number) + 1)  # Adjust index
                        parent_titles.append(parent_title)
                    
                    # Update parent title for child items
                    parent_title = title
                    
        # Start traversing the document outline
        traverse_outline(document_outline)
        
        # Create a DataFrame from the extracted data
        df = pd.DataFrame({
            'Level': levels,
            'Title': titles,
            'Page Number': page_numbers,
            'Parent Title': parent_titles
        })
        
        return df

In [50]:
def extract_text_from_page(pdf_path, page_numbers):
    # Open the PDF file
    with open(pdf_path, 'rb') as file:
        # Create a PdfFileReader object
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Initialize list to store text
        all_text = []
        
        # Extract text from specified page range
        for page_num in page_numbers:
            page = pdf_reader.pages[page_num - 1]  # Adjust index
            text = page.extract_text()
            all_text.append(text)
        
        return all_text

In [51]:
def add_description(df, pdf_path):
    # Extract page numbers from the DataFrame
    page_numbers = df['Page Number'].tolist()
    
    # Extract text from each page
    all_text = extract_text_from_page(pdf_path, page_numbers)
    
    # Add description column to the DataFrame
    df['Description'] = all_text
    
    return df

In [27]:
os_book_path = './../dataset/pdf/OS_Main book.pdf'

In [45]:
document_outline_df = extract_document_outline(os_book_path)

In [46]:
document_outline_df

Unnamed: 0,Level,Title,Page Number,Parent Title
0,0,Cover,1,
1,0,Title Page,5,Cover
2,0,Copyright,6,Title Page
3,0,Preface,9,Copyright
4,0,Contents,23,Preface
...,...,...,...,...
905,2,Bibliography,1191,Further Reading
906,0,Credits,1193,PART TEN APPENDICES
907,0,Index,1195,Credits
908,0,Glossary,1237,Index


In [52]:
document_outline_df = add_description(document_outline_df, os_book_path)

In [53]:
document_outline_df

Unnamed: 0,Level,Title,Page Number,Parent Title,Description
0,0,Cover,1,,
1,0,Title Page,5,Cover,OPERATING\nSYSTEM\nCONCEPTS\nABRAHAM SILBERSCH...
2,0,Copyright,6,Title Page,Publisher Laurie Rosatone \nEditorial Direc...
3,0,Preface,9,Copyright,Preface\nOperating systems are an essential pa...
4,0,Contents,23,Preface,Contents\nPART ONE\n OVERVIEW\nChapter 1 Intro...
...,...,...,...,...,...
905,2,Bibliography,1191,Further Reading,Further Reading 25\nMach uses lightweight proc...
906,0,Credits,1193,PART TEN APPENDICES,Credits\n•Figure 1.14: From Hennesy and Patter...
907,0,Index,1195,Credits,"Index\n4-byte pages, 363, 364\n32-byte memory,..."
908,0,Glossary,1237,Index,G-150-percent rule A statistical fi nding th...


In [55]:
document_outline_df.to_csv('document_outline.csv', index=False)

## Data Perprocessing:

In [1]:
from pathlib import Path
import pandas as pd
data_path = Path("./document_outline.csv")
data_df = pd.read_csv(data_path)

In [2]:
import regex

def preprocess_df(input_df, X, Y):
    df = input_df.copy()
    
    df['Description'] = input_df[X].apply(lambda row: ' '.join(row.map(str)), axis=1)
    
    pattern = regex.compile(r'\W+', regex.UNICODE)
    df['Description'] = df['Description'].apply(lambda x: pattern.sub(' ', str(x)))

    df['cat1'] = ''
    df['cat2'] = ''
    df['cat3'] = ''
    
    for index, row in df.iterrows():
        if row['Level'] == 0:
            df.at[index, 'cat1'] = row['Title']
        elif row['Level'] == 1:
            parent_title = df.loc[df.index < index, 'Title'].iloc[-1] 
            df.at[index, 'cat1'] = parent_title
            df.at[index, 'cat2'] = row['Title']
        elif row['Level'] == 2:
            parent_title_level1 = df.loc[df.index < index, 'Title'].iloc[-2] 
            parent_title_level2 = df.loc[df.index < index, 'Title'].iloc[-1]  
            df.at[index, 'cat1'] = parent_title_level1
            df.at[index, 'cat2'] = parent_title_level2
            df.at[index, 'cat3'] = row['Title']
    
    return df[['Description', 'cat1', 'cat2', 'cat3']]


In [3]:
data_df.head()

Unnamed: 0,Level,Title,Page Number,Parent Title,Description
0,0,Cover,1,,
1,0,Title Page,5,Cover,OPERATING\nSYSTEM\nCONCEPTS\nABRAHAM SILBERSCH...
2,0,Copyright,6,Title Page,Publisher Laurie Rosatone \nEditorial Direc...
3,0,Preface,9,Copyright,Preface\nOperating systems are an essential pa...
4,0,Contents,23,Preface,Contents\nPART ONE\n OVERVIEW\nChapter 1 Intro...


In [4]:
X = ["Description"]
Y = ["Title", "Parent Title", "Level"]
preprocessed_df = preprocess_df(data_df, X, Y)
preprocessed_df.to_csv("preprocessed_data.csv")
preprocessed_df

Unnamed: 0,Description,cat1,cat2,cat3
0,,Cover,,
1,OPERATING SYSTEM CONCEPTS ABRAHAM SILBERSCHATZ...,Title Page,,
2,Publisher Laurie Rosatone Editorial Director D...,Copyright,,
3,Preface Operating systems are an essential par...,Preface,,
4,Contents PART ONE OVERVIEW Chapter 1 Introduct...,Contents,,
...,...,...,...,...
905,Further Reading 25 Mach uses lightweight proce...,D.8 Summary,Further Reading,Bibliography
906,Credits Figure 1 14 From Hennesy and Patterson...,Credits,,
907,Index 4 byte pages 363 364 32 byte memory 363 ...,Index,,
908,G 150 percent rule A statistical fi nding that...,Glossary,,


In [5]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
language_stop_words = stopwords.words('english')

vectorizer = TfidfVectorizer()
vectorizer.fit(preprocessed_df['Description'].apply(lambda x: np.str_(x)))
X_train = vectorizer.transform(preprocessed_df['Description'].apply(lambda x: np.str_(x)))

oe = OrdinalEncoder()
Y_columns = ['cat1', 'cat2', 'cat3']
Y_train = oe.fit_transform(preprocessed_df[Y_columns].values)

print('X training shape', X_train.shape, X_train.dtype)
print('Y training shape', Y_train.shape, Y_train.dtype)

X training shape (910, 31684) float64
Y training shape (910, 3) float64


## Analyzing dataset

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns

### Heat Map:
**use**:  visualizing the correlation between the categories (cat1, cat2, cat3).

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def plot_category_relations(df):
    # Perform one-hot encoding on categorical variables
    encoded_df = pd.get_dummies(df[['cat1', 'cat2', 'cat3']], drop_first=True)
    
    plt.figure(figsize=(10, 8))
    sns.heatmap(encoded_df.corr(), annot=True, cmap='coolwarm', linewidths=.5)
    plt.title('Correlation Heatmap after One-Hot Encoding')
    plt.show()

plot_category_relations(preprocessed_df)

KeyboardInterrupt: 

Error in callback <function _draw_all_if_interactive at 0x7181eb419ea0> (for post_execute), with arguments args (),kwargs {}:


KeyboardInterrupt: 

Error in callback <function flush_figures at 0x7181d638ae60> (for post_execute), with arguments args (),kwargs {}:


KeyboardInterrupt: 

## Classification

In [57]:
from sklearn.multioutput import ClassifierChain
from sklearn.linear_model import SGDClassifier

clf=ClassifierChain(SGDClassifier(random_state=0, class_weight='balanced', n_jobs=-1))
clf.fit(X_train, Y_train)

## Evaluation

In [59]:
from sklearn.metrics import jaccard_score, f1_score, make_scorer

def concat_categories_level2(Y):
    return np.apply_along_axis(lambda a: str(a[0]) + '-' + str(a[1]), 1, Y)

def concat_categories_level3(Y):
    return np.apply_along_axis(lambda a: str(a[0]) + '-' + str(a[1]+'-'+ str(a[2])), 1, Y)

def js_0(y, y_pred, **kwargs):
    return jaccard_score(y[:,0], y_pred[:,0], average='micro')

def js_1(y, y_pred, **kwargs):
    return jaccard_score(y[:,1], y_pred[:,1], average='micro')

def f1_0(y, y_pred, **kwargs):
    return f1_score(y[:,0], y_pred[:,0], average='micro')

def f1_1(y, y_pred, **kwargs):
    return f1_score(y[:,1], y_pred[:,1], average='micro')

# Score for predicting 'category_1-category_2' (concatenated strings)
def js_01(y, y_pred, **kwargs):
    return jaccard_score(concat_categories_level2(y), concat_categories_level2(y_pred), average='micro')

def f1_01(y, y_pred, **kwargs):
    return f1_score(concat_categories_level2(y), concat_categories_level2(y_pred), average='micro')

def js_001(y, y_pred, **kwargs):
    return jaccard_score(concat_categories_level3(y), concat_categories_level3(y_pred), average='micro')

def f1_001(y, y_pred, **kwargs):
    return f1_score(concat_categories_level3(y), concat_categories_level3(y_pred), average='micro')

js_0_scorer = make_scorer(score_func=js_0, greater_is_better=True)
js_1_scorer = make_scorer(score_func=js_1, greater_is_better=True)
js_01_scorer = make_scorer(score_func=js_01, greater_is_better=True)
js_001_scorer = make_scorer(score_func=js_001, greater_is_better=True)
f1_0_scorer = make_scorer(score_func=f1_0, greater_is_better=True)
f1_1_scorer = make_scorer(score_func=f1_1, greater_is_better=True)
f1_01_scorer = make_scorer(score_func=f1_01, greater_is_better=True)
f1_001_scorer = make_scorer(score_func=f1_001, greater_is_better=True)