# Create dataset

In [1]:
import PyPDF2
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def extract_document_outline(pdf_path):
    # Open the PDF file
    with open(pdf_path, 'rb') as file:
        # Create a PdfFileReader object
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Get the number of pages in the PDF
        num_pages = len(pdf_reader.pages)
        
        # Get the document outline
        document_outline = pdf_reader.outline
        
        # Initialize lists to store levels, titles, page numbers, and parent titles
        levels = []
        titles = []
        page_numbers = []
        parent_titles = []
        
        # Define a recursive function to traverse the document outline
        def traverse_outline(outline_items, level=0, parent_title=None):
            for item in outline_items:
                if isinstance(item, list):
                    # Handle nested outlines
                    traverse_outline(item, level + 1, parent_title)
                elif isinstance(item, dict):
                    # Extract title and page number
                    title = item.get('/Title')
                    page_number = item.get('/Page')
                    
                    if title and page_number:
                        # Append to the lists
                        levels.append(level)
                        titles.append(title)
                        page_numbers.append(pdf_reader.get_page_number(page_number) + 1)  # Adjust index
                        parent_titles.append(parent_title)
                    
                    # Update parent title for child items
                    parent_title = title
                    
        # Start traversing the document outline
        traverse_outline(document_outline)
        
        # Create a DataFrame from the extracted data
        df = pd.DataFrame({
            'Level': levels,
            'Title': titles,
            'Page Number': page_numbers,
            'Parent Title': parent_titles
        })
        
        return df

In [3]:
def extract_text_from_page(pdf_path, page_numbers):
    # Open the PDF file
    with open(pdf_path, 'rb') as file:
        # Create a PdfFileReader object
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Initialize list to store text
        all_text = []
        
        # Extract text from specified page range
        for page_num in page_numbers:
            page = pdf_reader.pages[page_num - 1]  # Adjust index
            text = page.extract_text()
            all_text.append(text)
        
        return all_text

In [4]:
def add_description(df, df2, pdf_path):
    # Extract page numbers from the DataFrame
    page_numbers = df['Page Number'].tolist()
    
    # Extract text from each page
    all_text = extract_text_from_page(pdf_path, page_numbers)
    train_text = []
    test_text = []
    for i in range(len(all_text)):
        total_length = len(all_text[i])
        train_text_entry = all_text[i][:int(0.8*total_length)]
        test_text_entry = all_text[i][int(0.8*total_length):]
        train_text.append(train_text_entry)
        test_text.append(test_text_entry)
    # Add description column to the DataFrame
    df['Description'] = train_text
    df2['Description'] = test_text
    
    return df, df2

In [5]:
book_path = '../dataset/pdf/logic.pdf'

In [6]:
document_outline_df = extract_document_outline(book_path)
document_outline_df_2 = document_outline_df.copy()

In [7]:
document_outline_df_2

Unnamed: 0,Level,Title,Page Number,Parent Title
0,0,Logic and Computer Design Fundamentals,1,
1,0,Logic and Computer Design Fundamentals.pdf,2,Logic and Computer Design Fundamentals
2,1,Chapter 1,2,Logic and Computer Design Fundamentals.pdf
3,2,1-3*,2,Chapter 1
4,2,1-7*,2,1-3*
5,2,1-9*,2,1-7*
6,2,1-10*,2,1-9*
7,2,1-11*,2,1-10*
8,2,1-16*,2,1-11*
9,2,1-18*,3,1-16*


In [8]:
document_outline_df

Unnamed: 0,Level,Title,Page Number,Parent Title
0,0,Logic and Computer Design Fundamentals,1,
1,0,Logic and Computer Design Fundamentals.pdf,2,Logic and Computer Design Fundamentals
2,1,Chapter 1,2,Logic and Computer Design Fundamentals.pdf
3,2,1-3*,2,Chapter 1
4,2,1-7*,2,1-3*
5,2,1-9*,2,1-7*
6,2,1-10*,2,1-9*
7,2,1-11*,2,1-10*
8,2,1-16*,2,1-11*
9,2,1-18*,3,1-16*


In [9]:
document_outline_df, document_outline_df_2 = add_description(document_outline_df, document_outline_df_2, book_path)

In [10]:
document_outline_df

Unnamed: 0,Level,Title,Page Number,Parent Title,Description
0,0,Logic and Computer Design Fundamentals,1,,
1,0,Logic and Computer Design Fundamentals.pdf,2,Logic and Computer Design Fundamentals,1 Solutions to Problems Marked with a * in\nLo...
2,1,Chapter 1,2,Logic and Computer Design Fundamentals.pdf,1 Solutions to Problems Marked with a * in\nLo...
3,2,1-3*,2,Chapter 1,1 Solutions to Problems Marked with a * in\nLo...
4,2,1-7*,2,1-3*,1 Solutions to Problems Marked with a * in\nLo...
5,2,1-9*,2,1-7*,1 Solutions to Problems Marked with a * in\nLo...
6,2,1-10*,2,1-9*,1 Solutions to Problems Marked with a * in\nLo...
7,2,1-11*,2,1-10*,1 Solutions to Problems Marked with a * in\nLo...
8,2,1-16*,2,1-11*,1 Solutions to Problems Marked with a * in\nLo...
9,2,1-18*,3,1-16*,2 ...


In [11]:
document_outline_df_2

Unnamed: 0,Level,Title,Page Number,Parent Title,Description
0,0,Logic and Computer Design Fundamentals,1,,
1,0,Logic and Computer Design Fundamentals.pdf,2,Logic and Computer Design Fundamentals,75.5 BD.A214.625 11010110.101 326.5 D6.A\n6240...
2,1,Chapter 1,2,Logic and Computer Design Fundamentals.pdf,75.5 BD.A214.625 11010110.101 326.5 D6.A\n6240...
3,2,1-3*,2,Chapter 1,75.5 BD.A214.625 11010110.101 326.5 D6.A\n6240...
4,2,1-7*,2,1-3*,75.5 BD.A214.625 11010110.101 326.5 D6.A\n6240...
5,2,1-9*,2,1-7*,75.5 BD.A214.625 11010110.101 326.5 D6.A\n6240...
6,2,1-10*,2,1-9*,75.5 BD.A214.625 11010110.101 326.5 D6.A\n6240...
7,2,1-11*,2,1-10*,75.5 BD.A214.625 11010110.101 326.5 D6.A\n6240...
8,2,1-16*,2,1-11*,75.5 BD.A214.625 11010110.101 326.5 D6.A\n6240...
9,2,1-18*,3,1-16*,10 0100 1101\nMove R 001 0010 01101\nMove R 00...


In [12]:
document_outline_df.to_csv('document_outline.csv', index=False)

In [13]:
document_outline_df_2.to_csv('document_outline_2.csv', index=False)

## Data Preprocessing:

In [14]:
from pathlib import Path
import pandas as pd
train_path = Path("./document_outline.csv")
valid_path = Path("./document_outline_2.csv")
train_df = pd.read_csv(train_path)
valid_df = pd.read_csv(valid_path)

In [15]:
import regex

def preprocess_df(input_df, X, Y):
    df = input_df.copy()
    
    df['Description'] = input_df[X].apply(lambda row: ' '.join(row.map(str)), axis=1)
    
    pattern = regex.compile(r'\W+', regex.UNICODE)
    df['Description'] = df['Description'].apply(lambda x: pattern.sub(' ', str(x)))

    df['cat1'] = ''
    df['cat2'] = ''
    df['cat3'] = ''
    
    for index, row in df.iterrows():
        if row['Level'] == 0:
            df.at[index, 'cat1'] = row['Title']
        elif row['Level'] == 1:
            parent_title = df.loc[df.index < index, 'Title'].iloc[-1] 
            df.at[index, 'cat1'] = parent_title
            df.at[index, 'cat2'] = row['Title']
        elif row['Level'] == 2:
            parent_title_level1 = df.loc[df.index < index, 'Title'].iloc[-2] 
            parent_title_level2 = df.loc[df.index < index, 'Title'].iloc[-1]  
            df.at[index, 'cat1'] = parent_title_level1
            df.at[index, 'cat2'] = parent_title_level2
            df.at[index, 'cat3'] = row['Title']
    
    return df[['Description', 'cat1', 'cat2', 'cat3']]


In [16]:
train_df.head()

Unnamed: 0,Level,Title,Page Number,Parent Title,Description
0,0,Logic and Computer Design Fundamentals,1,,
1,0,Logic and Computer Design Fundamentals.pdf,2,Logic and Computer Design Fundamentals,1 Solutions to Problems Marked with a * in\nLo...
2,1,Chapter 1,2,Logic and Computer Design Fundamentals.pdf,1 Solutions to Problems Marked with a * in\nLo...
3,2,1-3*,2,Chapter 1,1 Solutions to Problems Marked with a * in\nLo...
4,2,1-7*,2,1-3*,1 Solutions to Problems Marked with a * in\nLo...


In [17]:
X = ["Description"]
Y = ["Title", "Parent Title", "Level"]
preprocessed_train_df = preprocess_df(train_df, X, Y)
preprocessed_train_df.to_csv("preprocessed_train.csv")
preprocessed_valid_df = preprocess_df(valid_df, X, Y)
preprocessed_valid_df.to_csv("preprocessed_valid.csv")

In [18]:
train_df

Unnamed: 0,Level,Title,Page Number,Parent Title,Description
0,0,Logic and Computer Design Fundamentals,1,,
1,0,Logic and Computer Design Fundamentals.pdf,2,Logic and Computer Design Fundamentals,1 Solutions to Problems Marked with a * in\nLo...
2,1,Chapter 1,2,Logic and Computer Design Fundamentals.pdf,1 Solutions to Problems Marked with a * in\nLo...
3,2,1-3*,2,Chapter 1,1 Solutions to Problems Marked with a * in\nLo...
4,2,1-7*,2,1-3*,1 Solutions to Problems Marked with a * in\nLo...
5,2,1-9*,2,1-7*,1 Solutions to Problems Marked with a * in\nLo...
6,2,1-10*,2,1-9*,1 Solutions to Problems Marked with a * in\nLo...
7,2,1-11*,2,1-10*,1 Solutions to Problems Marked with a * in\nLo...
8,2,1-16*,2,1-11*,1 Solutions to Problems Marked with a * in\nLo...
9,2,1-18*,3,1-16*,2 ...


In [19]:
valid_df

Unnamed: 0,Level,Title,Page Number,Parent Title,Description
0,0,Logic and Computer Design Fundamentals,1,,
1,0,Logic and Computer Design Fundamentals.pdf,2,Logic and Computer Design Fundamentals,75.5 BD.A214.625 11010110.101 326.5 D6.A\n6240...
2,1,Chapter 1,2,Logic and Computer Design Fundamentals.pdf,75.5 BD.A214.625 11010110.101 326.5 D6.A\n6240...
3,2,1-3*,2,Chapter 1,75.5 BD.A214.625 11010110.101 326.5 D6.A\n6240...
4,2,1-7*,2,1-3*,75.5 BD.A214.625 11010110.101 326.5 D6.A\n6240...
5,2,1-9*,2,1-7*,75.5 BD.A214.625 11010110.101 326.5 D6.A\n6240...
6,2,1-10*,2,1-9*,75.5 BD.A214.625 11010110.101 326.5 D6.A\n6240...
7,2,1-11*,2,1-10*,75.5 BD.A214.625 11010110.101 326.5 D6.A\n6240...
8,2,1-16*,2,1-11*,75.5 BD.A214.625 11010110.101 326.5 D6.A\n6240...
9,2,1-18*,3,1-16*,10 0100 1101\nMove R 001 0010 01101\nMove R 00...


In [20]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

language_stop_words = stopwords.words('english')

vectorizer = TfidfVectorizer()
vectorizer.fit(preprocessed_train_df['Description'].apply(lambda x: np.str_(x)))
X_train = vectorizer.transform(preprocessed_train_df['Description'].apply(lambda x: np.str_(x)))

X_valid = vectorizer.transform(preprocessed_valid_df['Description'].apply(lambda x: np.str_(x)))


oe = OrdinalEncoder()
Y_columns = ['cat1', 'cat2', 'cat3']
oe.fit(preprocessed_train_df[Y_columns].values)
Y_train = oe.transform(preprocessed_train_df[Y_columns].values)
Y_valid = oe.transform(preprocessed_valid_df[Y_columns].values)

print('X training shape', X_train.shape, X_train.dtype)
print('Y training shape', Y_train.shape, Y_train.dtype)
print('X Valid shape', X_valid.shape, X_valid.dtype)
print('Y Valid shape', Y_train.shape, Y_valid.dtype)

X training shape (50, 715) float64
Y training shape (50, 3) float64
X Valid shape (50, 715) float64
Y Valid shape (50, 3) float64


## Analyzing dataset

In [91]:
import matplotlib.pyplot as plt
import seaborn as sns

### Heat Map:
**use**:  visualizing the correlation between the categories (cat1, cat2, cat3).

## Classification

In [21]:
from sklearn.multioutput import ClassifierChain
from sklearn.linear_model import SGDClassifier


clf=ClassifierChain(SGDClassifier(random_state=0, class_weight='balanced', n_jobs=-1))
clf.fit(X_train, Y_train)

## Evaluation

In [22]:
from sklearn.metrics import jaccard_score, f1_score, make_scorer

def concat_categories_level2(Y):
    return np.apply_along_axis(lambda a: str(a[0]) + '-' + str(a[1]), 1, Y)

def concat_categories_level3(Y):
    return np.apply_along_axis(lambda a: str(int(a[0])) + '-' + str(int(a[1])) + '-' + str(int(a[2])), 1, Y)

def js_0(y, y_pred, **kwargs):
    return jaccard_score(y[:,0], y_pred[:,0], average='micro')

def js_1(y, y_pred, **kwargs):
    return jaccard_score(y[:,1], y_pred[:,1], average='micro')

def f1_0(y, y_pred, **kwargs):
    return f1_score(y[:,0], y_pred[:,0], average='micro')

def f1_1(y, y_pred, **kwargs):
    return f1_score(y[:,1], y_pred[:,1], average='micro')

# Score for predicting 'category_1-category_2' (concatenated strings)
def js_01(y, y_pred, **kwargs):
    return jaccard_score(concat_categories_level2(y), concat_categories_level2(y_pred), average='micro')

def f1_01(y, y_pred, **kwargs):
    return f1_score(concat_categories_level2(y), concat_categories_level2(y_pred), average='micro')

def js_001(y, y_pred, **kwargs):
    return jaccard_score(concat_categories_level3(y), concat_categories_level3(y_pred), average='micro')

def f1_001(y, y_pred, **kwargs):
    return f1_score(concat_categories_level3(y), concat_categories_level3(y_pred), average='micro')

js_0_scorer = make_scorer(score_func=js_0, greater_is_better=True)
js_1_scorer = make_scorer(score_func=js_1, greater_is_better=True)
js_01_scorer = make_scorer(score_func=js_01, greater_is_better=True)
js_001_scorer = make_scorer(score_func=js_001, greater_is_better=True)
f1_0_scorer = make_scorer(score_func=f1_0, greater_is_better=True)
f1_1_scorer = make_scorer(score_func=f1_1, greater_is_better=True)
f1_01_scorer = make_scorer(score_func=f1_01, greater_is_better=True)
f1_001_scorer = make_scorer(score_func=f1_001, greater_is_better=True)

In [23]:
Y_pred = clf.predict(X_valid)

In [24]:
print('For both Level 1 and Level 2  concatenated:\n\tF1 micro (=accuracy): {}'.format(f1_01(Y_valid,Y_pred).round(5)))

For both Level 1 and Level 2  concatenated:
	F1 micro (=accuracy): 0.0


In [25]:
print('Just the Level 1:\n\tF1 micro (=accuracy): {}'.format(f1_0(Y_valid,Y_pred).round(3)))

Just the Level 1:
	F1 micro (=accuracy): 0.2


In [26]:
print('Just the Level 2:\n\tF1 micro (=accuracy): {}'.format(f1_1(Y_valid,Y_pred).round(3)))

Just the Level 2:
	F1 micro (=accuracy): 0.06


In [27]:
print('Level 1 and 2 and 3 concatenated:\n\tF1 micro (=accuracy): {}'.format(f1_001(Y_valid,Y_pred).round(3)))

Level 1 and 2 and 3 concatenated:
	F1 micro (=accuracy): 0.0


In [28]:
print('For both Level 1 and Level 2  concatenated:\n\tJaccard score micro (=accuracy): {}'.format(js_01(Y_valid,Y_pred).round(5)))

For both Level 1 and Level 2  concatenated:
	Jaccard score micro (=accuracy): 0.0


In [29]:
print('For just Level 1:\n\tJaccard score micro (=accuracy): {}'.format(js_0(Y_valid,Y_pred).round(5)))

For just Level 1:
	Jaccard score micro (=accuracy): 0.11111


In [30]:
print('For just Level 2:\n\tJaccard score micro (=accuracy): {}'.format(js_1(Y_valid,Y_pred).round(5)))

For just Level 2:
	Jaccard score micro (=accuracy): 0.03093


In [31]:
print('Level 1 and 2 concatenated:\n\tJaccard score micro (=accuracy): {}'.format(js_01(Y_valid,Y_pred).round(5)))

Level 1 and 2 concatenated:
	Jaccard score micro (=accuracy): 0.0


In [32]:
print('For Levels 1 and 2 and 3 concatenated:\n\tJaccard score micro (=accuracy): {}'.format(js_001(Y_valid,Y_pred).round(5)))

For Levels 1 and 2 and 3 concatenated:
	Jaccard score micro (=accuracy): 0.0
