# Create dataset

In [2]:
import PyPDF2
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
def extract_document_outline(pdf_path):
    # Open the PDF file
    with open(pdf_path, 'rb') as file:
        # Create a PdfFileReader object
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Get the number of pages in the PDF
        num_pages = len(pdf_reader.pages)
        
        # Get the document outline
        document_outline = pdf_reader.outline
        
        # Initialize lists to store levels, titles, page numbers, and parent titles
        levels = []
        titles = []
        page_numbers = []
        parent_titles = []
        
        # Define a recursive function to traverse the document outline
        def traverse_outline(outline_items, level=0, parent_title=None):
            for item in outline_items:
                if isinstance(item, list):
                    # Handle nested outlines
                    traverse_outline(item, level + 1, parent_title)
                elif isinstance(item, dict):
                    # Extract title and page number
                    title = item.get('/Title')
                    page_number = item.get('/Page')
                    
                    if title and page_number:
                        # Append to the lists
                        levels.append(level)
                        titles.append(title)
                        page_numbers.append(pdf_reader.get_page_number(page_number) + 1)  # Adjust index
                        parent_titles.append(parent_title)
                    
                    # Update parent title for child items
                    parent_title = title
                    
        # Start traversing the document outline
        traverse_outline(document_outline)
        
        # Create a DataFrame from the extracted data
        df = pd.DataFrame({
            'Level': levels,
            'Title': titles,
            'Page Number': page_numbers,
            'Parent Title': parent_titles
        })
        
        return df

In [4]:
def extract_text_from_page(pdf_path, page_numbers):
    # Open the PDF file
    with open(pdf_path, 'rb') as file:
        # Create a PdfFileReader object
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Initialize list to store text
        all_text = []
        
        # Extract text from specified page range
        for page_num in page_numbers:
            page = pdf_reader.pages[page_num - 1]  # Adjust index
            text = page.extract_text()
            all_text.append(text)
        
        return all_text

In [5]:
def add_description(df, df2, pdf_path):
    # Extract page numbers from the DataFrame
    page_numbers = df['Page Number'].tolist()
    
    # Extract text from each page
    all_text = extract_text_from_page(pdf_path, page_numbers)
    train_text = []
    test_text = []
    for i in range(len(all_text)):
        total_length = len(all_text[i])
        train_text_entry = all_text[i][:int(0.8*total_length)]
        test_text_entry = all_text[i][int(0.8*total_length):]
        train_text.append(train_text_entry)
        test_text.append(test_text_entry)
    # Add description column to the DataFrame
    df['Description'] = train_text
    df2['Description'] = test_text
    
    return df, df2

In [7]:
book_path = '../dataset/pdf/network_1.pdf'

In [8]:
document_outline_df = extract_document_outline(book_path)
document_outline_df_2 = document_outline_df.copy()

In [9]:
document_outline_df_2

Unnamed: 0,Level,Title,Page Number,Parent Title
0,0,Computer Networking,2,
1,0,Brief Contents,3,Computer Networking
2,0,Table of Contents,5,Brief Contents
3,0,1. Computer Networks and the Internet,12,Table of Contents
4,1,1.1 What Is the Internet?,13,1. Computer Networks and the Internet
...,...,...,...,...
241,1,Homework Problems and Questions,691,8.10 Summary
242,1,Wireshark Lab: SSL,699,Homework Problems and Questions
243,1,IPsec Lab,699,Wireshark Lab: SSL
244,0,References,702,8. Security in Computer Networks


In [10]:
document_outline_df

Unnamed: 0,Level,Title,Page Number,Parent Title
0,0,Computer Networking,2,
1,0,Brief Contents,3,Computer Networking
2,0,Table of Contents,5,Brief Contents
3,0,1. Computer Networks and the Internet,12,Table of Contents
4,1,1.1 What Is the Internet?,13,1. Computer Networks and the Internet
...,...,...,...,...
241,1,Homework Problems and Questions,691,8.10 Summary
242,1,Wireshark Lab: SSL,699,Homework Problems and Questions
243,1,IPsec Lab,699,Wireshark Lab: SSL
244,0,References,702,8. Security in Computer Networks


In [11]:
document_outline_df, document_outline_df_2 = add_description(document_outline_df, document_outline_df_2, book_path)

In [12]:
document_outline_df

Unnamed: 0,Level,Title,Page Number,Parent Title,Description
0,0,Computer Networking,2,,"James F. Kurose\nUniversity of Massachusetts, ..."
1,0,Brief Contents,3,Computer Networking,xix1\n81\n181\n303377449531Chapter 1 Computer ...
2,0,Table of Contents,5,Brief Contents,Chapter 1 Computer Networks and the Internet 1...
3,0,1. Computer Networks and the Internet,12,Table of Contents,11\nToday’s Internet is arguably the largest e...
4,1,1.1 What Is the Internet?,13,1. Computer Networks and the Internet,2 CHAPTER 1 • COMPUTER NETWORKS AND THE...
...,...,...,...,...,...
241,1,Homework Problems and Questions,691,8.10 Summary,680 CHAPTER 8 • SECURITY IN COMPUTER NE...
242,1,Wireshark Lab: SSL,699,Homework Problems and Questions,688 CHAPTER 8 • SECURITY IN COMPUTER NE...
243,1,IPsec Lab,699,Wireshark Lab: SSL,688 CHAPTER 8 • SECURITY IN COMPUTER NE...
244,0,References,702,8. Security in Computer Networks,691References\nA note on URLs. In the referenc...


In [13]:
document_outline_df_2

Unnamed: 0,Level,Title,Page Number,Parent Title,Description
0,0,Computer Networking,2,,WORKING\nA Top-Down Approach
1,0,Brief Contents,3,Computer Networking,rences 691\nIndex 731Brief Contents\nA01_KURO1...
2,0,Table of Contents,5,Brief Contents,Internet Explosion: The 1990s 62\n1.7.5 The Ne...
3,0,1. Computer Networks and the Internet,12,Table of Contents,"terminology and concepts, we’ll first examine..."
4,1,1.1 What Is the Internet?,13,1. Computer Networks and the Internet,"onsoles, thermostats, home security systems, h..."
...,...,...,...,...,...
241,1,Homework Problems and Questions,691,8.10 Summary,ppose you want to encrypt the message 10101111...
242,1,Wireshark Lab: SSL,699,Homework Problems and Questions,"net adapters. In the second half of the lab, y..."
243,1,IPsec Lab,699,Wireshark Lab: SSL,"net adapters. In the second half of the lab, y..."
244,0,References,702,8. Security in Computer Networks,"amson, “The Aloha System—Another Alternative f..."


In [14]:
document_outline_df.to_csv('document_outline.csv', index=False)

In [15]:
document_outline_df_2.to_csv('document_outline_2.csv', index=False)

## Data Preprocessing:

In [16]:
from pathlib import Path
import pandas as pd
train_path = Path("./document_outline.csv")
valid_path = Path("./document_outline_2.csv")
train_df = pd.read_csv(train_path)
valid_df = pd.read_csv(valid_path)

In [17]:
import regex

def preprocess_df(input_df, X, Y):
    df = input_df.copy()
    
    df['Description'] = input_df[X].apply(lambda row: ' '.join(row.map(str)), axis=1)
    
    pattern = regex.compile(r'\W+', regex.UNICODE)
    df['Description'] = df['Description'].apply(lambda x: pattern.sub(' ', str(x)))

    df['cat1'] = ''
    df['cat2'] = ''
    df['cat3'] = ''
    
    for index, row in df.iterrows():
        if row['Level'] == 0:
            df.at[index, 'cat1'] = row['Title']
        elif row['Level'] == 1:
            parent_title = df.loc[df.index < index, 'Title'].iloc[-1] 
            df.at[index, 'cat1'] = parent_title
            df.at[index, 'cat2'] = row['Title']
        elif row['Level'] == 2:
            parent_title_level1 = df.loc[df.index < index, 'Title'].iloc[-2] 
            parent_title_level2 = df.loc[df.index < index, 'Title'].iloc[-1]  
            df.at[index, 'cat1'] = parent_title_level1
            df.at[index, 'cat2'] = parent_title_level2
            df.at[index, 'cat3'] = row['Title']
    
    return df[['Description', 'cat1', 'cat2', 'cat3']]


In [18]:
train_df.head()

Unnamed: 0,Level,Title,Page Number,Parent Title,Description
0,0,Computer Networking,2,,"James F. Kurose\nUniversity of Massachusetts, ..."
1,0,Brief Contents,3,Computer Networking,xix1\n81\n181\n303377449531Chapter 1 Computer ...
2,0,Table of Contents,5,Brief Contents,Chapter 1 Computer Networks and the Internet 1...
3,0,1. Computer Networks and the Internet,12,Table of Contents,11\nToday’s Internet is arguably the largest e...
4,1,1.1 What Is the Internet?,13,1. Computer Networks and the Internet,2 CHAPTER 1 • COMPUTER NETWORKS AND THE...


In [19]:
X = ["Description"]
Y = ["Title", "Parent Title", "Level"]
preprocessed_train_df = preprocess_df(train_df, X, Y)
preprocessed_train_df.to_csv("preprocessed_train.csv")
preprocessed_valid_df = preprocess_df(valid_df, X, Y)
preprocessed_valid_df.to_csv("preprocessed_valid.csv")

In [20]:
train_df

Unnamed: 0,Level,Title,Page Number,Parent Title,Description
0,0,Computer Networking,2,,"James F. Kurose\nUniversity of Massachusetts, ..."
1,0,Brief Contents,3,Computer Networking,xix1\n81\n181\n303377449531Chapter 1 Computer ...
2,0,Table of Contents,5,Brief Contents,Chapter 1 Computer Networks and the Internet 1...
3,0,1. Computer Networks and the Internet,12,Table of Contents,11\nToday’s Internet is arguably the largest e...
4,1,1.1 What Is the Internet?,13,1. Computer Networks and the Internet,2 CHAPTER 1 • COMPUTER NETWORKS AND THE...
...,...,...,...,...,...
241,1,Homework Problems and Questions,691,8.10 Summary,680 CHAPTER 8 • SECURITY IN COMPUTER NE...
242,1,Wireshark Lab: SSL,699,Homework Problems and Questions,688 CHAPTER 8 • SECURITY IN COMPUTER NE...
243,1,IPsec Lab,699,Wireshark Lab: SSL,688 CHAPTER 8 • SECURITY IN COMPUTER NE...
244,0,References,702,8. Security in Computer Networks,691References\nA note on URLs. In the referenc...


In [21]:
valid_df

Unnamed: 0,Level,Title,Page Number,Parent Title,Description
0,0,Computer Networking,2,,WORKING\nA Top-Down Approach
1,0,Brief Contents,3,Computer Networking,rences 691\nIndex 731Brief Contents\nA01_KURO1...
2,0,Table of Contents,5,Brief Contents,Internet Explosion: The 1990s 62\n1.7.5 The Ne...
3,0,1. Computer Networks and the Internet,12,Table of Contents,"terminology and concepts, we’ll first examine..."
4,1,1.1 What Is the Internet?,13,1. Computer Networks and the Internet,"onsoles, thermostats, home security systems, h..."
...,...,...,...,...,...
241,1,Homework Problems and Questions,691,8.10 Summary,ppose you want to encrypt the message 10101111...
242,1,Wireshark Lab: SSL,699,Homework Problems and Questions,"net adapters. In the second half of the lab, y..."
243,1,IPsec Lab,699,Wireshark Lab: SSL,"net adapters. In the second half of the lab, y..."
244,0,References,702,8. Security in Computer Networks,"amson, “The Aloha System—Another Alternative f..."


In [22]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

language_stop_words = stopwords.words('english')

vectorizer = TfidfVectorizer()
vectorizer.fit(preprocessed_train_df['Description'].apply(lambda x: np.str_(x)))
X_train = vectorizer.transform(preprocessed_train_df['Description'].apply(lambda x: np.str_(x)))

X_valid = vectorizer.transform(preprocessed_valid_df['Description'].apply(lambda x: np.str_(x)))


oe = OrdinalEncoder()
Y_columns = ['cat1', 'cat2', 'cat3']
oe.fit(preprocessed_train_df[Y_columns].values)
Y_train = oe.transform(preprocessed_train_df[Y_columns].values)
Y_valid = oe.transform(preprocessed_valid_df[Y_columns].values)

print('X training shape', X_train.shape, X_train.dtype)
print('Y training shape', Y_train.shape, Y_train.dtype)
print('X Valid shape', X_valid.shape, X_valid.dtype)
print('Y Valid shape', Y_train.shape, Y_valid.dtype)

X training shape (246, 6352) float64
Y training shape (246, 3) float64
X Valid shape (246, 6352) float64
Y Valid shape (246, 3) float64


## Analyzing dataset

In [91]:
import matplotlib.pyplot as plt
import seaborn as sns

### Heat Map:
**use**:  visualizing the correlation between the categories (cat1, cat2, cat3).

## Classification

In [23]:
from sklearn.multioutput import ClassifierChain
from sklearn.linear_model import SGDClassifier


clf=ClassifierChain(SGDClassifier(random_state=0, class_weight='balanced', n_jobs=-1))
clf.fit(X_train, Y_train)

## Evaluation

In [24]:
from sklearn.metrics import jaccard_score, f1_score, make_scorer

def concat_categories_level2(Y):
    return np.apply_along_axis(lambda a: str(a[0]) + '-' + str(a[1]), 1, Y)

def concat_categories_level3(Y):
    return np.apply_along_axis(lambda a: str(int(a[0])) + '-' + str(int(a[1])) + '-' + str(int(a[2])), 1, Y)

def js_0(y, y_pred, **kwargs):
    return jaccard_score(y[:,0], y_pred[:,0], average='micro')

def js_1(y, y_pred, **kwargs):
    return jaccard_score(y[:,1], y_pred[:,1], average='micro')

def f1_0(y, y_pred, **kwargs):
    return f1_score(y[:,0], y_pred[:,0], average='micro')

def f1_1(y, y_pred, **kwargs):
    return f1_score(y[:,1], y_pred[:,1], average='micro')

# Score for predicting 'category_1-category_2' (concatenated strings)
def js_01(y, y_pred, **kwargs):
    return jaccard_score(concat_categories_level2(y), concat_categories_level2(y_pred), average='micro')

def f1_01(y, y_pred, **kwargs):
    return f1_score(concat_categories_level2(y), concat_categories_level2(y_pred), average='micro')

def js_001(y, y_pred, **kwargs):
    return jaccard_score(concat_categories_level3(y), concat_categories_level3(y_pred), average='micro')

def f1_001(y, y_pred, **kwargs):
    return f1_score(concat_categories_level3(y), concat_categories_level3(y_pred), average='micro')

js_0_scorer = make_scorer(score_func=js_0, greater_is_better=True)
js_1_scorer = make_scorer(score_func=js_1, greater_is_better=True)
js_01_scorer = make_scorer(score_func=js_01, greater_is_better=True)
js_001_scorer = make_scorer(score_func=js_001, greater_is_better=True)
f1_0_scorer = make_scorer(score_func=f1_0, greater_is_better=True)
f1_1_scorer = make_scorer(score_func=f1_1, greater_is_better=True)
f1_01_scorer = make_scorer(score_func=f1_01, greater_is_better=True)
f1_001_scorer = make_scorer(score_func=f1_001, greater_is_better=True)

In [25]:
Y_pred = clf.predict(X_valid)

In [26]:
print('For both Level 1 and Level 2  concatenated:\n\tF1 micro (=accuracy): {}'.format(f1_01(Y_valid,Y_pred).round(5)))

For both Level 1 and Level 2  concatenated:
	F1 micro (=accuracy): 0.0


In [27]:
print('Just the Level 1:\n\tF1 micro (=accuracy): {}'.format(f1_0(Y_valid,Y_pred).round(3)))

Just the Level 1:
	F1 micro (=accuracy): 0.358


In [28]:
print('Just the Level 2:\n\tF1 micro (=accuracy): {}'.format(f1_1(Y_valid,Y_pred).round(3)))

Just the Level 2:
	F1 micro (=accuracy): 0.004


In [29]:
print('Level 1 and 2 and 3 concatenated:\n\tF1 micro (=accuracy): {}'.format(f1_001(Y_valid,Y_pred).round(3)))

Level 1 and 2 and 3 concatenated:
	F1 micro (=accuracy): 0.0


In [30]:
print('For both Level 1 and Level 2  concatenated:\n\tJaccard score micro (=accuracy): {}'.format(js_01(Y_valid,Y_pred).round(5)))

For both Level 1 and Level 2  concatenated:
	Jaccard score micro (=accuracy): 0.0


In [31]:
print('For just Level 1:\n\tJaccard score micro (=accuracy): {}'.format(js_0(Y_valid,Y_pred).round(5)))

For just Level 1:
	Jaccard score micro (=accuracy): 0.21782


In [32]:
print('For just Level 2:\n\tJaccard score micro (=accuracy): {}'.format(js_1(Y_valid,Y_pred).round(5)))

For just Level 2:
	Jaccard score micro (=accuracy): 0.00204


In [33]:
print('Level 1 and 2 concatenated:\n\tJaccard score micro (=accuracy): {}'.format(js_01(Y_valid,Y_pred).round(5)))

Level 1 and 2 concatenated:
	Jaccard score micro (=accuracy): 0.0


In [35]:
print('For Levels 1 and 2 and 3 concatenated:\n\tJaccard score micro (=accuracy): {}'.format(js_001(Y_valid,Y_pred).round(5)))

For Levels 1 and 2 and 3 concatenated:
	Jaccard score micro (=accuracy): 0.0
