In [None]:
%pip install PyPDF2 nltk pandas
!pip install pdfminer.six
!pip install enchant

In [2]:
import PyPDF2
import nltk
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import enchant
import pdfplumber
from sklearn.model_selection import train_test_split

#nltk.download('words') -> do that only once
#nltk.download('stopwords') -> do that only once
#nltk.download("punkt") -> do that only once

In [3]:
def find_contents_page(pdf_path, start_page=22, end_page=28):
    """Return the text of the table-of-contents pages of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to read.
        start_page: Zero-based index of the first page to read (inclusive).
        end_page: Zero-based index at which reading stops (exclusive); it is
            clamped to the number of pages in the document.

    Returns:
        The text of the selected pages with line breaks replaced by spaces and
        pages separated by a single space, or the message
        "Specified pages not found in the PDF" when no text was extracted.
    """
    with open(pdf_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        last_page = min(end_page, len(pdf_reader.pages))

        page_texts = []
        for page_num in range(start_page, last_page):
            page_text = pdf_reader.pages[page_num].extract_text() or ""
            page_texts.append(' '.join(page_text.split('\n')))

    # Pages are joined with a space so that the page number ending one page
    # cannot fuse with the section number that starts the next page.
    contents_page = ' '.join(page_texts)
    if not contents_page.strip():
        return "Specified pages not found in the PDF"

    return contents_page


In [4]:
def create_csv(sections):
    """Build a table of contents DataFrame from section matches.

    Despite its name, this function does not write a CSV file; it returns a
    DataFrame.

    Args:
        sections: Iterable of indexable records whose first three items are
            the chapter number, the title and the page.

    Returns:
        A DataFrame with the columns "Chapter", "Title" and "Page", one row
        per section. An empty input yields an empty DataFrame with the same
        columns.
    """
    rows = [(match[0], match[1], match[2]) for match in sections]
    # Build the frame once instead of concatenating one-row frames, which
    # also fails on an empty input.
    return pd.DataFrame(rows, columns=["Chapter", "Title", "Page"])

In [5]:
def extract_sections(contents_text):
    """Extract numbered section entries from table-of-contents text.

    Args:
        contents_text: Table-of-contents text as a single string.

    Returns:
        A list of (section_number, title, page) string tuples, where
        section_number has the form "<digits>.<digits>", title is the shortest
        text that follows it, and page is the first whitespace-delimited run
        of digits after the title.
    """
    # The original also tokenized the text with NLTK but never used the
    # result; that only added a dependency on the downloaded "punkt" data.
    return re.findall(r"(\d+\.\d+)\s+(.*?)\s+(\d+)", contents_text)

In [6]:
# Path of the PDF to analyse; replace 'Books/OS_Main book.pdf' with the path to your own file.
pdf_path = "Books/OS_Main book.pdf"
# Text of the table-of-contents pages, flattened into one string.
contents_page_text = find_contents_page(pdf_path)

In [7]:
# Reader over the whole document; reused later to pull the text of every chapter.
pdf_reader = PyPDF2.PdfReader(pdf_path)
# (section_number, title, page) tuples matched in the table-of-contents text.
sections = extract_sections(contents_page_text)

In [8]:
# Display the extracted section tuples for a visual check.
sections

In [9]:
def add_text_to_content_OLD(content_list, pdf_reader): #NOT WORKING
    """Earlier attempt at attaching page text to each contents entry.

    For every (number, name, page) entry, scans up to 50 pages starting at the
    entry's page index, printing the lowercased first 50 characters of each
    page. The first page whose prefix contains both "exercises" and "chapter"
    is stored (prefixed with a space) as the entry's text and the scan stops;
    otherwise the text stays empty.

    Returns:
        A list of (number, name, page, text) tuples.
    """
    enriched = []
    for chapter_num, chapter_name, chapter_page in content_list:
        first_page = int(chapter_page)
        collected = ""
        page_index = first_page
        while page_index < first_page + 50:
            full_text = pdf_reader.pages[page_index].extract_text()
            head = full_text[:50].lower()
            print(head)
            if "exercises" in head and "chapter" in head:
                collected = collected + " " + full_text
                break
            page_index += 1
        enriched.append((chapter_num, chapter_name, chapter_page, collected))

    return enriched

In [10]:
def add_text_to_content(content_list, pdf_reader, start_page=29, last_chapter=21,
                        end_marker="appendices"):
    """Collect the text of every chapter by walking the pages of the PDF.

    A page is treated as the start of the next chapter when the lowercased
    first 50 characters of its text contain "chapter<N>" or "chapter <N> ",
    where N is the next expected chapter number (the walk starts inside
    chapter 1). After chapter ``last_chapter`` has started, the first page
    whose prefix contains ``end_marker`` closes that chapter and ends the walk.

    Args:
        content_list: Mapping from chapter number (as a string) to the
            concatenated section titles of that chapter. Chapters that are
            missing from the mapping simply get no titles appended.
        pdf_reader: Object exposing ``pages``, each with ``extract_text()``.
        start_page: Zero-based index of the first page to read.
        last_chapter: Number of the final chapter of the book.
        end_marker: Lowercase text that marks the first page after the final
            chapter.

    Returns:
        A list of (chapter_number, text, page_number) tuples, where text is
        the chapter's page text followed by its section titles and
        page_number is the index of the page on which the chapter was closed.
    """
    new_content_list = []

    def _close_chapter(chapter_number, body, page_number):
        # Append the chapter's titles (if any), separated by a space so the
        # last word of the body does not fuse with the first title word.
        titles = content_list.get(str(chapter_number), "")
        text = f"{body} {titles}" if titles else body
        new_content_list.append((chapter_number, text, page_number))

    upcoming_chapter = 2
    buffer = ""
    for page_number in range(start_page, len(pdf_reader.pages)):
        page_content = pdf_reader.pages[page_number].extract_text() or ""
        significant_part = page_content[:50].lower()
        starts_next_chapter = (
            f"chapter{upcoming_chapter}" in significant_part
            or f"chapter {upcoming_chapter} " in significant_part
        )
        if starts_next_chapter:
            _close_chapter(upcoming_chapter - 1, buffer, page_number)
            # The page that announces the next chapter belongs to that
            # chapter, not to the one just closed.
            buffer = " " + page_content
            upcoming_chapter += 1
        elif upcoming_chapter == last_chapter + 1 and end_marker in significant_part:
            _close_chapter(last_chapter, buffer, page_number)
            break
        else:
            buffer += " " + page_content
    return new_content_list

In [11]:
def pre_process_string(string_X):
    """Reduce raw text to a space-separated sequence of ASCII letter words.

    Steps, in order:
      1. every non-word character becomes a space;
      2. single letters surrounded by whitespace are dropped;
      3. a single letter at the very start of the text is dropped;
      4. tokens such as "Figure3" (capitalised word directly followed by
         digits) are removed, then all remaining digits;
      5. the remaining text is split into letter runs, breaking camelCase
         ("pageTable" -> "page Table"), and re-joined with single spaces.

    Args:
        string_X: Any value; it is converted with ``str`` first.

    Returns:
        The cleaned text without leading or trailing whitespace.
    """
    text = re.sub(r'\W', ' ', str(string_X))
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # The original pattern was r'\^[a-zA-Z]\s+', which looks for a literal
    # caret and therefore never matched; the anchor is what was intended.
    # This also covers the former r'^b\s+' step.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'[A-Z]+[a-z]+\d+(\.\d+)?', "", text)
    text = re.sub(r'\d+(\.\d+)?', "", text)
    # Extracting letter runs and joining them also normalises whitespace, so
    # no separate whitespace-collapsing pass is needed.
    words = re.findall(r'[A-Z]+[a-z]*|[a-z]+', text)
    return ' '.join(words)

In [12]:
def predict_label(input_string, model, segment_size=300):
    """Predict a chapter label for each segment of a piece of text.

    The text is cleaned with ``pre_process_string`` and lowercased, the same
    steps applied to the training text, then cut into segments with
    ``split_text`` and vectorised with the notebook-level ``vectorizer``.

    Args:
        input_string: Text to classify.
        model: Fitted estimator exposing ``predict``.
        segment_size: Maximum number of words per segment.

    Returns:
        Whatever ``model.predict`` returns for the segment features: one
        label per segment.

    Raises:
        ValueError: If the cleaned text contains no words.
    """
    cleaned_text = pre_process_string(input_string).lower()
    input_segments = split_text(cleaned_text, segment_size)
    if not input_segments:
        raise ValueError("input_string contains no words to classify")
    input_features = vectorizer.transform(input_segments).toarray()
    return model.predict(input_features)

In [13]:
def get_correct_and_wrong_parts(sections):
    """Split contents entries into plausible and suspicious ones.

    Entries are compared pairwise in order. When an entry is in the same
    chapter as its predecessor but its section number is not the
    predecessor's plus one, the predecessor is classified as wrong; in every
    other case the predecessor is classified as normal. The final entry has
    no successor to contradict it and is classified as normal.

    Args:
        sections: Sequence of records whose first item is a section number of
            the form "<chapter>.<section>".

    Returns:
        A (normal_contents, wrong_contents) tuple of lists.
    """
    normal_contents, wrong_contents = [], []
    prev_sec = None
    prev_chapter = prev_minor = None
    for section in sections:
        chapter_text, minor_text = section[0].split(".")
        chapter, minor = int(chapter_text), int(minor_text)
        if prev_sec is not None:
            breaks_sequence = chapter == prev_chapter and minor != prev_minor + 1
            target = wrong_contents if breaks_sequence else normal_contents
            target.append(prev_sec)
        prev_sec, prev_chapter, prev_minor = section, chapter, minor

    # The loop only ever files the *previous* entry, so the last one used to
    # be dropped from both lists.
    if prev_sec is not None:
        normal_contents.append(prev_sec)
    return normal_contents, wrong_contents

In [14]:
# Separate entries whose numbering follows on from the previous one from those that break the sequence.
normal_contents, wrong_contents = get_correct_and_wrong_parts(sections)

In [15]:
# Display the entries that were flagged as out of sequence.
wrong_contents 

In [16]:
def get_flattened_content(normal_contents):
    """Group section titles by chapter.

    Args:
        normal_contents: Iterable of records whose first item is a section
            number such as "3.2" and whose second item is the section title.

    Returns:
        A dict that maps each chapter number (the text before the first dot)
        to the chapter's titles joined by single spaces, in input order.
    """
    titles_by_chapter = {}
    for entry in normal_contents:
        chapter_key = entry[0].partition(".")[0]
        titles_by_chapter.setdefault(chapter_key, []).append(entry[1])
    return {key: " ".join(titles) for key, titles in titles_by_chapter.items()}

In [17]:
# One string of section titles per chapter, keyed by chapter number.
flattened_contents = get_flattened_content(normal_contents)

In [18]:
# Display the per-chapter titles for a visual check.
flattened_contents

In [19]:
# Walk the PDF and attach the page text of every chapter to its titles.
contents_page_with_text = add_text_to_content(flattened_contents, pdf_reader)

In [20]:
# Column-oriented accumulator: chapter labels and their cleaned text, filled in by the next cell.
dictionary = {"chapter":[], "words":[]}

In [21]:
# Rebuild the accumulator here so that re-running this cell does not append
# every chapter a second time.
dictionary = {"chapter": [], "words": []}
for section in contents_page_with_text:
    cleaned = pre_process_string(section[1]).lower()
    # Remove "the" only as a whole word. The previous `.replace(" the", " ")`
    # also stripped the prefix of words such as "thread", "then" or "their".
    cleaned = re.sub(r'\bthe\b', ' ', cleaned)
    # Drop runs of 20 or more word characters.
    cleaned = re.sub(r'\b\w{20,}\b', '', cleaned)
    dictionary["chapter"].append(section[0])
    dictionary["words"].append(cleaned)

In [22]:
# One row per chapter: its label and its cleaned text.
df = pd.DataFrame(dictionary)

In [23]:
# Number of whitespace-separated words in each chapter's cleaned text.
df["words_number"] = df["words"].str.split().str.len()

In [24]:
# Display the per-chapter table, including the word counts.
df

### Data Augmentation

In [25]:
def split_text(text, segment_size):
    """Cut text into consecutive chunks of at most ``segment_size`` words.

    Args:
        text: Text to split on whitespace.
        segment_size: Maximum number of words per chunk.

    Returns:
        A list of strings, each holding the words of one chunk joined by
        single spaces; the last chunk may be shorter. Text without words
        yields an empty list.
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), segment_size):
        chunks.append(' '.join(tokens[start:start + segment_size]))
    return chunks

In [26]:
def get_augmented_data(df, segment_size, random_state=0):
    """Turn each chapter into several shorter training samples.

    The text of every row is cut into segments with ``split_text``; each
    segment becomes its own row carrying the chapter label of its source.

    Args:
        df: DataFrame with the columns "chapter" and "words".
        segment_size: Maximum number of words per segment.
        random_state: Seed for the final shuffle. A fixed default keeps the
            row order, and therefore any later seeded train/test split,
            reproducible between runs; pass ``None`` for a fresh shuffle.

    Returns:
        A shuffled DataFrame with the columns "chapter" and "words".
    """
    augmented_data = {"chapter": [], "words": []}
    for chapter, words in zip(df["chapter"], df["words"]):
        for segment in split_text(words, segment_size):
            augmented_data["chapter"].append(chapter)
            augmented_data["words"].append(segment)

    return pd.DataFrame(augmented_data).sample(frac=1, random_state=random_state)

In [27]:
# Cut every chapter into segments of at most 300 words; each segment is one training sample.
augmented_df = get_augmented_data(df, 300)

In [28]:
import matplotlib.pyplot as plt

# Share of training segments contributed by each chapter.
chapter_counts = augmented_df["chapter"].value_counts()
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(chapter_counts, labels=chapter_counts.index, autopct='%1.1f%%', startangle=140)
ax.set_title('Distribution of Chapters')
plt.show()

In [29]:
# TF-IDF features over the segments: at most 5000 terms, ignoring English stop
# words, terms found in fewer than 5% of the segments and terms found in more
# than 85% of them.
vectorizer = TfidfVectorizer(max_features=5000, 
                             min_df=0.05, 
                             max_df=0.85, 
                             stop_words=stopwords.words('english'))
# Dense feature matrix with one row per segment.
augmented_features = vectorizer.fit_transform(augmented_df["words"]).toarray()

### Machine Learning Algorithms

In [30]:
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [31]:
# Test-set accuracy per model name, filled in by the model cells below.
comparison = {}

In [32]:
def get_statistics(model, X_test, y_test):
    """Print evaluation metrics for a fitted model and plot its predictions.

    Prints the accuracy (two decimals) and the classification report for the
    model's predictions on ``X_test``, then shows a bar chart with the number
    of predictions per chapter, ordered by chapter label.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature rows to predict.
        y_test: True labels for ``X_test``.
    """
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:\n", report)

    predicted_counts = pd.Series(predictions).value_counts().sort_index()
    fig, ax = plt.subplots(figsize=(10, 6))
    predicted_counts.plot(kind='bar', color='darkred', ax=ax)
    ax.set_xlabel('Chapter')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of Predicted Chapters')
    plt.xticks(rotation=45)
    plt.show()

In [33]:
# Split the raw segments first and fit TF-IDF on the training part only, so
# the vocabulary and IDF weights are not influenced by the test segments.
# The split itself (seed, size, stratification) is unchanged.
train_df, test_df = train_test_split(augmented_df, test_size=0.2,
                                     random_state=0, stratify=augmented_df["chapter"])
X_train = vectorizer.fit_transform(train_df["words"]).toarray()
X_test = vectorizer.transform(test_df["words"]).toarray()
y_train, y_test = train_df["chapter"], test_df["chapter"]

#### Neural Network 

In [34]:
# One hidden layer of 56 units. `(56,)` is an actual tuple (`(56)` is just the
# integer 56), and the fixed seed makes the weight initialisation, and so the
# reported accuracy, reproducible.
mlp = MLPClassifier(max_iter=10000, activation="tanh", hidden_layer_sizes=(56,), random_state=0)
mlp.fit(X_train, y_train)
get_statistics(mlp, X_test, y_test)
comparison["MLP"] = mlp.score(X_test, y_test)

#### SVC

In [36]:
# RBF-kernel support vector classifier; record its test accuracy for the comparison.
svc_model = SVC(kernel='rbf', C=1.0, gamma=0.9).fit(X_train, y_train)
get_statistics(svc_model, X_test, y_test)
comparison["SVC"] = svc_model.score(X_test, y_test)

#### Random Forest

In [37]:
# Forest of 1000 trees with a fixed seed; record its test accuracy for the comparison.
rf_classifier = RandomForestClassifier(n_estimators=1000, random_state=0).fit(X_train, y_train)
get_statistics(rf_classifier, X_test, y_test)
comparison["Random Forest"] = rf_classifier.score(X_test, y_test)

#### Logistic Regression

In [38]:
# `multi_class` is deprecated in recent scikit-learn releases; with the default
# solver a multinomial model is already what gets fitted for multi-class
# targets, so the argument is dropped.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
get_statistics(logreg, X_test, y_test)
comparison["Logistic Regression"] = logreg.score(X_test, y_test)

#### Naive Bayes 

In [39]:
# Multinomial naive Bayes with smoothing 0.5; record its test accuracy for the comparison.
nb = MultinomialNB(alpha=.5).fit(X_train, y_train)
get_statistics(nb, X_test, y_test)
comparison["Naive Bayes"] = nb.score(X_test, y_test)

#### Comparison

In [40]:
# Display the accuracy recorded for each model.
comparison

In [41]:
# Bar chart of the test accuracy recorded for each model.
models, accuracy = list(comparison), list(comparison.values())

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(models, accuracy, color=['blue', 'orange', 'green', 'red', 'purple'])
ax.set(title='Comparison of ML Models',
       xlabel='Machine Learning Models',
       ylabel='Accuracy',
       ylim=(0, 1))
plt.show()

#### Sample prediction 

In [50]:
# Short sample passage to classify with the fitted logistic-regression model.
input_string = """virtual machines and their relationship to contemporary operating
systems. Included is a general description of the hardware and software
techniques that make virtualization possible. This chapter provides an
overview of computer networks and distributed systems, with a focus on
the Internet and TCP/IP."""
# predict_label returns one predicted chapter label per text segment.
predicted_labels = predict_label(input_string, logreg)
print("Predicted Labels:", predicted_labels)