In [34]:
import os
import re

import nltk
import pandas as pd
import PyPDF2
import tensorflow as tf
import torch
from joblib import dump
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score
from transformers import DistilBertTokenizer

from transformers import Trainer
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification

#nltk.download('words') # -> do that only once
#nltk.download('stopwords') # -> do that only once
#nltk.download("punkt") # -> do that only once

In [3]:
# Applied to every contents line before any matching: these characters become a space...
_CONTENTS_LINE_TABLE = str.maketrans({
    "-": " ", "/": " ", "&": " ",
    # ...and these are dropped entirely.
    ",": None, ":": None, ";": None, '"': None, "'": None, "(": None, ")": None,
})
# Applied on top of the table above, only for extracting the title text:
# the curly apostrophe becomes a space, the remaining punctuation is dropped.
_CONTENTS_TITLE_TABLE = str.maketrans({
    "’": " ",
    ".": None, "?": None, "“": None, "”": None, "#": None, "!": None,
})


def find_contents_page(pdf_reader, start_page, end_page):
    """Collect table-of-contents titles grouped by their leading section number.

    Every line of the pages ``start_page`` .. ``end_page - 1`` (zero-based indices
    into ``pdf_reader.pages``) is scanned for a number followed by whitespace.
    The integer part of the first such number on the line (``"1"`` for
    ``"1.2 Title"``) is the key; the lower-cased title text of the line is
    appended, space separated, to the value stored under that key.

    Args:
        pdf_reader: Object exposing ``pages``, a sequence whose items provide
            ``extract_text()``.
        start_page: Index of the first page to scan.
        end_page: Index one past the last page to scan; clamped to the number
            of pages in ``pdf_reader``.

    Returns:
        dict mapping the number (as a string) to the concatenated, lower-cased
        titles found for it. Empty when no line matches.
    """
    result = {}
    end_page = min(end_page, len(pdf_reader.pages))

    for page_num in range(start_page, end_page):
        # A page without extractable text is treated as empty instead of crashing.
        page_text = pdf_reader.pages[page_num].extract_text() or ""
        for line in page_text.split("\n"):
            line = line.translate(_CONTENTS_LINE_TABLE)
            match = re.findall(r"(\d+)[.]*\d*[.]*\d*\s+[a-zA-Z]*", line)
            if not match:
                continue

            # Dots are removed here, so "1.2 Title" is matched as "12 Title" and
            # only the text after the digits is captured.
            line_mod = line.translate(_CONTENTS_TITLE_TABLE)
            match_2 = re.findall(r"\d+[.]*\d*[.]*\d*([a-zA-Z|\s]+)", line_mod)
            if not match_2:
                # Defensive: never index an empty match list.
                continue

            key = match[0]
            title = match_2[0].strip().lower()
            if key in result:
                result[key] += " " + title
            else:
                result[key] = title

    return result

In [4]:
# Open the book and parse its table of contents from page indices 5-9
# (zero-based; the end index 10 is exclusive).
pdf = PyPDF2.PdfReader("HCI.pdf")
flattened_contents = find_contents_page(pdf, 5, 10)
# Display the parsed {section number: titles} mapping.
flattened_contents

In [8]:
# def predict_label(input_string, model):
#     segment_size = 300
#     input_segments = split_text(input_string, segment_size)
#     input_features = vectorizer.transform(input_segments).toarray()
#     predicted_labels = model.predict(input_features)
#     return predicted_labels

In [9]:
# def predicted_probabilities(input_string, model):
#     segment_size = 300
#     input_segments = split_text(input_string, segment_size)
#     input_features = vectorizer.transform(input_segments).toarray()
#     predicted_prob = model.predict_proba(input_features)
#     result = zip(predicted_prob[0], model.classes_)
#     return result

In [5]:
def add_text_to_content(content, pdf_reader, start_page=19, end_page=339,
                        last_chapter=8, references_marker="319References"):
    """Append the page text of every chapter to its entry in ``content`` (in place).

    Pages are read in order. A page containing ``<digits>CHAPTER`` starts that
    chapter; the text buffered so far is appended to the previous chapter's
    entry (nothing is stored when the page starts chapter 1, so text before the
    first chapter is discarded). The last chapter is closed by the first later
    page whose first 20 characters contain ``references_marker``; the function
    returns at that point. If no such page is found, the text buffered for the
    final chapter is not stored.

    Args:
        content: dict keyed by chapter number as a string; the values are
            strings that are extended with the chapter text.
        pdf_reader: Object exposing ``pages``, a sequence whose items provide
            ``extract_text()``.
        start_page: Zero-based index of the first page to read.
        end_page: Index one past the last page to read; clamped to the number
            of pages in ``pdf_reader``.
        last_chapter: Number (int) of the final chapter, the one terminated by
            the references page.
        references_marker: Literal text expected at the top of the references
            page.

    Returns:
        None. ``content`` is mutated.

    Raises:
        KeyError: if a chapter number found in the text has no entry in
            ``content``.
    """
    chapter = None
    buffer = ""
    # Never index past the end of the document.
    end_page = min(end_page, len(pdf_reader.pages))

    for page_number in range(start_page, end_page):
        page_content = pdf_reader.pages[page_number].extract_text() or ""
        match = re.findall(r"(\d+)CHAPTER", page_content)

        if match:
            chapter = int(match[0])
            if chapter != 1:
                # The buffered pages belong to the chapter that just ended.
                content[str(chapter - 1)] += buffer
            buffer = ""
        elif chapter == last_chapter and references_marker in page_content[:20]:
            content[str(last_chapter)] += buffer
            return
        buffer += " " + page_content

In [6]:
# Extend every chapter entry of flattened_contents with that chapter's page text (in place).
# NOTE(review): not idempotent - re-running this cell appends the same text a second time.
add_text_to_content(flattened_contents, pdf)

In [12]:
#flattened_contents["8"]

In [7]:
def pre_process_string(string_X):
    """Reduce any input to a space-separated string of purely alphabetic tokens.

    The input is converted with ``str()``, run through a fixed sequence of
    regex clean-up passes and finally tokenised into runs of letters, where an
    upper-case run starts a new token (``camelCase`` -> ``camel Case``).

    Args:
        string_X: Value to clean; anything accepted by ``str()``.

    Returns:
        The cleaned tokens joined by single spaces (case is preserved).
    """
    # Ordered clean-up passes; each one works on the output of the previous one.
    passes = (
        (r'\W', ' '),                      # non-word characters -> space
        (r'\s+[a-zA-Z]\s+', ' '),          # single letters standing alone
        (r'\^[a-zA-Z]\s+', ' '),           # caret followed by a single letter
        (r'\s+', ' '),                     # collapse whitespace
        (r'^b\s+', ' '),                   # leading "b " prefix
        (r'[A-Z]+[a-z]+\d+(\.\d+)?', ""),  # capitalised word glued to a number
        (r'\d+(\.\d+)?', ""),              # any remaining numbers
        (r'\s+', ' '),                     # collapse whitespace again
    )

    text = str(string_X)
    for pattern, replacement in passes:
        text = re.sub(pattern, replacement, text)

    tokens = re.findall(r'[A-Z]+[a-z]*|[a-z]+', text)
    return ' '.join(tokens).strip()

In [8]:
# Column store for the chapter DataFrame. The keys are snapshotted into a list so the
# "chapter" column is a fixed sequence rather than a live view of flattened_contents.
dictionary = {"chapter": list(flattened_contents.keys()), "words": []}

In [9]:
# Clean the text of every section: normalise it, drop the stand-alone word "the",
# lower-case it and remove tokens of 20 or more characters.
# - r' the\b' only matches the whole word; a plain replace(" the", " ") would also
#   cut the prefix off words such as "theory" or "then".
# - The column is assigned rather than appended to, so re-running the cell does not
#   duplicate entries.
dictionary["words"] = [
    re.sub(r'\b\w{20,}\b', '',
           re.sub(r' the\b', ' ', pre_process_string(flattened_contents[section])).lower())
    for section in dictionary["chapter"]
]

In [10]:
# One row per contents key: "chapter" holds the key, "words" its cleaned text.
df = pd.DataFrame(dictionary)

In [11]:
# Number of whitespace-separated words in each row's text.
df["words_number"] = df["words"].apply(lambda x: len(x.split()))

In [12]:
# Display the per-chapter frame (chapter, words, words_number).
df

### Data Augmentation

In [13]:
def split_text(text, segment_size):
    """Cut ``text`` into consecutive chunks of at most ``segment_size`` words.

    Args:
        text: String to split; words are separated by any whitespace.
        segment_size: Maximum number of words per chunk.

    Returns:
        List of chunks, each a string of words joined by single spaces. The
        last chunk may be shorter; the list is empty when ``text`` has no words.
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), segment_size):
        chunks.append(' '.join(tokens[start:start + segment_size]))
    return chunks

In [14]:
def get_augmented_data(df, segment_size, random_state=0):
    """Expand every row of ``df`` into one row per text segment and shuffle the result.

    Args:
        df: DataFrame with a "chapter" column (label) and a "words" column (text).
        segment_size: Maximum number of words per segment (see ``split_text``).
        random_state: Seed for the shuffle. It is fixed by default so that the
            row order, and therefore any later seeded train/validation split,
            is reproducible. Pass ``None`` for a different order on every call.

    Returns:
        DataFrame with columns "chapter" and "words", one row per segment, in
        shuffled order. The index holds the pre-shuffle row positions.
    """
    augmented_data = {"chapter": [], "words": []}
    for chapter, words in zip(df["chapter"], df["words"]):
        segments = split_text(words, segment_size)
        # Every segment inherits the label of the row it was cut from.
        augmented_data["chapter"].extend([chapter] * len(segments))
        augmented_data["words"].extend(segments)

    return pd.DataFrame(augmented_data).sample(frac=1, random_state=random_state)

In [15]:
# Cut every chapter into segments of at most 300 words; one (shuffled) row per segment.
augmented_df = get_augmented_data(df, 300)

In [16]:
# Display the segment-level frame.
# NOTE(review): this renders the whole frame; consider augmented_df.head() instead.
augmented_df

In [17]:
# Plain Python lists: the segment texts and, in the same order, their chapter labels.
data_texts = augmented_df['words'].to_list()
data_labels = augmented_df['chapter'].to_list()

### Machine Learning Algorithms

In [18]:
# First split: 80% training / 20% validation.
train_texts, val_texts, train_labels, val_labels = train_test_split(data_texts,
                                                                    data_labels,
                                                                    test_size = 0.2,
                                                                    random_state = 0)
# Second split: 1% of the training part is set aside as a test set.
# NOTE(review): test_texts / test_labels are not used anywhere in the visible notebook.
train_texts, test_texts, train_labels, test_labels = train_test_split(train_texts,
                                                                      train_labels,
                                                                      test_size = 0.01,
                                                                      random_state = 0)

In [19]:
# Tokenize the training and validation texts with the pretrained DistilBERT tokenizer,
# with truncation and padding enabled.
# NOTE(review): test_texts is not tokenized in this cell.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation = True, padding = True)
val_encodings = tokenizer(val_texts, truncation = True, padding = True)

In [43]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))


val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

In [22]:
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=8)

In [51]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=7,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=1e-5,
    logging_dir='./logs',
    eval_steps=100,
    use_cpu=True,
    learning_rate=2e-5,  
    evaluation_strategy = "epoch",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset.batch(batch_size=16),
    eval_dataset=val_dataset.batch(batch_size=64),
    tokenizer=tokenizer,
)

In [52]:
# Smoke test: fine-tune the model, evaluate it on the validation set, print the metrics.
trainer.train()
results = trainer.evaluate()
print(results)

## Finding the best model for our book

#### Neural Network

In [None]:
# param = [{"activation":["relu"],
#          "hidden_layer_sizes":[(i) for i in range(35, 48, 3)]}]
# grid = GridSearchCV(MLPClassifier(max_iter=10000), param_grid = param, cv=5, n_jobs=-1)
# grid.fit(X_train, y_train)

In [None]:
# pd.DataFrame(grid.cv_results_)

In [None]:
# mlp = MLPClassifier(max_iter=10000, activation="relu", hidden_layer_sizes=(38))
# mlp.fit(X_train, y_train)
# get_statistics(mlp, X_test, y_test)
# comparison["MLP"] = mlp.score(X_test, y_test)

#### SVC

In [None]:
# Grid-search an RBF-kernel SVC over C and gamma with 4-fold cross-validation.
# NOTE(review): SVC, X_train and y_train are not imported or defined anywhere in the
# visible notebook - this cell raises NameError on a fresh kernel unless they are
# provided elsewhere. Confirm whether this section is still meant to run.
param = {"kernel":["rbf"],
         "C": [1.5, 1.7, 2, 2.2, 2.5, 2.7],
         "gamma": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2]}
grid = GridSearchCV(SVC(), param_grid=param, cv=4, n_jobs=-1)
grid.fit(X_train, y_train)

In [None]:
# Tabulate the cross-validation results of the grid search above.
pd.DataFrame(grid.cv_results_)

In [None]:
# Best parameter combination found by the SVC grid search.
grid.best_params_

In [None]:
# Fit an SVC with fixed hyper-parameters (C=2.5, gamma=0.4) and record its test score.
# NOTE(review): SVC, get_statistics, comparison and the X_/y_ splits are not defined in
# the visible notebook - confirm where they are supposed to come from.
svc_model = SVC(kernel='rbf', C=2.5, gamma=0.4)
svc_model.fit(X_train, y_train)
get_statistics(svc_model, X_test, y_test)
comparison["SVC"] = svc_model.score(X_test, y_test)

#### Random Forest

In [None]:
# Fit a 1000-tree random forest (seeded) and record its test score.
# NOTE(review): RandomForestClassifier, get_statistics, comparison and the X_/y_ splits
# are not imported or defined in the visible notebook.
rf_classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_classifier.fit(X_train, y_train)
get_statistics(rf_classifier, X_test, y_test)
comparison["Random Forest"] = rf_classifier.score(X_test, y_test)

#### Logistic Regression

In [None]:
# Grid-search multinomial logistic regression over C = 1..69 with 4-fold cross-validation.
# NOTE(review): LogisticRegression, X_train and y_train are not imported or defined in
# the visible notebook. This cell also rebinds `param` and `grid` from the SVC section.
param = {"multi_class":["multinomial"],
        "C":range(1, 70)}
grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid=param, cv=4, n_jobs=-1)
grid.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the logistic-regression grid search.
grid.best_params_

In [None]:
# Fit logistic regression with a fixed C=33 and record its test score.
# NOTE(review): LogisticRegression, get_statistics, comparison and the X_/y_ splits are
# not imported or defined in the visible notebook.
logreg = LogisticRegression(max_iter=1000, multi_class="multinomial", C=33)
logreg.fit(X_train, y_train)
get_statistics(logreg, X_test, y_test)
comparison["Logistic Regression"] = logreg.score(X_test, y_test)

#### Naive Bayes

In [None]:
# Grid-search the MultinomialNB smoothing parameter alpha over 0.001 .. 0.099
# (step 0.001) with 5-fold cross-validation.
# NOTE(review): MultinomialNB, X_train and y_train are not imported or defined in the
# visible notebook.
param = {"alpha":[i*0.001 for i in range(1, 100)]}
grid = GridSearchCV(MultinomialNB(), param_grid=param, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

In [None]:
# Best alpha found by the Naive Bayes grid search.
grid.best_params_

In [None]:
# Fit Naive Bayes with a fixed alpha=0.015 and record its test score.
# NOTE(review): MultinomialNB, get_statistics, comparison and the X_/y_ splits are not
# imported or defined in the visible notebook.
nb = MultinomialNB(alpha=0.015)
nb.fit(X_train, y_train)
get_statistics(nb, X_test, y_test)
comparison["Naive Bayes"] = nb.score(X_test, y_test)

#### Comparison

In [None]:
# Display the {model name: test score} mapping filled in by the cells above.
# NOTE(review): `comparison` is never initialised in the visible notebook.
comparison

In [None]:
# Bar chart of the test score of every model recorded in `comparison`.
# NOTE(review): `plt` (presumably matplotlib.pyplot) is not imported in the visible notebook.
models = list(comparison.keys())
accuracy = list(comparison.values())

plt.figure(figsize=(10, 6))
# Five colours are listed, one per bar.
plt.bar(models, accuracy, color=['blue', 'orange', 'green', 'red', 'purple'])
plt.title('Comparison of ML Models')
plt.xlabel('Machine Learning Models')
plt.ylabel('Accuracy')
# The y-axis starts at 0.80, so scores below that are not visible.
plt.ylim(0.80, 1)
plt.show()

##### Logistic Regression is the best choice, considering its lower complexity and its accuracy

#### Sample prediction

In [None]:
# Predict the chapter label(s) of a sample paragraph with the Naive Bayes model.
# NOTE(review): only a commented-out definition of predict_label is visible in this
# notebook, so this call raises NameError as it stands. The sample paragraph is about
# operating systems and networking - confirm it is the intended input for this book.
input_string = """virtual machines and their relationship to contemporary operating
systems. Included is a general description of the hardware and software
techniques that make virtualization possible. This chapter provides an
overview of computer networks and distributed systems, with a focus on
the Internet and TCP/IP."""
predicted_labels = predict_label(input_string, nb)

print("Predicted Labels:", predicted_labels)

In [None]:
# Print one "<first item> = <second item>" line per pair yielded for the sample paragraph.
# NOTE(review): only a commented-out definition of predicted_probabilities is visible in
# this notebook; per that definition the pairs would be (probability, class).
for i, j in predicted_probabilities(input_string, nb):
    print(f"{i} = {j}")

In [None]:
# Second sample paragraph, run through the same probability printout as above.
# NOTE(review): predicted_probabilities and nb are not defined in the visible notebook.
input_string_2 = """In the realm of computer ethics, the concept of protecting original creations,
innovations, and expressions is paramount. This includes safeguarding the rights of creators and innovators to
control the use and distribution of their work. This protection extends to various forms of digital content,
software algorithms, databases, and other intangible assets. Ethical considerations arise in how individuals and
organizations respect these rights, avoid plagiarism or unauthorized use, and uphold the principles of fair compensation
and acknowledgment for intellectual contributions in the digital sphere."""
for i, j in predicted_probabilities(input_string_2, nb):
    print(f"{i} = {j}")

In [None]:
# Persist the fitted classifier and the vectorizer. The target folders are created
# first because joblib.dump does not create missing directories.
# NOTE(review): `logreg` and `vectorizer` are not defined in the visible notebook.
os.makedirs('models', exist_ok=True)
os.makedirs('vectorizers', exist_ok=True)
dump(logreg, 'models/hci-logistic_regression.joblib')
dump(vectorizer, 'vectorizers/hci-vectorizer.joblib')