In [1]:
import joblib
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [2]:
# Relative paths of every entry in the "models" and "vectorizers" directories,
# in whatever order os.listdir reports them.
models = [f"models/{entry}" for entry in os.listdir("models")]
vectorizers = [f"vectorizers/{entry}" for entry in os.listdir("vectorizers")]

In [3]:
# Catalogue of supported books. Each key is the prefix MainProgram takes from
# a model file name; each value is a two-item list [display title, scores].
# The second slot starts as None and is overwritten by
# MainProgram.get_related_all_chapters().
BOOK_INFO = {
    book_key: [book_title, None]
    for book_key, book_title in (
        ("computer_architecture_book", "Computer as components: Wayne Wolf"),
        ("data_intensive_book", "Designing data intensive applications: O'REILLY"),
        ("ethic_1", "Ethics in IT: George W.Reynolds"),
        ("ethic_2", "Ethics for the information age: Michael J. Quinn"),
        ("os_book", "Operating System Concepts: WILEY"),
        ("hci", "Human Computer Interaction"),
        ("JavaScript", "Javascript Cookbook"),
        ("network_1", "Computer Networking"),
        ("Robot_OS", "Robot Operating Systems"),
        ("Robotic_python", "Robotics with Python"),
    )
}

In [66]:
class MainProgram():
    """Score one piece of text against every trained book classifier.

    On construction, each file in the ``models`` directory is loaded together
    with its companion ``vectorizers/<prefix>-vectorizer.joblib`` file, where
    ``<prefix>`` is the part of the model file name before the first ``-``.
    """

    _text_data = None
    _main_book = None
    # Class-level default kept for backward compatibility only; __init__ gives
    # every instance its own dict so instances never share loaded models.
    _models = {}

    def __init__(self, text_data):
        """Store the query text and load every model/vectorizer pair.

        Args:
            text_data: The text later scored by get_related_all_chapters().
        """
        self._text_data = text_data
        # Fresh dict per instance: filling the class-level dict would make
        # all instances share (and keep accumulating) the same entries.
        self._models = {}

        for file_name in os.listdir("models"):
            book_key = file_name.split("-")[0]
            # NOTE: joblib.load unpickles its input, so only trusted model
            # and vectorizer files should be placed in these directories.
            self._models[book_key] = [
                joblib.load("models/" + file_name),
                joblib.load("vectorizers/" + book_key + "-vectorizer.joblib"),
            ]

    def get_related_all_chapters(self):
        """Compute chapter probabilities for every loaded book.

        The result for each book is written into the second slot of the
        matching BOOK_INFO entry (the module-level catalogue is mutated).

        Raises:
            KeyError: If a loaded model's prefix has no BOOK_INFO entry.
        """
        for book_key, (model, vectorizer) in self._models.items():
            probabilities = predicted_probabilities(self._text_data, model, vectorizer)
            # Store a list rather than a one-shot iterator so the scores can
            # be read more than once.
            BOOK_INFO[book_key][1] = list(probabilities)

In [67]:
def split_text(text, segment_size):
    """Cut ``text`` into consecutive chunks of at most ``segment_size`` words.

    Words are whatever ``str.split()`` yields; each chunk is re-joined with
    single spaces. The final chunk may be shorter than ``segment_size``.

    Args:
        text: The text to cut up.
        segment_size: Maximum number of words per chunk.

    Returns:
        A list of chunk strings (empty when ``text`` contains no words).
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), segment_size):
        chunks.append(' '.join(tokens[start:start + segment_size]))
    return chunks

In [68]:
def predicted_probabilities(input_string, model, vectorizer, segment_size=300):
    """Score a text against one book's chapter classifier.

    The text is cut into word segments with split_text, the segments are
    vectorised, and ``model.predict_proba`` is evaluated on them. As before,
    only the probabilities of the FIRST segment are reported.

    Args:
        input_string: The text to classify.
        model: Fitted estimator exposing ``predict_proba`` and ``classes_``.
        vectorizer: Fitted vectorizer whose ``transform`` result has
            ``toarray()``.
        segment_size: Maximum number of words per segment (default 300).

    Returns:
        A list of ``(probability, class_label)`` tuples, one per entry of
        ``model.classes_``; an empty list when the text contains no words.
    """
    input_segments = split_text(input_string, segment_size)
    if not input_segments:
        # Nothing to classify: avoid calling the estimator with zero rows.
        return []
    input_features = vectorizer.transform(input_segments).toarray()
    predicted_prob = model.predict_proba(input_features)
    # Materialise the pairs: a bare zip object is exhausted after one pass,
    # which made any second read of the stored scores come back empty.
    return list(zip(predicted_prob[0], model.classes_))

In [69]:
def get_statistics_per_book(threshold=0.50):
    """Collect every chapter whose probability exceeds ``threshold``.

    Reads the module-level BOOK_INFO catalogue. Books whose score slot is
    still None (no scores stored yet) are skipped. If a slot holds a one-shot
    iterable, it is converted to a list and written back so that calling this
    function again returns the same result.

    Args:
        threshold: Probabilities must be strictly greater than this value
            (default 0.50).

    Returns:
        A list of ``(book_title, probability, chapter)`` tuples in BOOK_INFO
        order.
    """
    result = []
    for book in BOOK_INFO.values():
        title = book[0]
        chapter_scores = book[1]
        if chapter_scores is None:
            # No scores have been stored for this book.
            continue
        if not isinstance(chapter_scores, (list, tuple)):
            # Iterators (e.g. zip objects) can only be consumed once; keep a
            # list so later calls still see the scores.
            chapter_scores = list(chapter_scores)
            book[1] = chapter_scores
        for score_pair in chapter_scores:
            probability = score_pair[0]
            chapter = score_pair[1]
            if float(probability) > threshold:
                result.append((title, probability, chapter))
    return result

In [70]:
def get_sortedBy_importance(result):
    """Return the entries of ``result`` ordered by descending probability.

    Each entry is expected to carry its probability at index 1. Entries with
    equal probabilities keep their original relative order and each appears
    exactly once (the previous nested-loop version emitted tied entries once
    per tie, duplicating them).

    Args:
        result: Iterable of tuples such as ``(book_title, probability, chapter)``.

    Returns:
        A new list; ``result`` itself is not modified.
    """
    # sorted() is stable, so ties stay in input order even with reverse=True.
    return sorted(result, key=lambda entry: entry[1], reverse=True)

In [71]:
def print_output(result):
    """Print a numbered reading list, most relevant chapter first.

    Args:
        result: Entries shaped like ``(book_title, probability, chapter)``;
            they are ordered with get_sortedBy_importance before printing.
    """
    ranked = get_sortedBy_importance(result)
    print("You should look the following books in order to learn/teach that subject:")
    print("--" * 45)
    for position, entry in enumerate(ranked, start=1):
        print(f"{position}) From '{entry[0]}', you should look chapter {entry[2]}")

In [72]:
# Sample query text (a description of a digital control system for model
# railroads); this is the text handed to MainProgram when `main` is built.
input_string = """standardized system utilized in model railroading 
to manage locomotives and accessories via a digital signal transmitted through
the tracks. Unlike traditional analog systems where locomotives receive power directly
from the tracks, it enables precise control of multiple trains independently 
on the same track without the need for separate wiring or electrical blocks. 
Each locomotive is equipped with a decoder that receives commands from a central 
controller or throttle, allowing operators to control speed, direction, lighting,
and sound functions. This systems offer enhanced realism and flexibility, enabling model
railroaders to replicate real-world train operations more accurately and create 
immersive layouts with intricate control over their trains and accessories.""" 

In [73]:
# Alternative sample text (about threads in computing). It is not passed to
# anything in the cells shown here.
input_string_2 = """
In computing, threads enable programs to execute multiple tasks simultaneously. 
They divide the workload into smaller chunks, allowing for efficient resource allocation and parallel execution. 
This enhances performance and responsiveness, akin to a juggler effortlessly managing multiple objects at once."""

In [74]:
# Alternative sample text (document vs. relational database models). It is
# not passed to anything in the cells shown here.
input_string_3 = """When considering database models, the document type model offers a higher degree of flexibility compared to relational models. Document type models excel particularly in handling tree-based data structures, where hierarchical relationships are prevalent. In this model, data is stored in a document format, such as JSON or XML, allowing for nested and varied structures within each document. This flexibility accommodates evolving data schemas and unstructured data well, making it suitable for applications with evolving data requirements or diverse data formats.
Conversely, relational database models are well-suited for managing many-to-many relationships between entities. These models organize data into tables with rows and columns, enforcing a structured schema defined by the relational schema. This structure facilitates efficient querying and data retrieval, especially when dealing with complex relationships between entities. Relational databases excel in maintaining data integrity through normalization techniques, ensuring consistency and accuracy in data storage and retrieval operations.
In summary, while document type models prioritize flexibility and adaptability, making them ideal for managing tree-based data, relational database models excel in handling many-to-many relationships and ensuring data integrity within structured environments. The choice between these models depends on the specific requirements and characteristics of the data and the application context."""

In [75]:
# Build the scorer for the first sample text; the constructor loads every
# model/vectorizer pair found in the "models" directory.
main = MainProgram(input_string)

In [76]:
# Score the text with every loaded model; the results are written into the
# second slot of the matching BOOK_INFO entries (module-level state changes).
main.get_related_all_chapters()

In [77]:
# Gather (title, probability, chapter) for each chapter scoring above 0.50.
results = get_statistics_per_book()

In [78]:
# Display the gathered (title, probability, chapter) tuples.
results

[('Computer as components: Wayne Wolf', 0.7901076724991882, 1),
 ('Computer Networking', 0.5290759337267853, '5'),
 ('Robotics with Python', 0.662664166128289, '6')]

In [65]:
# Print the ranked reading list.
# NOTE(review): this cell's execution count (65) is lower than that of the
# cells above it, so its saved output may come from an earlier kernel state;
# re-run it after `results` has been computed.
print_output(results)

You should look the following books in order to learn/teach that subject:
------------------------------------------------------------------------------------------
