In [2]:
import html
import math
import os
import re
import time

import joblib
import nltk
import PyPDF2
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from isodate import parse_duration
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Relative paths of the entries found in the models/ and vectorizers/ folders.
models = [f"models/{entry}" for entry in os.listdir("models")]
vectorizers = [f"vectorizers/{entry}" for entry in os.listdir("vectorizers")]

In [5]:
# Maps each book key to [display title, predictions]. The second slot starts as
# None and is filled in by MainProgram.get_related_all_chapters().
BOOK_INFO = {
    book_key: [title, None]
    for book_key, title in (
        ("computer_architecture_book", "Computer as components: Wayne Wolf"),
        ("data_intensive_book", "Designing data intensive applications: O'REILLY"),
        ("ethic_1", "Ethics in IT: George W.Reynolds"),
        ("ethic_2", "Ethics for the information age: Michael J. Quinn"),
        ("os_book", "Operating System Concepts: WILEY"),
        ("hci", "Human Computer Interaction"),
        ("JavaScript", "Javascript Cookbook"),
        ("network_1", "Computer Networking"),
        ("Robot_OS", "Robot Operating Systems"),
        ("Robotic_python", "Robotics with Python"),
    )
}

In [6]:
class MainProgram():
    """Loads every stored model/vectorizer pair and scores a text against them.

    Files in ``models/`` are expected to be named ``<book_key>-...``; the
    matching vectorizer is read from ``vectorizers/<book_key>-vectorizer.joblib``.
    """

    _text_data = None
    _main_book = None
    # Populated per instance in __init__. A class-level dict here would be
    # shared (and accumulated) across every MainProgram instance.
    _models = None

    def __init__(self, text_data):
        """Store ``text_data`` and load all model/vectorizer pairs from disk."""
        self._text_data = text_data
        self._models = {}

        for file_name in os.listdir("models"):
            # Skip hidden entries and directories (e.g. ".ipynb_checkpoints",
            # ".DS_Store"); they are not model files and cannot be loaded.
            if file_name.startswith(".") or not os.path.isfile(os.path.join("models", file_name)):
                continue
            book_key = file_name.split("-")[0]
            # NOTE(review): joblib.load unpickles the file, which can execute
            # arbitrary code -- only load model files from a trusted source.
            self._models[book_key] = [
                joblib.load("models/" + file_name),
                joblib.load("vectorizers/" + book_key + "-vectorizer.joblib"),
            ]

    def get_related_all_chapters(self):
        """Write the per-chapter predictions of every loaded model into BOOK_INFO.

        The result of predicted_probabilities() is stored in the second slot of
        ``BOOK_INFO[book_key]``; a model whose key is missing from BOOK_INFO
        raises KeyError.
        """
        for book_key, (model, vectorizer) in self._models.items():
            BOOK_INFO[book_key][1] = predicted_probabilities(self._text_data, model, vectorizer)

In [7]:
def split_text(text, segment_size):
    """Split ``text`` on whitespace and regroup it into chunks of ``segment_size`` words.

    Returns a list of space-joined strings; the last chunk may be shorter and
    text without any words gives an empty list.
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), segment_size):
        chunks.append(' '.join(tokens[start:start + segment_size]))
    return chunks

In [8]:
def predicted_probabilities(input_string, model, vectorizer):
    """Score ``input_string`` against ``model`` and pair each probability with its class.

    The text is cut into 300-word segments, each segment is vectorized and
    classified, and the class probabilities are averaged over all segments
    (for a single segment this equals that segment's probabilities; the
    original only looked at the first segment and ignored the rest).

    Returns:
        A list of ``(probability, class_label)`` tuples in ``model.classes_``
        order. A real list is returned rather than a one-shot ``zip`` iterator
        so the result can be read more than once. Text without any words
        yields an empty list.
    """
    segment_size = 300
    input_segments = split_text(input_string, segment_size)
    if not input_segments:
        return []
    input_features = vectorizer.transform(input_segments).toarray()
    predicted_prob = model.predict_proba(input_features)
    mean_prob = predicted_prob.mean(axis=0)
    return list(zip(mean_prob, model.classes_))

In [10]:
def find_contents_page_CAB(pdf_reader, start_page, end_page):
    """Parse a table of contents whose chapters are headed ``CHAPTER <n> <title>``.

    Args:
        pdf_reader: object exposing ``pages``; each page provides ``extract_text()``.
        start_page: first page index to read (inclusive).
        end_page: page index to stop at (exclusive, clamped to the page count).

    Returns:
        Dict mapping a running chapter counter (1, 2, ...) to a lower-case,
        comma-separated string: the chapter title followed by every
        ``<n>.<m> <title>`` section found under it.
    """
    # \d+ so that chapters and sections numbered 10 and above are recognised.
    chapter_heading = re.compile(r"CHAPTER \d+\s+")
    section_title = re.compile(r"\d+[.]\d+\s+([a-z\s]+)", re.IGNORECASE)
    strip_punctuation = str.maketrans(
        {"-": None, ",": None, ":": None, ";": None, "/": None, "&": " "}
    )

    result = {}
    number = None
    end_page = min(end_page, len(pdf_reader.pages))

    for page_num in range(start_page, end_page):
        page_text = pdf_reader.pages[page_num].extract_text()
        for line in page_text.split("\n"):
            if "APPENDIX" in line[:9]:
                if number is None:
                    break  # marker before any chapter: skip the rest of this page only
                return result  # appendices follow the last chapter: stop for good
            if "CHAPTER" in line:
                heading = chapter_heading.search(line)
                if heading is None:
                    continue  # mentions CHAPTER but is not a numbered heading
                number = 1 if number is None else number + 1
                result[number] = line[heading.end():].strip().lower()
            elif number is not None:
                sections = section_title.findall(line.translate(strip_punctuation))
                if sections:
                    result[number] += ", " + sections[0].strip().lower()

    return result

NameError: name 'find_contents_page_CAB' is not defined

In [11]:
def find_contents_page_DIB(pdf_reader, start_page, end_page):
    """Parse a dot-leader table of contents (``<title> . . . . <page>``).

    A line counts as an entry when, after dots are turned into spaces, it has a
    run of three or more whitespace characters followed by a numeric page
    reference. Entries whose title contains ``<digits><space>`` open a new
    chapter; other entries are sections of the current chapter. Entries that
    appear before the first chapter are skipped (the original raised
    ``TypeError`` on them).

    Args:
        pdf_reader: object exposing ``pages``; each page provides ``extract_text()``.
        start_page: first page index to read (inclusive).
        end_page: page index to stop at (exclusive, clamped to the page count).

    Returns:
        Dict mapping the chapter number (as a string) to a lower-case,
        comma-separated string of the chapter title and its section titles.
    """
    output = {}
    chapter = None
    end_page = min(end_page, len(pdf_reader.pages))

    for page_num in range(start_page, end_page):
        page_text = pdf_reader.pages[page_num].extract_text()
        for line in page_text.split("\n"):
            line = line.replace(".", " ")
            gap = re.search(r"\s\s\s+", line)
            if gap is None:
                continue
            parts = line.split(gap.group(0))
            if not parts[1].isnumeric():
                continue

            title = parts[0]
            chapter_numbers = re.findall(r"\d+\s", title)
            if chapter_numbers:
                chapter = str(int(chapter_numbers[0].rstrip()))
                title = title.replace(chapter_numbers[0], "")
            elif chapter is None:
                continue  # front matter before the first chapter

            title = title.lower()
            if chapter in output:
                output[chapter] += ", " + title
            else:
                output[chapter] = title
    return output

In [12]:
def find_contents_page_E1(pdf_reader, start_page, end_page):
    """Parse a table of contents whose chapters are headed ``Chapter <n> <title>``.

    A line opens a new chapter when "Chapter" occurs in its first nine
    characters and it contains ``Chapter <digits>``. Every other line that ends
    in a page reference is added as a section of the current chapter (this
    includes lines such as "Chapter Summary 20", which previously raised
    ``IndexError``). Parsing stops at the first "Glossary" line that follows a
    chapter.

    Args:
        pdf_reader: object exposing ``pages``; each page provides ``extract_text()``.
        start_page: first page index to read (inclusive).
        end_page: page index to stop at (exclusive, clamped to the page count).

    Returns:
        Dict mapping a running chapter counter (1, 2, ...) to a lower-case,
        comma-separated string of the chapter title and its sections. The
        chapter title keeps any trailing page number found on the heading line.
    """
    chapter_heading = re.compile(r"Chapter \d+\s+")
    strip_punctuation = str.maketrans(
        {"-": None, ",": None, ":": None, ";": None, "/": None, "&": " "}
    )

    result = {}
    number = None
    end_page = min(end_page, len(pdf_reader.pages))

    for page_num in range(start_page, end_page):
        page_text = pdf_reader.pages[page_num].extract_text()
        for line in page_text.split("\n"):
            if "Glossary" in line[:9]:
                if number is None:
                    break  # marker before any chapter: skip the rest of this page only
                return result  # back matter follows the last chapter: stop for good

            heading = chapter_heading.search(line) if "Chapter" in line[:9] else None
            if heading is not None:
                number = 1 if number is None else number + 1
                result[number] = line[heading.end():].strip().lower()
            elif number is not None:
                cleaned = line.translate(strip_punctuation)
                page_ref = re.search(r"\s+\d+", cleaned)
                if page_ref is not None:
                    # keep only the text in front of the page reference
                    result[number] += ", " + cleaned[:page_ref.start()].strip().lower()

    return result

In [27]:
def find_contents_page_E2(pdf_reader, start_page, end_page):
    """Parse a numbered table of contents (``<n>[.<m>[.<k>]] <title> <page>``).

    Entries are grouped by their leading number. A numbered line without a
    trailing page number is skipped; previously it raised ``IndexError``.

    Args:
        pdf_reader: object exposing ``pages``; each page provides ``extract_text()``.
        start_page: first page index to read (inclusive).
        end_page: page index to stop at (exclusive, clamped to the page count).

    Returns:
        Dict mapping the leading number (as a string) to a lower-case,
        comma-separated string of the titles found under it.
    """
    punctuation = str.maketrans({
        "-": " ", ",": None, ":": None, ";": None, "/": " ",
        "&": " ", '"': None, "'": None, "(": None, ")": None,
    })
    title_noise = str.maketrans({".": None, "’": " ", "?": None, "“": None, "”": None})

    result = {}
    end_page = min(end_page, len(pdf_reader.pages))

    for page_num in range(start_page, end_page):
        page_text = pdf_reader.pages[page_num].extract_text()
        for raw_line in page_text.split("\n"):
            line = raw_line.translate(punctuation)
            chapter = re.search(r"(\d+)[.]*\d*[.]*\d*\s+[a-zA-Z]*", line)
            titles = re.findall(r"\d+[.]*\d*[.]*\d*([a-zA-Z|\s]+)\d+", line.translate(title_noise))
            if chapter is None or not titles:
                continue

            key = chapter.group(1)
            title = titles[0].strip().lower()
            if key in result:
                result[key] += ", " + title
            else:
                result[key] = title

    return result

In [5]:
# Try the data-intensive contents parser on page indices 6 up to (not
# including) 12; the returned dict is shown as the cell output.
pdf = PyPDF2.PdfReader("Books/data_intensive.pdf")
find_contents_page_DIB(pdf, 6, 12)

{'1': 'reliable, scalable and maintainable applications, thinking about data systems, reliability, hardware faults, software errors, human errors, how important is reliability?, scalability, describing load, describing performance, approaches for coping with load, maintainability, operability: making life easy for operations, simplicity: managing complexity, evolvability: making change easy, summary',
 '2': 'data models and query languages, relational model vs  document model, the birth of nosql, the object-relational mismatch, many-to-one and many-to-many relationships, are document databases repeating history?, relational vs  document databases today, query languages for data, declarative queries on the web, mapreduce querying, graph-like data models, property graphs, the cypher query language, graph queries in sql, triple-stores and sparql, the foundation: datalog, summary',
 '3': 'storage and retrieval, data structures that power your database, hash indexes, sstables and lsm-trees,

In [14]:
def find_contents_page_OS(pdf_reader, start_page=22, end_page=28):
    """Parse a table of contents made of ``<n>.<m> <title> <page>`` entries.

    Args:
        pdf_reader: object exposing ``pages``; each page provides ``extract_text()``.
        start_page: first page index to read (inclusive). Defaults to 22, the
            value that used to be hard-coded.
        end_page: page index to stop at (exclusive, clamped to the page count).
            Defaults to 28, the value that used to be hard-coded.

    Returns:
        Dict mapping the chapter number (as a string) to a comma-separated
        string of its section titles (case is preserved). When no text at all
        is extracted, the string "Specified pages not found in the PDF" is
        returned instead, as before.
    """
    end_page = min(end_page, len(pdf_reader.pages))
    page_texts = [
        ' '.join(pdf_reader.pages[page_num].extract_text().split('\n'))
        for page_num in range(start_page, end_page)
    ]
    if not any(page_texts):
        return "Specified pages not found in the PDF"

    # Join pages with a space: without it the last page number of one page is
    # fused with the first section number of the next ("7" + "2.1" -> "72.1").
    contents_page = ' '.join(page_texts)

    result = {}
    for section, title, _page in re.findall(r"(\d+\.\d+)\s+(.*?)\s+(\d+)", contents_page):
        unit = section.split(".")[0]
        if unit in result:
            result[unit] += ", " + title
        else:
            result[unit] = title
    return result
def find_contents_page_HCI(pdf_reader, start_page, end_page):
    """Group numbered table-of-contents lines by their leading number.

    Lines are read from page indices ``start_page`` up to (not including)
    ``end_page``, clamped to the page count. Returns a dict mapping the leading
    number (as a string) to a lower-case, comma-separated string of titles.
    """
    punctuation = str.maketrans({
        "-": " ", ",": None, ":": None, ";": None, "/": " ",
        "&": " ", '"': None, "'": None, "(": None, ")": None,
    })
    title_noise = str.maketrans({
        ".": None, "’": " ", "?": None, "“": None, "”": None, "#": None, "!": None,
    })

    contents = {}
    last_page = min(end_page, len(pdf_reader.pages))

    for index in range(start_page, last_page):
        text = pdf_reader.pages[index].extract_text()
        for raw_line in text.split("\n"):
            line = raw_line.translate(punctuation)
            chapter = re.search(r"(\d+)[.]*\d*[.]*\d*\s+[a-zA-Z]*", line)
            if chapter is None:
                continue  # not a numbered entry

            titles = re.findall(r"\d+[.]*\d*[.]*\d*([a-zA-Z|\s]+)", line.translate(title_noise))
            title = titles[0].strip().lower()
            key = chapter.group(1)
            contents[key] = f"{contents[key]}, {title}" if key in contents else title

    return contents

In [15]:
def find_contents_page_JS(pdf_reader, start_page, end_page):
    """Parse a table of contents whose chapters are headed ``Chapter <n> <title>``.

    A line containing ``Chapter <n>`` starts (or restarts) that chapter. After
    the first chapter, a line with no ``<digits> <text>`` part is added to the
    current chapter as a topic. A heading without a title on the same line is
    accepted (previously ``IndexError``), and lines that carry no text no
    longer add empty topics.

    Args:
        pdf_reader: object exposing ``pages``; each page provides ``extract_text()``.
        start_page: first page index to read (inclusive).
        end_page: page index to stop at (exclusive, clamped to the page count).

    Returns:
        Dict mapping the chapter number (as a string) to a lower-case,
        comma-separated string of the chapter title and its topics.
    """
    punctuation = str.maketrans({
        "-": " ", ",": None, ":": None, ";": None, "/": " ",
        "&": " ", '"': None, "'": None, "(": None, ")": None,
    })
    title_noise = str.maketrans({
        ".": None, "’": " ", "?": None, "“": None, "”": None, "#": None, "!": None,
    })

    result = {}
    number = None
    end_page = min(end_page, len(pdf_reader.pages))

    for page_num in range(start_page, end_page):
        page_text = pdf_reader.pages[page_num].extract_text()
        for raw_line in page_text.split("\n"):
            line = raw_line.translate(punctuation)
            line_mod = line.translate(title_noise)

            chapter = re.search(r"Chapter\s*(\d+)", line)
            numbered_titles = re.findall(r"\d+\s+([a-zA-Z|\s]+)", line_mod)

            if chapter is not None:
                number = chapter.group(1)
                result[number] = numbered_titles[0].strip().lower() if numbered_titles else ""
                continue
            if numbered_titles or number is None:
                continue

            free_text = re.findall(r"\s*([a-zA-Z|\s]+)", line_mod)
            topic = free_text[0].strip().lower() if free_text else ""
            if not topic:
                continue  # whitespace-only line
            result[number] = f"{result[number]}, {topic}" if result[number] else topic

    return result

In [16]:
def find_contents_page_N1(pdf_reader, start_page, end_page):
    """Collect chapter titles and numbered section titles from contents pages.

    Lines are read from page indices ``start_page`` up to (not including)
    ``end_page``, clamped to the page count. A ``Chapter <n> <title>`` line
    starts chapter ``n``; later ``<n>.<m>[.<k>] <title>`` lines are appended to
    the most recent chapter. Returns a dict mapping the chapter number (as a
    string) to a lower-case, comma-separated string.
    """
    punctuation = str.maketrans({
        "-": " ", ",": None, ":": None, ";": None, "/": " ",
        "&": " ", '"': None, "'": None, "(": None, ")": None,
    })
    # dots become spaces so "2.1.1 Title" reads as "2 1 1 Title"
    title_noise = str.maketrans({"’": " ", "?": None, "“": None, "”": None, ".": " "})

    chapters = {}
    current = None
    last_page = min(end_page, len(pdf_reader.pages))

    for index in range(start_page, last_page):
        for raw_line in pdf_reader.pages[index].extract_text().split("\n"):
            line = raw_line.translate(punctuation)
            spaced = line.translate(title_noise)

            heading = re.search(r"Chapter\s+(\d+)\s+[a-zA-Z|\s]+", line)
            if heading is not None:
                title = re.findall(r"Chapter\s+\d+\s+([a-zA-Z|\s]+)", spaced)[0]
                current = heading.group(1)
                chapters[current] = title.strip().lower()
                continue

            if current is None:
                continue  # nothing to attach sections to yet
            sections = re.findall(r"\d+\s\d+\s+\d*\s*([a-zA-Z|\s]+)", spaced)
            if sections:
                chapters[current] += ", " + sections[0].strip().lower()

    return chapters

In [17]:
def find_contents_page_RO(pdf_reader, start_page, end_page):
    """Parse a table of contents with ``Chapter <n> <title>`` headings and unnumbered topics.

    After the first chapter heading, the first run of letters found on each
    following line is added to the current chapter as a topic. Lines that carry
    no letters are skipped; previously a whitespace-only line appended an empty
    topic (", ").

    Args:
        pdf_reader: object exposing ``pages``; each page provides ``extract_text()``.
        start_page: first page index to read (inclusive).
        end_page: page index to stop at (exclusive, clamped to the page count).

    Returns:
        Dict mapping the chapter number (as a string) to a lower-case,
        comma-separated string of the chapter title and its topics.
    """
    punctuation = str.maketrans({
        "-": " ", ",": None, ":": None, ";": None, "/": " ",
        "&": " ", '"': None, "'": None, "(": None, ")": None,
    })
    title_noise = str.maketrans({".": " ", "’": " ", "?": None, "“": None, "”": None})

    result = {}
    number = None
    end_page = min(end_page, len(pdf_reader.pages))

    for page_num in range(start_page, end_page):
        page_text = pdf_reader.pages[page_num].extract_text()
        for raw_line in page_text.split("\n"):
            line = raw_line.translate(punctuation)

            heading = re.search(r"Chapter\s+(\d+)\s+[a-zA-Z|\s]+", line)
            if heading is not None:
                titles = re.findall(r"Chapter\s+\d+\s+([a-zA-Z|\s]+)", line.translate(title_noise))
                number = heading.group(1)
                result[number] = titles[0].strip().lower()
                continue
            if number is None:
                continue

            topics = re.findall(r"\s*([a-zA-Z|\s]+)", line)
            topic = topics[0].strip().lower() if topics else ""
            if topic:
                result[number] += ", " + topic

    return result

In [18]:
def find_contents_page_RP(pdf_reader, start_page, end_page):
    """Parse a table of contents with ``Chapter <n> <title>`` headings and unnumbered topics.

    Once a chapter heading has been seen, each following line contributes its
    first run of letters as a topic of that chapter. Lines without letters are
    ignored; previously a whitespace-only line appended an empty topic (", ").

    Args:
        pdf_reader: object exposing ``pages``; each page provides ``extract_text()``.
        start_page: first page index to read (inclusive).
        end_page: page index to stop at (exclusive, clamped to the page count).

    Returns:
        Dict mapping the chapter number (as a string) to a lower-case,
        comma-separated string of the chapter title and its topics.
    """
    punctuation = str.maketrans({
        "-": " ", ",": None, ":": None, ";": None, "/": " ",
        "&": " ", '"': None, "'": None, "(": None, ")": None,
    })
    title_noise = str.maketrans({".": " ", "’": " ", "?": None, "“": None, "”": None})

    result = {}
    number = None
    end_page = min(end_page, len(pdf_reader.pages))

    for page_num in range(start_page, end_page):
        page_text = pdf_reader.pages[page_num].extract_text()
        for raw_line in page_text.split("\n"):
            line = raw_line.translate(punctuation)

            heading = re.search(r"Chapter\s+(\d+)\s+[a-zA-Z|\s]+", line)
            if heading is not None:
                titles = re.findall(r"Chapter\s+\d+\s+([a-zA-Z|\s]+)", line.translate(title_noise))
                number = heading.group(1)
                result[number] = titles[0].strip().lower()
            elif number is not None:
                topics = re.findall(r"\s*([a-zA-Z|\s]+)", line)
                topic = topics[0].strip().lower() if topics else ""
                if topic:
                    result[number] += ", " + topic

    return result

In [19]:
def get_unit(book, chapter):
    """Return the flattened contents string of ``chapter`` in ``book``.

    Args:
        book: a BOOK_INFO-style key such as "os_book" or "network_1".
        chapter: chapter identifier; it is compared as ``str(chapter)``.

    Returns:
        The comma-separated contents string the book's parser produced for
        that chapter.

    Raises:
        ValueError: ``book`` is not a known key, or the parser did not return a
            dict (find_contents_page_OS returns a message string when no text
            could be extracted).
        KeyError: the parsed contents have no entry for ``chapter``.
    """
    # book key -> (PDF path, parser call with that book's contents page range).
    # Lambdas keep the parser lookup lazy, so only the selected parser is touched.
    parsers = {
        "computer_architecture_book": ("Books/computer_architecture.pdf",
                                       lambda pdf: find_contents_page_CAB(pdf, 9, 17)),
        "data_intensive_book": ("Books/data_intensive.pdf",
                                lambda pdf: find_contents_page_DIB(pdf, 6, 12)),
        "ethic_1": ("Books/ethics_1.pdf", lambda pdf: find_contents_page_E1(pdf, 8, 14)),
        "ethic_2": ("Books/ethics_2.pdf", lambda pdf: find_contents_page_E2(pdf, 6, 28)),
        "os_book": ("Books/os.pdf", lambda pdf: find_contents_page_OS(pdf)),
        "hci": ("Books/HCI.pdf", lambda pdf: find_contents_page_HCI(pdf, 5, 10)),
        "JavaScript": ("Books/JavaScript.pdf", lambda pdf: find_contents_page_JS(pdf, 4, 14)),
        "network_1": ("Books/network_1.pdf", lambda pdf: find_contents_page_N1(pdf, 4, 11)),
        "Robot_OS": ("Books/Robot_OS.pdf", lambda pdf: find_contents_page_RO(pdf, 4, 9)),
        "Robotic_python": ("Books/Robotics_python.pdf",
                           lambda pdf: find_contents_page_RP(pdf, 4, 9)),
    }

    if book not in parsers:
        raise ValueError(f"Unknown book key {book!r}; expected one of {sorted(parsers)}")

    pdf_path, parse_contents = parsers[book]
    flattened_contents = parse_contents(PyPDF2.PdfReader(pdf_path))
    if not isinstance(flattened_contents, dict):
        raise ValueError(f"Could not read the contents pages of {pdf_path!r}: {flattened_contents}")

    # Parsers use int or str keys; normalise so both kinds can be looked up.
    normalized_dict = {str(key): value for key, value in flattened_contents.items()}
    return normalized_dict[str(chapter)]

In [20]:
def get_hours(x):
    """Convert a probability ``x`` into a whole number of study hours (at least 1).

    The formula depends on the tier ``x`` falls into (>= 0.5, >= 0.4, >= 0.3,
    >= 0.2); anything below 0.2 yields the minimum of one hour. The result is
    rounded up.
    """
    tiers = (
        (0.5, lambda p: (48 * p**2) + 2),
        (0.4, lambda p: (28 * p**2) + ((2*p)**4)),
        (0.3, lambda p: (12 * p**2) + ((1.5*p)**3)),
        (0.2, lambda p: (6 * p**2)),
    )
    # first tier whose lower bound is met; 1 when none applies
    estimate = next((formula(x) for floor, formula in tiers if x >= floor), 1)
    return math.ceil(max(estimate, 1))

In [23]:
def get_key_words(content):
    """Build a search query from a chapter's flattened contents string.

    For comma-separated content the first two topics that are not generic
    headings (see WORDS_TO_EXCLUDE) are used; otherwise the first eight words.
    Each keyword is followed by a space and the query ends with
    "in computer science".

    A string is always returned. The original fell off the end and returned
    ``None`` when fewer than two topics (or eight words) were available, and
    counted empty entries as keywords.
    """
    WORDS_TO_EXCLUDE = ["overview", "summary", "intro", "introduction",
                        "abstraction", "background", "start"]
    if "," in content:
        topics = (part.strip().lower() for part in content.split(","))
        keywords = [topic for topic in topics if topic and topic not in WORDS_TO_EXCLUDE][:2]
    else:
        keywords = [word.lower() for word in content.split()][:8]

    return "".join(keyword + " " for keyword in keywords) + "in computer science"

In [71]:
def get_statistics_per_book(min_probability=0.30):
    """Collect every chapter whose predicted probability exceeds ``min_probability``.

    Reads the predictions stored in the second slot of each BOOK_INFO entry.
    Books without predictions (slot still ``None``) are skipped instead of
    raising ``TypeError``. Predictions are materialised into a list and stored
    back, so calling this function again still sees them even when they were
    supplied as a one-shot iterator.

    Args:
        min_probability: exclusive lower bound on the probability; defaults to
            the previously hard-coded 0.30.

    Returns:
        List of rows ``[book title, probability, chapter, contents, hours]``.
    """
    result = []
    for book_key, book in BOOK_INFO.items():
        if book[1] is None:
            continue
        book[1] = list(book[1])
        for probability, chapter in book[1]:
            if float(probability) > min_probability:
                unit_content = get_unit(book_key, chapter)
                result.append([book[0], probability, chapter,
                               unit_content, get_hours(float(probability))])
    return result

In [73]:
def get_sortedBy_importance(result):
    """Order result rows by probability (index 1), highest first.

    A video suggestion ``[title, url]`` for the top row is looked up through
    ``youtube_videos(get_key_words(...))`` and appended as a sixth element when
    one is found.

    Differences from the original: every row appears exactly once (rows with
    equal probabilities used to be emitted several times), the caller's rows
    are not mutated, and nothing is appended when no video is found.

    Args:
        result: rows as produced by get_statistics_per_book().

    Returns:
        A new list of rows; rows with equal probability keep their input order.
    """
    output = sorted(result, key=lambda row: row[1], reverse=True)
    if output:
        video = youtube_videos(get_key_words(output[0][3]))
        if video is not None:
            # extend a copy so the caller's row keeps its five elements
            output[0] = list(output[0]) + [video]
    return output

In [102]:
def print_output(sorted_results):
    """Print the generated syllabus, one entry per row.

    Args:
        sorted_results: rows ``[book title, probability, chapter, contents,
            hours]``; the first row may carry a sixth element
            ``[video title, video url]``.

    The video line is printed for the first row only, and only when the sixth
    element is present and not empty (a ``None`` there used to raise
    ``TypeError``).
    """
    print("GENERATED SYLLABUS:")
    print("--"*45)
    for position, row in enumerate(sorted_results, start=1):
        entry = (
            f"[{row[4]} HOURS] FROM '{row[0]}', you should look chapter {row[2]}.\n\n"
            f"        \tChapter {row[2]} Main Subjects:\n\t\t\t{row[3]}"
        )

        if position == 1 and len(row) >= 6 and row[5]:
            entry += f"\n\tPlease watch {row[5][0]} from the link below:\n\t\t\t {row[5][1]}"
        entry += "\n\n\n"

        print(entry)

In [99]:
def search_youtube_videos(api_key, query, max_results=5):
    """Search YouTube for ``query`` and return the matching videos.

    Args:
        api_key: YouTube Data API v3 key.
        query: search terms.
        max_results: maximum number of results requested from the API.

    Returns:
        List of dicts with the keys 'title', 'video_id' and 'url', in the order
        the API returned them. Titles are HTML-unescaped, because the API
        delivers them with entities such as "&amp;".
    """
    youtube = build('youtube', 'v3', developerKey=api_key)

    # type='video' keeps channels and playlists from using up the result quota.
    search_response = youtube.search().list(
        q=query,
        part='id,snippet',
        type='video',
        maxResults=max_results
    ).execute()

    videos = []
    for search_result in search_response.get('items', []):
        # defensive: still ignore anything that is not a video
        if search_result['id']['kind'] != 'youtube#video':
            continue
        video_id = search_result['id']['videoId']
        videos.append({
            'title': html.unescape(search_result['snippet']['title']),
            'video_id': video_id,
            'url': f"https://www.youtube.com/watch?v={video_id}"
        })

    return videos

In [76]:
def get_video_duration(api_key, video_id):
    """Return the duration string the YouTube API reports for ``video_id``.

    The value is taken from ``contentDetails.duration`` of the first item in
    the ``videos.list`` response.
    """
    client = build('youtube', 'v3', developerKey=api_key)
    request = client.videos().list(part='contentDetails', id=video_id)
    first_item = request.execute()['items'][0]
    return first_item['contentDetails']['duration']

In [77]:
def get_most_popular_videos(api_key, query, min_duration_seconds=300):
    """Return the first search result that is longer than ``min_duration_seconds``.

    Results are checked in the order search_youtube_videos() returns them; no
    popularity ranking is applied here.

    Args:
        api_key: YouTube Data API v3 key.
        query: search terms.
        min_duration_seconds: exclusive lower bound on the video length.
            Defaults to 300 seconds (5 minutes), the value that used to be
            hard-coded (the old comment wrongly called it 20 minutes).

    Returns:
        The video dict from search_youtube_videos(), or ``None`` when the
        search is empty or no result is long enough.
    """
    videos = search_youtube_videos(api_key, query)

    if not videos:
        return None

    for video in videos:
        duration = parse_duration(get_video_duration(api_key, video['video_id']))
        if duration.total_seconds() > min_duration_seconds:
            return video

    return None

In [105]:
def youtube_videos(query):
    """Look up one suitable YouTube video for ``query``.

    The API key is read from the ``YOUTUBE_API_KEY`` environment variable
    rather than being hard-coded in the notebook. The key that used to be
    committed here should be treated as leaked and revoked.

    Returns:
        ``[title, url]`` of the video chosen by get_most_popular_videos(), or
        ``None`` when no video qualifies.

    Raises:
        RuntimeError: the ``YOUTUBE_API_KEY`` environment variable is not set.
    """
    api_key = os.environ.get("YOUTUBE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API v3 key."
        )

    output = None
    most_popular_video = get_most_popular_videos(api_key, query)
    if most_popular_video:
        output = [most_popular_video['title'], most_popular_video['url']]
    # Pause after every lookup, as the original did (presumably to stay within
    # the API rate limits -- TODO confirm it is still needed).
    time.sleep(5)

    return output

In [79]:
# Sample input 1: a short text about digital control of model railroads.
input_string = """standardized system utilized in model railroading 
to manage locomotives and accessories via a digital signal transmitted through
the tracks. Unlike traditional analog systems where locomotives receive power directly
from the tracks, it enables precise control of multiple trains independently 
on the same track without the need for separate wiring or electrical blocks. 
Each locomotive is equipped with a decoder that receives commands from a central 
controller or throttle, allowing operators to control speed, direction, lighting,
and sound functions. This systems offer enhanced realism and flexibility, enabling model
railroaders to replicate real-world train operations more accurately and create 
immersive layouts with intricate control over their trains and accessories.""" 

In [80]:
# Sample input 2: a short text about threads; this is the one passed to MainProgram below.
input_string_2 = """
In computing, threads enable programs to execute multiple tasks simultaneously. 
They divide the workload into smaller chunks, allowing for efficient resource allocation and parallel execution. 
This enhances performance and responsiveness, akin to a juggler effortlessly managing multiple objects at once."""

In [81]:
# Sample input 3: a longer text comparing document and relational database models.
input_string_3 = """When considering database models, the document type model offers a higher degree of flexibility compared to relational models. Document type models excel particularly in handling tree-based data structures, where hierarchical relationships are prevalent. In this model, data is stored in a document format, such as JSON or XML, allowing for nested and varied structures within each document. This flexibility accommodates evolving data schemas and unstructured data well, making it suitable for applications with evolving data requirements or diverse data formats.
Conversely, relational database models are well-suited for managing many-to-many relationships between entities. These models organize data into tables with rows and columns, enforcing a structured schema defined by the relational schema. This structure facilitates efficient querying and data retrieval, especially when dealing with complex relationships between entities. Relational databases excel in maintaining data integrity through normalization techniques, ensuring consistency and accuracy in data storage and retrieval operations.
In summary, while document type models prioritize flexibility and adaptability, making them ideal for managing tree-based data, relational database models excel in handling many-to-many relationships and ensuring data integrity within structured environments. The choice between these models depends on the specific requirements and characteristics of the data and the application context."""

In [87]:
# Build the pipeline for the threads sample text; this loads every model/vectorizer pair from disk.
main = MainProgram(input_string_2)

In [88]:
# Score the input text with every loaded model; the predictions are written into BOOK_INFO.
main.get_related_all_chapters()

In [89]:
# Collect the chapters above the probability threshold, ordered by descending probability.
results = get_sortedBy_importance(get_statistics_per_book())

In [103]:
# Print the generated syllabus.
print_output(results)

GENERATED SYLLABUS:
------------------------------------------------------------------------------------------
[21 HOURS] FROM 'Operating System Concepts: WILEY', you should look chapter 4.

        	Chapter 4 Main Subjects:
			Overview, Multicore Programming, Multithreading Models, ThreadLibraries, Implicit Threading, Operating-System Examples, Summary
	Please watch Multithreading Models &amp; Hyperthreading from the link below:
			 https://www.youtube.com/watch?v=HW2Wcx-ktsc



[7 HOURS] FROM 'Computer Networking', you should look chapter 2.

        	Chapter 2 Main Subjects:
			application layer, principles of network applications, network application architectures, processes communicating, transport services available to applications, transport services provided by the internet, application layer protocols, network applications covered in this book, the web and http, overview of http, non persistent and persistent connections, http message format, user server interaction cookies, w

In [6]:
import os
import csv

In [19]:
def save_to_csv(result, book_name, start, end):
    """Write a contents dictionary to ``results/<book_name>.csv``.

    The file has a "Key","Value" header followed by one row per dictionary
    item. It is written as UTF-8 so that non-ASCII characters in titles (such
    as typographic quotes) do not depend on the platform's default encoding.
    An existing file with the same name is overwritten.

    Args:
        result: dict mapping chapter keys to contents strings.
        book_name: base name of the CSV file (without extension).
        start: unused; kept so existing calls keep working.
        end: unused; kept so existing calls keep working.
    """
    # Ensure the "results/" directory exists
    os.makedirs("results", exist_ok=True)

    csv_file_path = f"results/{book_name}.csv"

    # newline='' lets the csv module control line endings itself
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Key", "Value"])
        writer.writerows(result.items())

    print(f"CSV file created at: {csv_file_path}")

In [28]:

# Export the flattened table of contents of every book to results/<name>.csv.
# Each job is: (PDF path, contents parser, parser page arguments, CSV base name,
# start/end values forwarded to save_to_csv).
# NOTE(review): get_unit() pairs find_contents_page_E1 with "Books/ethics_1.pdf"
# and find_contents_page_E2 with "Books/ethics_2.pdf", whereas this cell reads
# ethics_2.pdf / ethics_3.pdf -- confirm which file names are current.
CSV_EXPORT_JOBS = [
    ("Books/computer_architecture.pdf", find_contents_page_CAB, (9, 17), "computer_architecture", (9, 17)),
    ("Books/data_intensive.pdf", find_contents_page_DIB, (6, 12), "data_intensive", (6, 12)),
    ("Books/ethics_2.pdf", find_contents_page_E1, (8, 14), "ethics_2", (8, 14)),
    ("Books/ethics_3.pdf", find_contents_page_E2, (6, 28), "ethics_3", (6, 28)),
    # the OS parser is called without a page range; save_to_csv ignores 1, 2
    ("Books/os.pdf", find_contents_page_OS, (), "os", (1, 2)),
    ("Books/HCI.pdf", find_contents_page_HCI, (5, 10), "HCI", (5, 10)),
    ("Books/JavaScript.pdf", find_contents_page_JS, (4, 14), "JavaScript", (4, 14)),
    ("Books/network_1.pdf", find_contents_page_N1, (4, 11), "network_1", (4, 11)),
    ("Books/Robot_OS.pdf", find_contents_page_RO, (4, 9), "Robot_OS", (4, 9)),
    ("Books/Robotics_python.pdf", find_contents_page_RP, (4, 9), "Robotics_python", (4, 9)),
]

for pdf_path, parse_contents, page_args, csv_name, csv_range in CSV_EXPORT_JOBS:
    pdf = PyPDF2.PdfReader(pdf_path)
    flattened_contents = parse_contents(pdf, *page_args)
    save_to_csv(flattened_contents, csv_name, *csv_range)

CSV file created at: results/computer_architecture.csv
CSV file created at: results/data_intensive.csv
CSV file created at: results/ethics_2.csv
CSV file created at: results/ethics_3.csv
CSV file created at: results/os.csv
CSV file created at: results/HCI.csv
CSV file created at: results/JavaScript.csv
CSV file created at: results/network_1.csv
CSV file created at: results/Robot_OS.csv
CSV file created at: results/Robotics_python.csv


In [24]:
# Export Books/ethics_2.pdf with the E2 parser (page indices 6 up to, not including, 28).
# NOTE(review): this writes results/ethics_2.csv, the same file the bulk export
# cell produces from the E1 parser -- whichever cell runs last wins; confirm
# which parser belongs to this book.
pdf = PyPDF2.PdfReader("Books/ethics_2.pdf")
flattened_contents = find_contents_page_E2(pdf, 6, 28)
save_to_csv(flattened_contents, "ethics_2", 6, 28)

CSV file created at: results/ethics_2.csv
