In [2]:
import sys
import os

# Add the Backend directory dynamically
current_dir = os.getcwd()  # Get the current working directory
sys.path.insert(0, os.path.abspath(os.path.join(current_dir, '../..')))

# Import the required modules
from BusinessLayer.User.User import User
from BusinessLayer.User.UserFacade import *

from BusinessLayer.Util.Exceptions import *


In [2]:
invalid_mail = "david.volodarski1@gmail.com"
valid_mail = "volodavi@post.bgu.ac.il"
valid_mail1 = "puzis@bgu.ac.il"

In [9]:
user_controller = UserFacade()
len(user_controller.users_byEmail)

0

In [None]:
user_controller.register(valid_mail, "12a345D{", "david", "volodarsky")

In [22]:
import requests
from bs4 import BeautifulSoup

# URL of the main page containing the iframe
main_url =  "https://in.bgu.ac.il/engn/iem/Pages/CoursesList1.aspx"


# Fetch the main page
response = requests.get(main_url)
response.raise_for_status()
main_html = response.text

# Parse the main page
main_soup = BeautifulSoup(main_html, "html.parser")

# Find the iframe and extract its 'src' attribute
iframe = main_soup.find("iframe")
iframe_url = iframe["src"]

# Full URL of the iframe content (handle relative URLs)
iframe_url = requests.compat.urljoin(main_url, iframe_url)

# Fetch the iframe content
iframe_response = requests.get(iframe_url)
iframe_response.raise_for_status()
iframe_html = iframe_response.text

# Parse the iframe content
iframe_soup = BeautifulSoup(iframe_html, "html.parser")

# Locate the table within the iframe content
table = iframe_soup.find("table")  # Adjust this based on actual structure
print(table)
rows = table.find_all("tr") if table else []

# Extract the table data
courses = {}
for row in rows:
    cells = row.find_all("td")
    if len(cells) == 3:  # Assuming 3 columns: course_id, active_semester, course_name
        course_id = cells[0].text.strip()
        active_semester = cells[1].text.strip()
        course_name = cells[2].text.strip()
        courses[course_id] = {
            "active_semester": active_semester,
            "course_name": course_name
        }

# Print or process the extracted courses
print(courses)


None
{}


In [None]:
import requests
from bs4 import BeautifulSoup

# URL of the iframe or webpage containing the table
url = "https://in.bgu.ac.il/engn/iem/Pages/CoursesList1.aspx"

# Fetch the HTML content
response = requests.get(url)
response.raise_for_status()  # Ensure the request was successful
html_content = response.text

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")
print(soup)
# Find the rows in the table
rows = soup.find_all("tr")

# Extract data from each row
courses = {}
for row in rows:
    cells = row.find_all("td")
    if len(cells) >= 3:  # Ensure it has enough columns
        course_id = cells[0].text.strip()
        active_semester = cells[1].text.strip()
        course_name = cells[2].text.strip()
        courses[course_id] = {
            "active_semester": active_semester,
            "course_name": course_name
        }

# Print or use the extracted courses dictionary
print(courses)


In [3]:
from PyPDF2 import PdfReader
import re

def extract_syllabus_topics4(file_path, topic_patterns):
    """
    Extracts syllabus topics from a course PDF file and returns them as a set.
    Handles diverse formats such as tables, bullet points, numbered sections, and headers.

    :param file_path: Path to the PDF file
    :param topic_patterns: List of regex patterns to identify syllabus-related sections
    :return: A set of topics from the syllabus
    """
    reader = PdfReader(file_path)
    syllabus_topics = set()

    for page in reader.pages:
        text = page.extract_text()

        # Match topics using provided patterns
        for pattern in topic_patterns:
            matches = re.findall(pattern, text, re.DOTALL)
            for match in matches:
                # Split potential topics by common delimiters and clean up
                topics = re.split(r',|;|\n|\•|\.', match)
                syllabus_topics.update([topic.strip() for topic in topics if topic.strip()])

        # Handle bullet points
        lines = text.split("\n")
        for line in lines:
            if re.match(r'^\•', line):  # Matches lines starting with "•"
                syllabus_topics.add(line.lstrip("• ").strip())

        # Handle numbered sections (e.g., "1. Topic", "2. Topic")
        for line in lines:
            if re.match(r'^\d+\.\s', line):  # Matches lines starting with "1. ", "2. ", etc.
                syllabus_topics.add(line.strip())

        # Handle keywords directly in the text
        if any(keyword in text for keyword in ["סילבוס", "Topics", "Outline"]):
            for line in lines:
                # Add lines containing relevant keywords as potential topics
                if any(keyword in line for keyword in ["סילבוס", "Topics", "Outline"]):
                    syllabus_topics.add(line.strip())

    return syllabus_topics


In [6]:
from tabula import read_pdf
import pandas as pd

def extract_table_with_topics_final(pdf_path, topics, pages="all"):
    """
    Extracts tables from a PDF, matches column titles to a list of topics,
    and returns data under matching columns.

    :param pdf_path: Path to the PDF file
    :param topics: List of column titles to match
    :param pages: Pages to extract tables from (default: "all")
    :return: Set of data under matching columns
    """
    matching_data = set()

    try:
        # Extract tables using Tabula
        tables = read_pdf(pdf_path, pages=pages, multiple_tables=True, pandas_options={"header": None})

        if not tables:
            print("No tables found in the PDF.")
            return matching_data

        for i, table in enumerate(tables):
            print(f"Processing Table {i + 1}")

            # Assume the first row is the header
            df = pd.DataFrame(table)
            headers = df.iloc[0]
            df.columns = headers
            df = df[1:]  # Remove the header row

            # Clean up headers for matching
            df.columns = df.columns.str.strip()

            print(f"Normalized Headers: {list(df.columns)}")

            # Check for matching columns
            for column in df.columns:
                if any(topic in column for topic in topics):
                    print(f"Matching column found: {column}")
                    matching_data.update(df[column].dropna().tolist())

    except Exception as e:
        print(f"Error during table extraction: {e}")

    return matching_data


# Example usage
pdf_path = "/mnt/data/bsisi_netunim.pdf"  # Path to your PDF file
topics = ["נושאי השיעור", "Topics", "Outline"]  # List of column headers to search for
file_path1 = "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/bsisi_netunim.pdf"

# Extract data
matching_data = extract_table_with_topics_final(file_path1, topics)

# Print results
print("Matching Data:")
print(matching_data)


Failed to import jpype dependencies. Fallback to subprocess.
No module named 'jpype'


Processing Table 1
Normalized Headers: ['קריאות', 'מועד פרסום', 'נושאי השיעור', 'פגישה']
Matching column found: נושאי השיעור
Matching Data:
{'נורמליזציה  של  נתונים;  תלויות', 'אלגברה רלציונית', 'מיפוי ERD לסכמת בסיס נתונים רלציונית .', 'הנתונים', 'SQL שפת', 'מבוא  על  בסיסי  נתונים  ,DBMS,  סכמות', 'ER)(ותרשים  ;ERD', 'מבוא לתכנות באמצעות בסיסי נתונים', 'מודלים  סמנטיים;  מודל  ישויות - וקשרים', 'ומודלים .', 'פונקציונאליות  ומורכבות;  כללי  נרמול', 'מבוא ל עיבוד תנועות', 'וסינתזה;  תהליך  עיצוב  סכמת  בסיס', 'המודל הרלציוני', 'אופטימיזציה של שאילתות'}


In [4]:
import pdfplumber
import os

def has_valid_table_with_pdfplumber(pdf_path, min_rows=2, min_columns=2):
    """
    Checks if a PDF contains at least one valid table using pdfplumber.

    :param pdf_path: Path to the PDF file
    :param min_rows: Minimum number of rows to validate a table
    :param min_columns: Minimum number of columns to validate a table
    :return: True if at least one valid table is found, False otherwise
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages):
                tables = page.extract_tables()  # Extract tables from the page
                if tables:
                    for table in tables:
                        # Validate table structure
                        if len(table) >= min_rows and len(table[0]) >= min_columns:
                            print(f"Valid table found in {os.path.basename(pdf_path)} on page {page_num + 1}")
                            return True
        print(f"No valid tables in {os.path.basename(pdf_path)}")
        return False
    except Exception as e:
        print(f"Error processing {os.path.basename(pdf_path)}: {e}")
        return False

In [5]:
import pdfplumber
import re

def extract_syllabus_topics_with_pdfplumber(file_path, topic_patterns):
    """
    Extracts syllabus topics from a course PDF file using pdfplumber and returns them as a set.
    Handles diverse formats such as tables, bullet points, numbered sections, and headers.

    :param file_path: Path to the PDF file
    :param topic_patterns: List of regex patterns to identify syllabus-related sections
    :return: A set of topics from the syllabus
    """
    syllabus_topics = set()
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    # Match topics using provided patterns
                    for pattern in topic_patterns:
                        matches = re.findall(pattern, text, re.DOTALL)
                        for match in matches:
                            # Split potential topics by common delimiters and clean up
                            topics = re.split(r',|;|\n|\•|\.', match)
                            syllabus_topics.update([topic.strip() for topic in topics if topic.strip()])

                    # Handle bullet points
                    lines = text.split("\n")
                    for line in lines:
                        if re.match(r'^\•', line):  # Matches lines starting with "•"
                            syllabus_topics.add(line.lstrip("• ").strip())

                    # Handle numbered sections (e.g., "1. Topic", "2. Topic")
                    for line in lines:
                        if re.match(r'^\d+\.\s', line):  # Matches lines starting with "1. ", "2. ", etc.
                            syllabus_topics.add(line.strip())

                    # Handle keywords directly in the text
                    if any(keyword in text for keyword in ["סילבוס", "Topics", "Outline"]):
                        for line in lines:
                            # Add lines containing relevant keywords as potential topics
                            if any(keyword in line for keyword in ["סילבוס", "Topics", "Outline"]):
                                syllabus_topics.add(line.strip())
    except Exception as e:
        print(f"Error processing PDF with pdfplumber: {e}")

    return syllabus_topics

# Example usage


In [3]:
def extract_syllabus_topic_total(pdf_path):
    topic_patterns = [
    r'סילבוס[:\n](.*?)\n',  # Hebrew pattern for "Syllabus"
    r'סילבוס באנגלית[:\n](.*?)\n',
    r'סילבוס בעברית[:\n](.*?)\n',
]
    topic_patterns1 = [
    r'נושאים[:\n](.*?)\n',  # Hebrew pattern for "Topics"
    r'Course Topics[:\n](.*?)\n',
    r'Outline[:\n](.*?)\n',
]
    topics_table = ["נושאי השיעור","נושא השיעור", "Topics", "Outline"]  # List of column headers to search for

    topics = set()
    # print(topics)
    pdf_copy = pdf_path
    has_table = has_valid_table_with_pdfplumber(pdf_path)
    if not has_table:
        topics = extract_syllabus_topics_with_pdfplumber(pdf_copy,topic_patterns)
        if len(topics)==0:
            topics = extract_syllabus_topics_with_pdfplumber(pdf_path,topic_patterns1)
    else:
   
         topics = extract_table_with_topics_final(pdf_path,topics_table )
    #     topics = extract_syllabus_topic9(pdf_path)
    # cleaned_topics = {topic.lstrip("• ").strip() for topic in topics}
    cleaned_topics = set()
    for topic in topics:
        # Remove leading numbers (e.g., "1.", "2. ", etc.)
        topic = re.sub(r"^\d+\.\s*", "", topic)
        # Remove leading special characters like "•", "*", etc.
        topic = topic.lstrip("•* ").strip()
        if topic:  # Only keep non-empty topics
            cleaned_topics.add(topic)
    

    return cleaned_topics

In [8]:
file_path = "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/data_strcutre.pdf"  # Replace with the correct path to the PDF
file_pdf1 = "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/bsisi_netunim.pdf"
pdf_path3 = "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/histabrut.pdf"
pdf_4 = "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/compilatin.pdf"
pdf_5 = "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/decsionmaker.pdf"
pdf_6 = "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/mavo_miki.pdf"
pdf_7 = "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/dc_mavo.pdf"
pdf_8 =    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/modelim.pdf" 
pdf_9 =     "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/eimut.pdf"

In [9]:
import pdfplumber

def extract_text_with_pdfplumber(file_path):
    with pdfplumber.open(file_path) as pdf:
        full_text = ""
        for page in pdf.pages:
            text = page.extract_text()
            full_text += text or ""  # Add text if it exists
        return full_text

# Test the function
# file_path = "path_to_your_pdf.pdf"
extracted_text = extract_text_with_pdfplumber(file_path)
print(extracted_text)


ב גנב ןוירוג-ןב תטיסרבינוא
בשחמה יעדמל הקלחמה - עבטה יעדמל הטלוקפה
ב"פשת 'ב רטסמס
סרוק סובליס
ם ינותנ ינבמ : סרוק םש
Data Structures :תילגנאב סרוק םש
202-1-1031 :סרוק רפסמ
הבוח :סרוק גוס
5.0 :ז"קנ
שמש לכימ 'בג ,רוצ לקד 'פורפ ,ימרכ זפ 'פורפ :סרוקה הצרמ
:םדק תושירד
202-1-1011 – בשחמה יעדמל אובמ
:תילגנאב סובליס
• Growth of functions & algorithm analysis
• Recurrences
• Basic ADT’s: stacks & queues
• Binary search trees
• AVL-Trees
• B-Trees
• Probability basics
• Skip lists
• Hash tables
• Bloom filter
• Priority queues (heaps).
• Compression: Huffman, Lempel-Ziv
• Quicksort
• Median (deterministic and randomized algorithms).
• Sorting in Linear Time
• Elementary graph algorithms: BFS, DFS, topological Sortב גנב ןוירוג-ןב תטיסרבינוא
בשחמה יעדמל הקלחמה - עבטה יעדמל הטלוקפה
ב"פשת 'ב רטסמס
• Amortized analysis
• Data structures for disjoint sets (union find)
• MST: Kruskal, Prim
.סרוקה תינכותב םייוניש ונכתי
:סרוקה אשונו תרטמ
ןונכתלו בשחמב םינותנ ינבמב שומישו הרדגהל תיטקרטסבא תלוכי חתפל איה ס

In [19]:
pdf_files = [
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/data_strcutre.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/bsisi_netunim.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/histabrut.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/compilatin.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/decsionmaker.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/mavo_miki.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/eimut.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/modelim.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/dc_mavo.pdf"
]

In [26]:
# file_path = "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/data_strcutre.pdf"  # Replace with the correct path to the PDF
# file_path1 = "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/bsisi_netunim.pdf"
# pdf2 = "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/bsisi_netunim.pdf"
pdf_analiza = "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/NegevNerds/syllabus_anl.pdf"
res6 = extract_syllabus_topic_total("/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/cropped/mavo_miki_cropped.pdf")
print(len(res6))
res6

Valid table found in mavo_miki_cropped.pdf on page 2
Processing Table 1
Normalized Headers: ['קריאה נדרשת', 'נושא השיעור', np.float64(nan), "מס'"]
Matching column found: נושא השיעור
Error during table extraction: argument of type 'numpy.float64' is not iterable
20


{'(outlier  (  חריגים  זיהוי  .)data cleaning (',
 ') matplotlib ו express',
 ')Pandas (  DataFrames עבודהעם',
 '.)datamanipulation(  ניקוי נתונים  עיבוד נתונים',
 '.)intelligence',
 '.)transformation',
 '.Entity-Relationship מודל',
 '.NoSQL גישת',
 'Regular Expressions כמוכןנלמדלעבודעם',
 'דוגמאות ש ל פרויקטים מעשיים .',
 'המרת data  (  נתונים  .detection',
 'הנתונים  כולל  הרקע  ההיסטורי  והטכנולוגי .',
 'ויזואליזציה  של  נתונים plotly   עם  (עבודה',
 'לשפת  .SQL',
 'מבוא  לקורס .  סקירה  כללית  של  תחום  מדעי',
 'מחסני  נתונים business  (  עסקית  ובינה',
 'נתונים , מאגרי  נתונים , תכנון  מסדי  נתונים',
 'עבודה עם קבצים',
 'רלציוניים .',
 'שליפת  מידע  ממסדי  נתונים  רלציוניים .  מבוא'}

In [142]:
pdf_files = [
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/data_strcutre.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/bsisi_netunim.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/histabrut.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/compilatin.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/decsionmaker.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/mavo_miki.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/eimut.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/modelim.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/dc_mavo.pdf"
]

No valid tables in data_strcutre.pdf
21


{'AVL-Trees',
 'Amortized analysis',
 'B-Trees',
 'Basic ADT’s: stacks & queues',
 'Binary search trees',
 'Bloom filter',
 'Compression: Huffman, Lempel-Ziv',
 'Data structures for disjoint sets (union find)',
 'Elementary graph algorithms: BFS, DFS, topological Sort',
 'Growth of functions & algorithm analysis',
 'Hash tables',
 'Introduction to Algorithms (3rd edition), Cormen, Leiserson, Rivest and',
 'MST: Kruskal, Prim',
 'Median (deterministic and randomized algorithms).',
 'Open data structures, Pat Morin.',
 'Priority queues (heaps).',
 'Probability basics',
 'Quicksort',
 'Recurrences',
 'Skip lists',
 'Sorting in Linear Time'}

In [146]:

# List of PDF files to check
pdf_files = [
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/data_strcutre.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/bsisi_netunim.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/histabrut.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/compilatin.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/decsionmaker.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/mavo_miki.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/eimut.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/modelim.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/dc_mavo.pdf"
]

# Check each file for tables
# for pdf_file in pdf_files:
#     result = has_valid_table_with_pdfplumber(pdf_file)
#     print(f"{os.path.basename(pdf_file)} has table: {result}")


In [2]:
import pdfplumber
from PyPDF2 import PdfWriter, PdfReader
import os

def crop_pdf_top_margin(pdf_path, margin_cm=4.0):
    """
    Crops a specified top margin (in centimeters) from all pages of a PDF,
    saves the result in the same directory with '_cropped' appended to the file name,
    and returns the path to the new PDF.
    
    :param pdf_path: Path to the original PDF file.
    :param margin_cm: Top margin to crop, specified in centimeters.
    :return: Path to the cropped PDF.
    """
    # Convert centimeters to points (1 cm = 28.35 points)
    cm_to_points = margin_cm * 28.35

    # Prepare output file path (same directory, _cropped appended)
    dir_name = os.path.dirname(pdf_path)
    base_name = os.path.basename(pdf_path).replace('.pdf', '_cropped.pdf')
    output_path = os.path.join(dir_name, base_name)

    # Initialize PDF writer
    pdf_writer = PdfWriter()

    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            width, height = page.width, page.height
            # Crop the top margin
            cropped_page_bbox = (0, cm_to_points, width, height)

            # Use PyPDF2 to adjust page size
            reader = PdfReader(pdf_path)
            page_to_write = reader.pages[i]
            page_to_write.mediabox.upper_left = (0, height - cm_to_points)

            # Add the adjusted page to the writer
            pdf_writer.add_page(page_to_write)

    # Save the cropped PDF to the same directory
    with open(output_path, "wb") as out_file:
        pdf_writer.write(out_file)

    print(f"Cropped PDF saved to: {output_path}")
    return output_path


In [1]:
pdf_files = [
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/data_strcutre.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/bsisi_netunim.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/histabrut.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/compilatin.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/decsionmaker.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/mavo_miki.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/eimut.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/modelim.pdf",
    "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/dc_mavo.pdf"
]

In [7]:
ouptdir = "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/cropped"
pdf_analiza = "/Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/NegevNerds/syllabus_anl.pdf"
cropt_pdf =crop_pdf_top_margin(pdf_files[5])
res7 = extract_syllabus_topic_total(cropt_pdf)
res7

Cropped PDF saved to: /Users/davidvolodarsky/Desktop/Semeters/Semester_G/NegevNerds/sylbus_analyzer/mavo_miki_cropped.pdf
Valid table found in mavo_miki_cropped.pdf on page 2
Processing Table 1
Normalized Headers: ['קריאה נדרשת', 'נושא השיעור', np.float64(nan), "מס'"]
Matching column found: נושא השיעור
Error during table extraction: argument of type 'numpy.float64' is not iterable


{'(outlier  (  חריגים  זיהוי  .)data cleaning (',
 ') matplotlib ו express',
 ')Pandas (  DataFrames עבודהעם',
 '.)datamanipulation(  ניקוי נתונים  עיבוד נתונים',
 '.)intelligence',
 '.)transformation',
 '.Entity-Relationship מודל',
 '.NoSQL גישת',
 'Regular Expressions כמוכןנלמדלעבודעם',
 'דוגמאות ש ל פרויקטים מעשיים .',
 'המרת data  (  נתונים  .detection',
 'הנתונים  כולל  הרקע  ההיסטורי  והטכנולוגי .',
 'ויזואליזציה  של  נתונים plotly   עם  (עבודה',
 'לשפת  .SQL',
 'מבוא  לקורס .  סקירה  כללית  של  תחום  מדעי',
 'מחסני  נתונים business  (  עסקית  ובינה',
 'נתונים , מאגרי  נתונים , תכנון  מסדי  נתונים',
 'עבודה עם קבצים',
 'רלציוניים .',
 'שליפת  מידע  ממסדי  נתונים  רלציוניים .  מבוא'}

In [36]:
res7 = extract_syllabus_topic_total(cropt_pdf)
res7

Valid table found in mavo_miki_cropped.pdf on page 2
Processing Table 1
Normalized Headers: ['קריאה נדרשת', 'נושא השיעור', np.float64(nan), "מס'"]
Matching column found: נושא השיעור
Error during table extraction: argument of type 'numpy.float64' is not iterable


{'(outlier  (  חריגים  זיהוי  .)data cleaning (',
 ') matplotlib ו express',
 ')Pandas (  DataFrames עבודהעם',
 '.)datamanipulation(  ניקוי נתונים  עיבוד נתונים',
 '.)intelligence',
 '.)transformation',
 '.Entity-Relationship מודל',
 '.NoSQL גישת',
 'Regular Expressions כמוכןנלמדלעבודעם',
 'דוגמאות ש ל פרויקטים מעשיים .',
 'המרת data  (  נתונים  .detection',
 'הנתונים  כולל  הרקע  ההיסטורי  והטכנולוגי .',
 'ויזואליזציה  של  נתונים plotly   עם  (עבודה',
 'לשפת  .SQL',
 'מבוא  לקורס .  סקירה  כללית  של  תחום  מדעי',
 'מחסני  נתונים business  (  עסקית  ובינה',
 'נתונים , מאגרי  נתונים , תכנון  מסדי  נתונים',
 'עבודה עם קבצים',
 'רלציוניים .',
 'שליפת  מידע  ממסדי  נתונים  רלציוניים .  מבוא'}

In [None]:
public void dfs(int depthLimit, int minFreeMemory) {
    boolean depthLimitReached = false;
    int depth = 0;
    while (true) {
        if (checkAndResetBacktrackRequest() || !isNewState() || isEndState() ||
            isIgnoredState() || depthLimitReached) {
            if (!backtrack()) { // backtrack not possible, done
            return;
            }
            depthLimitReached = 
                false;
            depth--;
            notifyStateBacktracked();
            } if (
                forward()) {
                depth++;
                notifyStateAdvanced();
                if (currentError != null) {
                    notifyPropertyViolated();
                    if (hasPropertyTermination()) {
                    return;
                    }
                } if (
                    depth >= depthLimit) {
                    depthLimitReached = true;
                    notifySearchConstraintHit("depth limit reached: " + depthLimit);
                    continue;
                } if (!checkStateSpaceLimit(minFreeMemory)) {
                    notifySearchConstraintHit("memory limit reached: " + minFreeMemory);
                    return;
                }
                } else { // forward did not execute any instructions
                notifyStateProcessed();
                }
    }
