In [1]:
from bs4 import BeautifulSoup as bs
from BS_extension import MyDivExtension
import markdown
from Generate_slides import HeaderSlideCreator, ListSlideCreator, CodeSlideCreator, ImageSlideCreator
import os
import re
from markdownify import markdownify as md
from input_methods import InputMethods, FileNameMethods
from copy import deepcopy

In [7]:
class Parser:
    def __init__(self, markdown_path, created_slides_path) -> None:
        self.created_slides_path = created_slides_path
        self.markdown_path = markdown_path
        self.slide_counter = 0
        
        self.input_extractors = {
            'h1': InputMethods.get_input_h1,
            'img': InputMethods.get_input_img,
            'code': InputMethods.get_input_code,
            ('div', 'mydiv'): InputMethods.get_input_list  # Maps div with class 'mydiv' to get_input_list
        }
        self.slide_creators = {
            'h1': HeaderSlideCreator,
            'img': ImageSlideCreator,
            'code': CodeSlideCreator,
            ('div', 'mydiv'): ListSlideCreator
        }
        self.file_name_extractors = {
            'h1': FileNameMethods.get_file_name_h1,
            'img': FileNameMethods.get_file_name_img,
            'code': FileNameMethods.get_file_name_code,
            ('div', 'mydiv'): FileNameMethods.get_file_name_list
        }            
            


    def _create_soup(self):
        with open(self.markdown_path) as f:
            text = f.read()
        html = markdown.markdown(text, extensions=[MyDivExtension()])
        return bs(html, "html.parser")
    
    # Function to get the indices of elements to process
    @staticmethod
    def _get_element_indices(soup):
        def element_filter(tag):
            # Check if tag is 'h1', 'img', or 'code'
            if tag.name in ["h1", "img", "code"]:
                return True
            # Check if tag is 'div' with class 'mydiv'
            if tag.name == "div" and tag.get("class") == ["mydiv"]:
                return True
            return False

        # Return the indices of elements in 'soup' that match the filter
        return [i for i, element in enumerate(soup.find_all()) if element_filter(element)]
    
    def generate_href_html(self):
        html_for_editing = self._create_soup()
        all_elements = list(html_for_editing.find_all())
        element_indices = self._get_element_indices(html_for_editing)

        # Loop over each index and insert a new paragraph with the count
        for count, index in enumerate(element_indices, start=1):
            # Find the corresponding element in 'soup_md'
            corresponding_element = all_elements[index]
            # get filename
            filename = [f for f in os.listdir("created_slides") if f.startswith(f"{count:02}")][0]
            # insert an a tag with the href to the created image
            link_tag = html_for_editing.new_tag("a", href=f"./created_slides/{filename}")
            corresponding_element.append(link_tag)

        return html_for_editing
    
    def _save_slide(self, image, filename):
        # Ensure directory exists
        os.makedirs(self.created_slides_path, exist_ok=True)
        
        # Create full path for saving
        full_path = os.path.join(self.created_slides_path, filename)
        
        # Save the image
        image.save(full_path)
        print(f"Slide saved at {full_path}")
                

    def parse(self):
        soup = self._create_soup()

        for element in soup.find_all():
            # Check if both element name and class match a specific tuple key
            if (element.name, element.get('class')[0] if element.get('class') else None) in self.input_extractors:
                key = (element.name, element.get('class')[0])
            else:
                key = element.name

            try:
                # Extract input for the Slide creation from element
                input = self.input_extractors[key](element)

                # Instantiate the correct SlideCreator class and create the slide
                slide_creator = self.slide_creators[key]()
                if key == 'img' or key == 'h1':
                    slide = slide_creator.create_slide(input)
                elif key == 'code' or key == ('div', 'mydiv'):
                    slide = slide_creator.create_slide(*input)
                self.slide_counter += 1
    	        # Get file name 
                file_name = f"{self.slide_counter:02}_{element.name}_{self.file_name_extractors[key](element)}.png"
                

                # Save the slide
                self._save_slide(slide, file_name)



            except KeyError:
                print(f"Element {element.name} not supported")
                
        pass
            


In [8]:
parser = Parser("test.md", "created_slides")
#parser.parse()
html = parser.generate_href_html()
print(html.prettify())


<h1>
 Create Venv
 <a href="./created_slides/01_h1_Create_Venv.png">
 </a>
</h1>
<p>
 <code>
  python
print("Hello, world!")
for i in range(10):
    print(i)
  <a href="./created_slides/02_code_.png">
  </a>
 </code>
</p>
<p>
 <code>
  typescript hello world code
console.log('Hello, world!');
for (let i = 0; i &lt; 10; i++) {
    console.log(i);
}
  <a href="./created_slides/03_code_hello_world_code.png">
  </a>
 </code>
</p>
<p>
 <img alt="Cyan Blue Image" src="./backgrounds/cyan_blue(5).png">
  <a href="./created_slides/04_img_Cyan_Blue_Image.png">
  </a>
 </img>
</p>
<div class="mydiv">
 <h3>
  List Header
 </h3>
 <ul>
  <li>
   first item
  </li>
  <li>
   second item
  </li>
  <li>
   third item
  </li>
 </ul>
 <a href="./created_slides/05_div_List_Header.png">
 </a>
</div>
<h1>
 pip install bs4
 <a href="./created_slides/06_h1_pip_install_bs4.png">
 </a>
</h1>
<h3>
 another h3
</h3>
<p>
 some more text
</p>
<p>
 ðŸ˜Ž
</p>



In [11]:
with open("test.md") as f:
    text = f.read()

# create html code with markdownify
html = markdown.markdown(text, extensions=[MyDivExtension()])

soup = bs(html, "html.parser")
soup_md = deepcopy(soup)
print(soup.prettify())

<h1>
 Create Venv
</h1>
<p>
 <code>
  python
print("Hello, world!")
for i in range(10):
    print(i)
 </code>
</p>
<p>
 <code>
  typescript hello world code
console.log('Hello, world!');
for (let i = 0; i &lt; 10; i++) {
    console.log(i);
}
 </code>
</p>
<p>
 <img alt="Cyan Blue Image" src="./backgrounds/cyan_blue(5).png"/>
</p>
<div class="mydiv">
 <h3>
  List Header
 </h3>
 <ul>
  <li>
   first item
  </li>
  <li>
   second item
  </li>
  <li>
   third item
  </li>
 </ul>
</div>
<h1>
 pip install bs4
</h1>
<h3>
 another h3
</h3>
<p>
 some more text
</p>
<p>
 ðŸ˜Ž
</p>



In [None]:
# Function to get the indices of elements to process
def get_element_indices(soup):
    def element_filter(tag):
        # Check if tag is 'h1', 'img', or 'code'
        if tag.name in ["h1", "img", "code"]:
            return True
        # Check if tag is 'div' with class 'mydiv'
        if tag.name == "div" and tag.get("class") == ["mydiv"]:
            return True
        return False

    # Return the indices of elements in 'soup' that match the filter
    return [i for i, element in enumerate(soup.find_all()) if element_filter(element)]

# Function to insert a new paragraph with a string that includes the count
def insert_href(soup_md, element_indices):
    all_elements = list(soup_md.find_all())

    # Loop over each index and insert a new paragraph with the count
    for count, index in enumerate(element_indices, start=1):
        # Find the corresponding element in 'soup_md'
        corresponding_element = all_elements[index]
        # insert an a tag with the href to the created image
        link_tag = soup_md.new_tag("a", href=f"./created_slides/{count:02}_img.png")
        corresponding_element.append(link_tag)
        
        """
        # Find the parent <p> tag of the corresponding element
        parent_p = corresponding_element.find_parent("p")
        
        # Only insert if there's a parent <p> tag
        if parent_p:
            # Create a new <p> tag with the count as text
            new_paragraph = soup_md.new_tag("p")
            new_paragraph.string = f"Count: {count}"

            # get the full filename with path of the file that starts with {count:02} in the created_slides directory
            filename = [f for f in os.listdir("created_slides") if f.startswith(f"{count:02}")][0]
            print(filename)
            
            # Insert the new paragraph after the parent <p> tag
            #parent_p.insert_after(new_paragraph)
            # Insert the image after the new paragraph
            # Create the new img element
            new_img = soup_md.new_tag("img", src=filename)
            new_a = soup_md.new_tag("a", href=f"./created_slides/{filename}")

            # Insert the new img element after the existing "img" element
            parent_p.insert_after(new_a)

        else:
            new_paragraph = soup_md.new_tag("p")
            #new_paragraph.string = f"Count: {count}"
            filename = [f for f in os.listdir("created_slides") if f.startswith(f"{count:02}")][0]
            new_img = soup_md.new_tag("img", src=filename)
            # create a link to the image that includes the directory path
            new_a = soup_md.new_tag("a", href=f"./created_slides/{filename}")
            corresponding_element.insert_after(new_a)
        """
    pass

# Example usage
# Assume 'soup' is the original BeautifulSoup object
soup_md = deepcopy(soup)

# Step 1: Get indices of target elements in 'soup'
element_indices = get_element_indices(soup)

indices = get_element_indices(soup)
print(indices)

# Step 2: Insert <p> tags with count strings in 'soup_md' after the parent <p> tags
insert_count_paragraph(soup_md, element_indices)

print(soup_md.prettify())

# use markdownify to convert html to markdown
markdown_text = md(str(soup_md))

# replace all single backticks with triple backticks
markdown_text = re.sub(r"`", "```", markdown_text)
# save the markdown text to a file 
with open("test_with_links.md", "w") as f:
    f.write(markdown_text)

[0, 2, 4, 6, 7, 13]
<h1>
 Create Venv
 <a href="./created_slides/01_img.png">
 </a>
</h1>
<p>
 <code>
  python
print("Hello, world!")
for i in range(10):
    print(i)
  <a href="./created_slides/02_img.png">
  </a>
 </code>
</p>
<p>
 <code>
  typescript hello world code
console.log('Hello, world!');
for (let i = 0; i &lt; 10; i++) {
    console.log(i);
}
  <a href="./created_slides/03_img.png">
  </a>
 </code>
</p>
<p>
 <img alt="Cyan Blue Image" src="./backgrounds/cyan_blue(5).png">
  <a href="./created_slides/04_img.png">
  </a>
 </img>
</p>
<div class="mydiv">
 <h3>
  List Header
 </h3>
 <ul>
  <li>
   first item
  </li>
  <li>
   second item
  </li>
  <li>
   third item
  </li>
 </ul>
 <a href="./created_slides/05_img.png">
 </a>
</div>
<h1>
 pip install bs4
 <a href="./created_slides/06_img.png">
 </a>
</h1>
<h3>
 another h3
</h3>
<p>
 some more text
</p>
<p>
 ðŸ˜Ž
</p>



In [None]:

#print(soup.prettify())	# print the html code

for element in soup.find_all():
    parent_p = element.find_parent("p")
    if element.name == "code":
        # add paragraph with "hello world" after each code block 
        new_tag = soup.new_tag("p")
        new_tag.string = "hello world"
        parent_p.insert_after(new_tag)
        
#print(soup.prettify())
# use markdownify to convert html to markdown
markdown_text = md(str(soup))
print(markdown_text)
#print("-------------------")
# replace all single backticks with triple backticks
markdown_text = re.sub(r"`", "```", markdown_text)
print(markdown_text)



    

Create Venv


`python python code
print("Hello, world!")
for i in range(10):
 print(i)`

hello world


`typescript hello world code
console.log('Hello, world!');
for (let i = 0; i < 10; i++) {
 console.log(i);
}`

hello world


![Cyan Blue Image](./backgrounds/cyan_blue(5).png)



### List Header


* first item
* second item
* third item



pip install bs4


### another h3


some more text


ðŸ˜Ž


Create Venv


```python python code
print("Hello, world!")
for i in range(10):
 print(i)```

hello world


```typescript hello world code
console.log('Hello, world!');
for (let i = 0; i < 10; i++) {
 console.log(i);
}```

hello world


![Cyan Blue Image](./backgrounds/cyan_blue(5).png)



### List Header


* first item
* second item
* third item



pip install bs4


### another h3


some more text


ðŸ˜Ž




In [6]:


class Parse_markdown:
    def __init__(self, markdown_file_path, slides_director) -> None:
        self.markdown_file_path = markdown_file_path
        self.slides_directory = slides_director

    def _create_soup_object(self):
        # open markdown file
        with open(self.markdown_file_path, 'r') as f:
            data = f.read()

        # parse markdown into html
        html = markdown.markdown(data, extensions=[MyDivExtension()])

        # parse markdown file
        soup = bs(html, 'html.parser')
        return soup
        
    def _create_header(self, element):
        header_text = element.text
        Creator = HeaderSlideCreator()
        return Creator.create_slide(header_text)
    
    def _get_h1_text(self, element):
        header_text = element.text
        return header_text

    def _create_list(self, element):
        header_text, list_items = self._get_list_header_and_items(element)            
        Creator = ListSlideCreator()
        return Creator.create_slide(header_text, list_items)
    
    def _get_list_header_and_items(self, element):
        for header in element.find_all('h3'):
            header_text = header.text
            
        for ul in element.find_all('ul'):
            list_items = ul.find_all('li')
            list_items = [item.text for item in list_items]
        
        return header_text, list_items

    def _create_code(self, element):
        code_language = self._get_code_language_title_content(element)[0]
        code_snippet = self._get_code_language_title_content(element)[2]

        Creator = CodeSlideCreator()
        return Creator.create_slide(code_snippet, code_language)
    
    def _get_code_language_title_content(self, element):
        All = element.text.split('\n', 1)
        language_and_title = All[0]
        # split the string into the language and the title except in the first line is only one word
        if " " not in language_and_title:
            code_language = language_and_title
            code_title = "no_title"
        else:
            code_language, code_title = re.split(r'\s', All[0], 1)
        
        code_snippet = All[1]
        return code_language, code_title, code_snippet

    def _create_image(self, element):
        image_path = element['src']
        Creator = ImageSlideCreator()
        return Creator.create_slide(image_path)
    
    def _get_image_src(self, element):
        image_alt = element['src']
        return image_alt
    """
    def parse(self):
        soup = self._create_soup_object()

        for i, element in enumerate(soup.find_all()):
            if element.name == 'h1':
                header = self._create_header(element)
                header.save(os.path.join(self.slides_directory, f"header_{i}.png"))
            elif element.name == 'div' and element['class'] == ['mydiv']:
                resulting_list = self._create_list(element)
                resulting_list.save(os.path.join(self.slides_directory, f"list_{i}.png"))
            elif element.name == 'code':
                code = self._create_code(element)
                code.save(os.path.join(self.slides_directory, f"code_{i}.png"))
            elif element.name == 'img':
                image = self._create_image(element)
                image.save(os.path.join(self.slides_directory, f"image_{i}.png"))
            else:
                pass
    """
    
    
    def _find_element_position_in_markdown(self, markdown_lines: list, element) -> int:
        """
        Find the position of a BeautifulSoup element in the original markdown file.
        
        Args:
            markdown_lines (list): List of lines from the original markdown file
            element (bs4.element.Tag): BeautifulSoup element to find
            
        Returns:
            int: Line number (index) where the element was found, or None if not found
        """
        # Convert markdown_lines to a list if it's not already and preserve empty lines
        if isinstance(markdown_lines, str):
            markdown_lines = markdown_lines.split('\n')
        
        if element.name == 'h1':
            header_text = self._get_h1_text(element)
            for i, line in enumerate(markdown_lines):
                if line.strip().startswith('# ') and header_text in line:
                    return i
        
        elif element.name == 'div' and element.get('class') == ['mydiv']:
            header_text, _ = self._get_list_header_and_items(element)
            for i, line in enumerate(markdown_lines):
                if line.strip().startswith('### ') and header_text in line:
                    return i
        


        elif element.name == 'code':
            code_lang, code_title, code_snippet = self._get_code_language_title_content(element)
            snippet_start = code_snippet.strip().split('\n')[0]  # First line of code snippet

            for i, line in enumerate(markdown_lines):
                # Use regex to match the opening code fence, language, and title
                match = re.match(r'^(```)\s*(\w+)\s*(.*)', line.strip())
                
                if match:
                    backticks, lang, title = match.groups()
                    print(f"Parts: [{backticks}, {lang}, {title}]")  # For debugging
                    
                    if backticks == '```' and lang == code_lang and title == code_title:
                        print(f"Found opening code fence at line {i} with content: {line}")
                        # Look for snippet start
                        current_pos = i + 1
                        while current_pos < len(markdown_lines):
                            # If first line of snippet is found, search for the closing fence
                            if snippet_start in markdown_lines[current_pos]:
                                print(f"Found snippet start at line {current_pos} with content: {markdown_lines[current_pos]}")
                                # Now look for the closing fence
                                for closing_pos in range(current_pos + 1, len(markdown_lines)):
                                    if markdown_lines[closing_pos].strip() == '```':
                                        print(f"Found closing fence at line {closing_pos}")
                                        return closing_pos + 1  # Position after the closing fence
                            current_pos += 1






        
        elif element.name == 'img':
            img_alt = element['alt']#self._get_image_alt(element)
            img_src = element['src']
            img_pattern = f"![{img_alt}]({img_src})"
            
            for i, line in enumerate(markdown_lines):
                if img_pattern in line:
                    return i
        
        return None

    def parse(self):
        soup = self._create_soup_object()
        # Read lines while preserving empty lines
        with open(self.markdown_file_path, 'r') as md_file:
            markdown_lines = md_file.readlines()
        
        # Remove trailing newlines but preserve empty lines
        markdown_lines = [line.rstrip('\n') for line in markdown_lines]
        
        # Keep track of insertions to adjust positions
        offset = 0
        
        for i, element in enumerate(soup.find_all()):
            file_path = None
            link_text = ""
            
            # Process each element and generate a file path
            if element.name == 'h1':
                header_text = self._get_h1_text(element)
                header_text_without_spaces = header_text.replace(" ", "_")  
                header_slide = self._create_header(element)
                file_path = os.path.join(self.slides_directory, 
                                        f"{i:02}_header_{header_text_without_spaces}.png")
                header_slide.save(file_path)
                link_text = f"![{i:02}_header_{header_text}]({file_path})"
                # add a tag in the html to link to the slide with a custom class
                
                
            elif element.name == 'div' and element['class'] == ['mydiv']:
                list_title = self._get_list_header_and_items(element)[0]
                list_title_without_spaces = list_title.replace(" ", "_")
                list_slide = self._create_list(element)
                file_path = os.path.join(self.slides_directory, 
                                        f"{i:02}_list_{list_title_without_spaces}.png")
                list_slide.save(file_path)
                link_text = f"![{i:02}_list_{list_title}]({file_path})"

            elif element.name == 'code':
                code_title = self._get_code_language_title_content(element)[1]
                code_title_without_spaces = code_title.replace(" ", "_")
                code_slide = self._create_code(element)
                file_path = os.path.join(self.slides_directory, 
                                        f"{i:02}_code_{code_title_without_spaces}.png")
                code_slide.save(file_path)
                link_text = f"![{i:02}_code_{code_title}]({file_path})"
            
            elif element.name == 'img':
                image_src = self._get_image_src(element)
                image_src = image_src.split('/')[-1]# remove the path from the image_src and just keep the name
                image_src_without_spaces = image_src.replace(" ", "_")
                image_slide = self._create_image(element)
                file_path = os.path.join(self.slides_directory, 
                                        f"{i:02}_image_{image_src_without_spaces}")
                image_slide.save(file_path)
                link_text = f"![{i:02}_image_{image_src}]({file_path})"

            # Insert the generated link
            if file_path and link_text:
                element_position = self._find_element_position_in_markdown(markdown_lines, element)
                if element_position is not None:
                    # For code blocks, find the end of the block to place the link
                    if element.name == 'code':
                        # Find the closing code fence
                        for j in range(element_position + 1, len(markdown_lines)):
                            if markdown_lines[j].strip() == '```':
                                element_position = j + 1
                                break
                    
                    # Insert link while preserving empty lines
                    if element_position + offset < len(markdown_lines):
                        markdown_lines.insert(element_position + offset, link_text)
                        offset += 1
                    else:
                        markdown_lines.append(link_text)
                else:
                    print(f"Warning: Could not find position for element '{element}' in the original Markdown.")

        # Write the modified Markdown content back to the file
        with open(self.markdown_file_path, 'w') as md_file:
            md_file.write('\n'.join(markdown_lines))
        print("Markdown file updated with links to generated files.")

    # Helper functions
    def _read_markdown_lines(self):
        with open(self.markdown_file_path, 'r') as md_file:
            return md_file.readlines()

In [7]:
parser = Parse_markdown('test.md', 'Created_slides')
parser.parse()

Parts: [```, python, python code]
Found opening code fence at line 3 with content: ```python python code
Found snippet start at line 4 with content: print("Hello, world!")
Found closing fence at line 7
Parts: [```, python, python code]
Parts: [```, typescript, hello world code]
Found opening code fence at line 9 with content: ```typescript hello world code
Found snippet start at line 10 with content: console.log('Hello, world!');
Found closing fence at line 14
Markdown file updated with links to generated files.


In [None]:
        """
        elif element.name == 'code':
            code_lang, code_title, code_snippet = self._get_code_language_title_content(element)
            snippet_start = code_snippet.strip().split('\n')[0]  # Get first line of code
            
            for i, line in enumerate(markdown_lines):
                # Look for code fence start
                if line.strip().startswith('```'):
                    # Check if next non-empty line matches start of code snippet
                    current_pos = i + 1
                    while current_pos < len(markdown_lines):
                        if markdown_lines[current_pos].strip():  # Found first non-empty line
                            if snippet_start in markdown_lines[current_pos]:
                                return i  # Return position of code fence
                            break
                        current_pos += 1
        """
        

In [None]:
"""
        elif element.name == 'code':
            code_lang, code_title, code_snippet = self._get_code_language_title_content(element)
            snippet_lines = code_snippet.strip().split('\n')
            snippet_start = snippet_lines[0]  # First line of code
            snippet_end = snippet_lines[-1]   # Last line of code
            print("snippet_start", snippet_start)
            print("snippet_end", snippet_end)

            for i, line in enumerate(markdown_lines):
                # Look for code fence start
                if line.strip().startswith('```'):
                    print(f"found code fence with title: {code_title} in line {i}")
                    # Check if the next non-empty line matches the start of code snippet
                    current_pos = i + 1
                    while current_pos < len(markdown_lines):
                        if markdown_lines[current_pos].strip() == snippet_start:
                            # Start of snippet found, now look for the last line of the snippet
                            print(f"found start of code snippet in line {current_pos}")
                            while current_pos < len(markdown_lines):
                                if markdown_lines[current_pos].strip() == snippet_end:
                                    print(f"found end of code snippet in line {current_pos}")
                                    return current_pos + 2  # Position two lines after the end of the snippet
                                current_pos += 1
                            break
                        current_pos += 1

        """ 