In [1]:
from bs4 import BeautifulSoup as bs
import markdown
import re

In [2]:
import markdown
from markdown.extensions import Extension
from markdown.blockprocessors import BlockProcessor
from bs4 import BeautifulSoup as bs
import xml.etree.ElementTree as ET

class MyDivExtension(Extension):
    """
    Custom Markdown extension to wrap content in a <div> with the "mydiv" class.
    """
    class MyDivProcessor(BlockProcessor):
        def test(self, parent, block):
            return block.startswith('###mydiv\n')

        def run(self, parent, blocks):
            block = blocks.pop(0)
            div = ET.SubElement(parent, 'div')
            div.set('class', 'mydiv')
            self.parser.parseBlocks(div, [block[8:].strip()])

    def extendMarkdown(self, md):
        md.registerExtension(self)
        md.parser.blockprocessors.register(self.MyDivProcessor(md.parser), 'mydiv', 175)

In [4]:
class HeaderSlideCreator:
    def __init__(self, Header_text):
        self.Header_text = Header_text
        print("HeaderImageCreator initialized")

    def check_inputs(self):
        if self.Header_text == "":
            print("Header text is empty")
            return False
        return print("all Inputs are valid")
        
class ListSlideCreator:
    def __init__(self, List_header : str, List_items : list):
        self.List_header = List_header
        self.List_items = List_items
        print("ListCreator initialized")

    def check_inputs(self):
        if self.List_header == "":
            return print("List header is empty")
        if len(self.List_items) == 0:
            return print("List items are empty")
        else:
            return print("all Inputs are valid")
    
class ImageSlideCreator:
    def __init__(self, Image_path):
        self.Image_path = Image_path
        print("ImageSlideCreator initialized")

    def check_inputs(self):
        if self.Image_path == "":
            print("Image path is empty")
            return print("Image path is empty")
        return print("all Inputs are valid")
        
class CodeSlideCreator:
    def __init__(self, Language : str, Code : str):
        self.Language = Language
        self.Code = Code
        print("CodeSlideCreator initialized")

    def check_inputs(self):
        if self.Language == "":
            print("Language is empty")
            return False
        if type(self.Language) != str:
            return print("Language is not a string")
        if self.Code == "":
            return print("Code is empty")
        if type(self.Code) != str:
            return print("Code is not a string")
        return print("all Inputs are valid")

In [13]:
# open markdown file
with open('test.md', 'r') as f:
    data = f.read()

# parse markdown into html
html = markdown.markdown(data, extensions=[MyDivExtension()])



# parse markdown file
soup = bs(html, 'html.parser')

# loop through the html and create slides
all_elements = soup.find_all()
for element in all_elements:
    if element.name == 'h1':
        header_text = element.text
        Instance = HeaderSlideCreator(element.text)
        Instance.check_inputs()

    elif element.name == 'div' and element['class'] == ['mydiv']:
        for header in element.find_all('h3'):
            header_text = header.text
            
        for ul in element.find_all('ul'):
            list_items = ul.find_all('li')
            list_items = [item.text for item in list_items]
            
        Instance = ListSlideCreator(header_text, list_items)
        Instance.check_inputs()

    elif element.name == 'img':
        alt_text = element['alt']
        source = element['src']
        Instance = ImageSlideCreator(element['src'])
        Instance.check_inputs()

    elif element.name == 'code':
        All = element.text.split('\n', 1)
        Language_and_title = All[0]
        # split the string into the language and the title except in the first line is only one word
        if " " not in Language_and_title:
            Language = Language_and_title
            title = "No title"
        else:
            Language, title = re.split(r'\s', All[0], 1)
        print("language", Language)
        print("title", title)
        Code = All[1]
        Instance = CodeSlideCreator(Language=Language, Code=Code)
        Instance.check_inputs()
    


HeaderImageCreator initialized
all Inputs are valid
language python
title No title
CodeSlideCreator initialized
all Inputs are valid
language typescript
title "hello world code"
CodeSlideCreator initialized
all Inputs are valid
ImageSlideCreator initialized
all Inputs are valid
ListCreator initialized
all Inputs are valid
HeaderImageCreator initialized
all Inputs are valid


In [30]:
import markdown
from bs4 import BeautifulSoup as bs

def parse_list_items(ul_element):
    """Helper function to properly parse nested list items"""
    items = []
    for li in ul_element.find_all('li', recursive=False):
        # Get the immediate text of the li element
        text = li.get_text(strip=True, separator=' ')
        # Check for nested lists
        nested_ul = li.find('ul')
        if nested_ul:
            nested_items = parse_list_items(nested_ul)
            items.append({
                'text': text.split('\n')[0],  # Main item text
                'subitems': nested_items
            })
        else:
            items.append({'text': text, 'subitems': []})
    return items

# open markdown file
with open('test.md', 'r') as f:
    data = f.read()

# parse markdown into html using extra features for nested lists
html = markdown.markdown(data, extensions=['extra'])

# parse markdown file
soup = bs(html, 'html.parser')

# loop through the html and create slides
all_elements = soup.find_all()
for element in all_elements:
    if element.name == 'h1':
        header_text = element.text
        Instance = HeaderSlideCreator(element.text)
        Instance.check_inputs()
    
    elif element.name == 'ul':
        # Parse the entire list structure
        list_items = parse_list_items(element)
        
        # Get the header (first item) and remaining items
        if list_items:
            header_list = list_items[0]['text']
            remaining_items = list_items[1:] if len(list_items) > 1 else []
            
            # Create instance with properly parsed items
            Instance = ListSlideCreator(header_list, remaining_items)
            Instance.check_inputs()
    
    elif element.name == 'img':
        alt_text = element['alt']
        source = element['src']
        Instance = ImageSlideCreator(element['src'])
        Instance.check_inputs()
    
    elif element.name == 'code':
        All = element.text.split('\n', 1)
        Language = All[0]
        Code = All[1]
        Instance = CodeSlideCreator(Language=Language, Code=Code)
        Instance.check_inputs()

HeaderImageCreator initialized
all Inputs are valid
CodeSlideCreator initialized
all Inputs are valid
ImageSlideCreator initialized
all Inputs are valid
ListCreator initialized
List items are empty
ListCreator initialized
all Inputs are valid
HeaderImageCreator initialized
all Inputs are valid


In [57]:
import markdown
from markdown.extensions import Extension
from markdown.blockprocessors import BlockProcessor
from bs4 import BeautifulSoup as bs
import xml.etree.ElementTree as ET

class MyDivExtension(Extension):
    """
    Custom Markdown extension to wrap content in a <div> with the "mydiv" class.
    """
    class MyDivProcessor(BlockProcessor):
        def test(self, parent, block):
            return block.startswith('###mydiv\n')

        def run(self, parent, blocks):
            block = blocks.pop(0)
            div = ET.SubElement(parent, 'div')
            div.set('class', 'mydiv')
            self.parser.parseBlocks(div, [block[8:].strip()])

    def extendMarkdown(self, md):
        md.registerExtension(self)
        md.parser.blockprocessors.register(self.MyDivProcessor(md.parser), 'mydiv', 175)

# Usage
md_content = """
###mydiv
### The heading
- item 1
- item 2
- item 3

Some other content
"""
# open test.md
with open('test.md', 'r') as f:
    md_content = f.read()
    


# Parse Markdown to HTML with the custom extension
html = markdown.markdown(md_content, extensions=[MyDivExtension()])

# Parse the HTML using BeautifulSoup
soup = bs(html, 'html.parser')

# Print the final HTML structure
print(soup.prettify())

<h1>
 Create Venv
</h1>
<p>
 <code>
  python
print("Hello, world!")
for i in range(10):
    print(i)
 </code>
 <img alt="Cyan Blue Image" src="./Test_images/cyan_blue.png"/>
</p>
<div class="mydiv">
 <h3>
  List Header
 </h3>
 <ul>
  <li>
   first item
  </li>
  <li>
   second item
  </li>
  <li>
   third item
  </li>
 </ul>
</div>
<h1>
 pip install bs4
</h1>
<p>
 some more text
</p>



In [35]:
import markdown
from bs4 import BeautifulSoup as bs

def parse_list_items(ul_element):
    """Helper function to properly parse nested list items"""
    items = []
    for li in ul_element.find_all('li', recursive=False):
        item_text = li.get_text(strip=True)
        items.append(item_text)
    
    # Assume the first item is the header
    header = items[0] if items else ""
    
    # Remaining items are the actual list items
    list_items = items[1:] if len(items) > 1 else []
    
    return header, list_items

# open markdown file
with open('test.md', 'r') as f:
    data = f.read()

# parse markdown into html using extra features for nested lists
html = markdown.markdown(data, extensions=['extra'])

# parse markdown file
soup = bs(html, 'html.parser')

# loop through the html and create slides
all_elements = soup.find_all()
for element in all_elements:
    if element.name == 'h1':
        header_text = element.text
        Instance = HeaderSlideCreator(element.text)
        Instance.check_inputs()
    
    elif element.name == 'ul':
        # Get the list header and items
        header, list_items = parse_list_items(element)
        
        # Debug print
        print("\nParsed list:")
        print(f"Header: {header}")
        print(f"List items: {list_items}")
        
        # Create instance with properly parsed items
        Instance = ListSlideCreator(header, list_items)
        Instance.check_inputs()
    
    elif element.name == 'img':
        alt_text = element['alt']
        source = element['src']
        Instance = ImageSlideCreator(element['src'])
        Instance.check_inputs()
    
    elif element.name == 'code':
        All = element.text.split('\n', 1)
        Language = All[0]
        Code = All[1]
        Instance = CodeSlideCreator(Language=Language, Code=Code)
        Instance.check_inputs()

HeaderImageCreator initialized
all Inputs are valid
CodeSlideCreator initialized
all Inputs are valid
ImageSlideCreator initialized
all Inputs are valid

Parsed list:
Header: List Headerfirst itemsecond itemthird item
List items: []
ListCreator initialized
List items are empty

Parsed list:
Header: first item
List items: ['second item', 'third item']
ListCreator initialized
all Inputs are valid
HeaderImageCreator initialized
all Inputs are valid


In [15]:
string = "a string"
# check if string is a string
if type(string) == str:
    print("string is a string")

string is a string


In [21]:
# open markdown file
with open('test.md', 'r') as f:
    data = f.read()

# parse markdown into html
html = markdown.markdown(data)


# parse markdown file
soup = bs(html, 'html.parser')
print(soup.prettify())

# finf all code blocks
code_blocks = soup.find_all('code')
for code in code_blocks:
    print(code.text)
    code_lines = code.text.split('\n', 1)
    print(code_lines)
    print(len(code_lines))


# find all headers
headers = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

# print headers
for header in headers:
    print(header.text)

# find all paragraphs
paragraphs = soup.find_all('p')

# print paragraphs
for paragraph in paragraphs:
    print(paragraph.text)

# find all lists
bulletpoints = soup.find_all(['ul', 'ol'])
bulletpoints = [item.text for item in bulletpoints]
print(bulletpoints)
# print lists
#for lst in lists:
#    for item in lst.find_all('li'):
#        print(item.text)

<h1>
 Create Venv
</h1>
<p>
 <code>
  python
print("Hello, world!")
for i in range(10):
    print(i)
 </code>
 <img alt="Cyan Blue Image" src="./Test_images/cyan_blue.png"/>
</p>
<ul>
 <li>
  List Header
  <ul>
   <li>
    first item
   </li>
   <li>
    second item
   </li>
   <li>
    third item
   </li>
  </ul>
 </li>
</ul>
<h1>
 pip install bs4
</h1>
<p>
 some more text
</p>

python
print("Hello, world!")
for i in range(10):
    print(i)
['python', 'print("Hello, world!")\nfor i in range(10):\n    print(i)']
2
Create Venv
pip install bs4
python
print("Hello, world!")
for i in range(10):
    print(i)

some more text
['\nList Header\nfirst item\nsecond item\nthird item\n\n\n', '\nfirst item\nsecond item\nthird item\n']


In [8]:
from bs4 import BeautifulSoup

# Sample HTML input
html_content = """
<ul>
 <li>
  List Header
  <ul>
   <li>first item</li>
   <li>second item</li>
   <li>third item</li>
  </ul>
 </li>
</ul>
"""

# Parse the HTML
soup = BeautifulSoup(html_content, 'html.parser')

# Find the outer list item
outer_list_item = soup.find('ul').find('li')

# Extract the list header and items
list_header = outer_list_item.contents[0].strip()
items = [li.get_text(strip=True) for li in outer_list_item.find('ul').find_all('li')]

# Create the final list with sublists
final_list = [list_header, items]

# Print the result
print(final_list[0], final_list[1], sep='\n')


List Header
['first item', 'second item', 'third item']
