In [33]:
# Path to the source EPUB, relative to the notebook's working directory.
epub_path = '../../Resources/مجنون ليلى.epub'
# Book title ("Majnun Layla"); interpolated into the chapters output directory name.
book_title = 'مجنون ليلى'
# XHTML document that is parsed for dialogue extraction.
# NOTE(review): relative path — presumably a chapter file taken out of the EPUB;
# confirm where this file comes from, it is not produced by the code in view.
example_xhtml = "chapter-1-2-L.xhtml"

## The function below works with any EPUB file

In [34]:
import os
import tempfile
import zipfile

from bs4 import BeautifulSoup

def extract_and_save_chapters(epub_path, output_directory):
    """Dump the text of every HTML/XHTML document in an EPUB to plain-text files.

    The EPUB (a zip archive) is unpacked into a private temporary directory,
    each ``.html``/``.xhtml`` file is parsed with BeautifulSoup, and its text
    is written to ``chapter_<n>.txt`` inside ``output_directory``. Files are
    numbered from 1 in lexicographic order of directory path, then file name.

    Args:
        epub_path: Path to the EPUB file to read.
        output_directory: Directory that receives the ``chapter_<n>.txt``
            files; created (including parents) if it does not exist.

    Raises:
        zipfile.BadZipFile: If ``epub_path`` is not a valid zip archive.
        OSError: If the archive cannot be read or an output file cannot be
            written.
        UnicodeDecodeError: If a document is not valid UTF-8.
    """
    os.makedirs(output_directory, exist_ok=True)

    # A unique temporary directory (instead of a fixed folder in the current
    # working directory) means a pre-existing folder is never scanned or
    # deleted, and the extracted files are removed even if parsing fails.
    with tempfile.TemporaryDirectory(prefix="epub_extraction_") as extraction_dir:
        with zipfile.ZipFile(epub_path, 'r') as zip_ref:
            zip_ref.extractall(extraction_dir)

        chapter_counter = 1
        # sorted() orders the walk by directory path so numbering is stable
        # across runs and platforms.
        for root, _dirs, files in sorted(os.walk(extraction_dir)):
            for file in sorted(files):
                if not file.endswith(('.html', '.xhtml')):
                    continue

                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    soup = BeautifulSoup(f, 'lxml')
                texts = soup.get_text(separator='\n', strip=True)

                chapter_filename = f"chapter_{chapter_counter}.txt"
                chapter_path = os.path.join(output_directory, chapter_filename)
                with open(chapter_path, 'w', encoding='utf-8') as chapter_file:
                    chapter_file.write(texts)
                chapter_counter += 1

# Directory that receives the extracted chapter text files; it is named after
# the book and resolved relative to the current working directory.
output_directory = f'./{book_title}_chapters'

# Write one chapter_<n>.txt per HTML/XHTML document found in the EPUB.
extract_and_save_chapters(epub_path, output_directory)

  soup = BeautifulSoup(f, 'lxml')


In [35]:
from lxml import etree

# Namespace-qualified tag names to compare against element.tag.
XHTML_NS = '{http://www.w3.org/1999/xhtml}'
ANCHOR_TAG = XHTML_NS + 'a'
BOLD_TAG = XHTML_NS + 'b'

# Load and parse the XHTML document (binary mode lets the parser handle the
# encoding declared in the file).
with open(example_xhtml, "rb") as file:
    tree = etree.parse(file)

# Keep, in document order, every element that carries non-blank text of its own.
# Elements whose text is only ':' and <a> elements are skipped.
# NOTE(review): the ':' is presumably the separator printed after a speaker
# name — confirm against the source markup.
# NOTE(review): element.text is only the text before an element's first child;
# text following inline children (their .tail) is not collected here.
elements_list = [
    element for element in tree.iter()
    if element.text
    and element.text.strip()
    and element.text.strip() != ':'
    and element.tag != ANCHOR_TAG
]

# Character names are the texts of <b> elements. They are stripped so they
# match the stripped text the dialogue pass compares against, and de-duplicated
# with dict.fromkeys so first-appearance order is kept and the result is
# reproducible between runs (a set gave an arbitrary order).
characters_names = list(dict.fromkeys(
    element.text.strip() for element in elements_list if element.tag == BOLD_TAG
))

In [36]:
# Build a list of (text, speaker) tuples from the parsed elements.
#
# Rules applied to each element, in document order:
#   * a <b> element naming a known speaker is announced by the narrator and
#     becomes the current speaker; an unknown <b> hands the turn to the narrator;
#   * text wrapped in parentheses is a stage direction and goes to the narrator;
#   * any other text belongs to the current speaker, and consecutive lines of
#     one uninterrupted turn are joined with a single space.

NARRATOR = 'المعلق'
SPEAKER_TAG = '{http://www.w3.org/1999/xhtml}b'

# Copy instead of aliasing: appending to the same list object would also
# modify characters_names and add the narrator again on every re-run.
speakers = list(characters_names)
if NARRATOR not in speakers:
    speakers.append(NARRATOR)

current_speaker = NARRATOR
# True only while the LAST entry of the list is a speech turn of
# current_speaker that may still receive further lines. Merging is keyed on
# this flag rather than on "same speaker as before", so a line is never glued
# onto a narrator announcement or a stage direction that happens to sit last.
turn_is_open = False

# A list of tuples that will store the dialogue text and its speaker
dialogues_and_speakers = []

for element in elements_list:
    cleaned_text = element.text.strip()  # Clean up the text for processing

    if element.tag == SPEAKER_TAG:
        if cleaned_text in speakers:
            # The narrator announces the speaker's name before the dialogue
            dialogues_and_speakers.append((cleaned_text, NARRATOR))
            turn_is_open = False  # the announcement is now the last entry
            next_speaker = cleaned_text
        else:
            # Unrecognised name: fall back to the narrator
            next_speaker = NARRATOR
        if next_speaker != current_speaker:
            turn_is_open = False
        current_speaker = next_speaker
    elif cleaned_text.startswith("(") and cleaned_text.endswith(")"):
        # Stage direction: spoken by the narrator, interrupts the current turn.
        # The speaker's next line starts a fresh entry (without the leading
        # space the former empty-placeholder approach produced).
        dialogues_and_speakers.append((cleaned_text, NARRATOR))
        turn_is_open = False
    elif turn_is_open:
        # Same uninterrupted turn: join this line onto the previous one
        previous_text, previous_speaker = dialogues_and_speakers[-1]
        dialogues_and_speakers[-1] = (previous_text + " " + cleaned_text, previous_speaker)
    else:
        # A new turn begins
        dialogues_and_speakers.append((cleaned_text, current_speaker))
        turn_is_open = True

# Remove any empty dialogues (defensive; elements_list should not contain
# blank text, but it is produced by another cell)
dialogues_and_speakers = [(dialogue, speaker) for dialogue, speaker in dialogues_and_speakers if dialogue.strip()]

# Printing dialogues and their speakers
for dialogue, speaker in dialogues_and_speakers:
    print(f"{speaker}: {dialogue}\n")

# Save the list to a .npy file; NumPy stores it as an (n, 2) array of strings
import numpy as np

np.save('dialogues_and_speakers.npy', dialogues_and_speakers)

المعلق: مجنون ليلى الفصل الأول

المعلق: (ساحة أمام خيام المهدي في حي بني عامر - مجلس من مجالس السمر في هذه
            الساحة - فتية وفتيات من الحي يسمرون في أوائل الليل، وفي أيدي الفتيات صوف ومغازل يلهون
            بها وهم يتحدثون — تخرج ليلى من خيام أبيها عند ارتفاع الستار ويدها في يد ابن ذريح)

المعلق: ليلى

ليلى: دعي الغزْلَ سلمى وحَيِّي معي منارَ الحِجَازِ فَتَى يَثْرِبِ

المعلق: (تصافحه سلمى)

ليلى:  ويا هِنْدُ هذا أديبُ الحِجازِ هلمِّي بمَقْدَمِهِ رَحِّبِي

المعلق: (تصافحه هند ويحتفي به السامرون)

المعلق: سعد

سعد: أمن يثربٍ أنت آتٍ؟

المعلق: ابن ذريح

ابن ذريح: أجل من البلدِ القُدُس الطيِّب

المعلق: ليلى

ليلى: أيابنَ ذَريحٍ لقينا الغمام

المعلق: هند

هند: وطَافتْ بنا نَفَحَاتُ النبي

المعلق: عبلة

المعلق: (هامسة إلى سعد)

عبلة: مَن ابْنُ ذَريحٍ؟

المعلق: سعد

سعد: فتًى ذِكرُه على مَشرِق الشمس والمغرب رَضيعُ الحُسَيْنِ عليه السلامُ وترْبُ الحُسَيْنِ من المكتبِ

المعلق: عبلة

المعلق: (إلى بشر ومشير إلى ابن ذريح)

عبلة: أتسمَعُ بشرُ رضيعُ الْحُسَيْنِ فديْتُ الرضيعيْن والمُرضعهْ 