Extract Wiktionary page, select Latin section, and output part of speech information

In [None]:
import requests
from bs4 import BeautifulSoup

def search_wiktionary(word):
    """
    Fetch and parse the English Wiktionary page for the given word.

    Parameters:
    word (str): The word to search for on Wiktionary.

    Returns:
    BeautifulSoup or None: Parsed HTML of the page, or None if the request
    fails, times out, or is answered with a status other than 200.
    """
    from urllib.parse import quote

    # Percent-encode the word so characters such as '/', '?' or '#' cannot
    # alter the path or be read as a query string / fragment.
    url = f"https://en.wiktionary.org/wiki/{quote(str(word), safe='')}"
    try:
        # A timeout keeps one stalled connection from hanging a whole batch run.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.content, 'html.parser')


def extract_latin_content(soup):
    """
    Extract content under the Latin header on a Wiktionary page.

    Every element between the 'Latin' <h2> and the next <h2> is serialized
    exactly once: an element whose ancestor has already been emitted is
    skipped, because the ancestor's markup already contains it.

    Parameters:
        soup (BeautifulSoup): Parsed HTML content of a Wiktionary page.

    Returns:
        str: Extracted markup between the 'Latin' header and the next 'h2'
        header, or the sentinel string "Latin header not found" when the
        page has no <h2 id="Latin">.
    """
    # Find the <h2> tag with id='Latin'
    latin_header = soup.find('h2', {'id': 'Latin'})

    if not latin_header:
        return "Latin header not found"

    content = []
    emitted = set()  # id() of every element already serialized in full

    # find_all_next() walks the document in order and also yields nested
    # descendants, so nesting has to be tracked to avoid duplicated output.
    for element in latin_header.find_all_next():
        # Stop if another <h2> tag is found (the start of a new section)
        if element.name == 'h2' and element.get('id') != 'Latin':
            break

        # Already contained in the markup of an emitted ancestor.
        if any(id(parent) in emitted for parent in element.parents):
            continue

        # A container wrapping a later <h2> would drag the following section
        # into the result; skip it as a whole and let the loop visit its
        # children one by one until the <h2> is reached.
        if element.find('h2') is not None:
            continue

        emitted.add(id(element))
        content.append(str(element))

    # Join the list into a single string and return the extracted content
    return ''.join(content)



def get_pos(word):
    """
    Extract the part of speech from the Latin section of the Wiktionary page.

    Parameters:
    word (str): The word to search for on Wiktionary.

    Returns:
    str, list, or None: None if the page or its Latin section is missing or
                        no recognised part of speech heading is found.
                        With exactly two distinct parts of speech, a list of
                        both (in page order) is returned; otherwise the first
                        one found is returned as a string.
    """
    # Fetch the Wiktionary page for the word
    soup = search_wiktionary(word)
    if soup is None:
        return None  # Page not found or request failed

    # Extract the Latin content from the page
    latin_content = extract_latin_content(soup)

    # Compare against the sentinel exactly: a substring test would also
    # reject a real Latin section that merely contains this phrase.
    if latin_content == "Latin header not found":
        return None

    recognised = {
        "Noun", "Verb", "Adjective", "Conjunction", "Preposition",
        "Adverb", "Numeral", "Article", "Pronoun",
    }

    # Collect the distinct part-of-speech headings in page order
    found = []
    content_soup = BeautifulSoup(latin_content, 'html.parser')
    for heading in content_soup.find_all(['h3', 'h4']):
        heading_text = heading.get_text(strip=True)
        if heading_text in recognised and heading_text not in found:
            found.append(heading_text)

    if len(found) == 2:
        return found  # Return both entries if there are exactly two
    if found:
        return found[0]  # One entry, or the first of more than two
    return None  # No PoS found






def search_cactus(word, PoS):
    """
    Fetch the HTML content of the Cactus page for the given word, according to Part of Speech.

    Parameters:
    word (str): The word to search for on Cactus.
    PoS (str): Part of Speech of the word, determining which url to search.

    Returns:
    BeautifulSoup or None: Parsed HTML content, or None if the part of speech
    has no Cactus page type, the request fails or times out, or the server
    answers with a status other than 200.
    """
    # Anything that is not a string (None, a list of parts of speech, NaN)
    # cannot select a page type.
    if not isinstance(PoS, str):
        return None

    with_form = {"n": word, "form": word}
    lemma_only = {"n": word}

    # Part of speech -> (script path, query parameters)
    endpoints = {
        "Noun": ("noun/shownoun_en.php", with_form),
        "Adjective": ("adject/showadj_en.php", with_form),
        "Numeral": ("adject/showadj_en.php", with_form),
        "Pronoun": ("pronom/showpronom_en.php", with_form),
        "Verb": ("showverb.en.php", {"verb": word, "form": word}),
        "Adverb": ("praepos/showadv_en.php", lemma_only),
        "Conjunction": ("praepos/showadv_en.php", lemma_only),
        "Preposition": ("praepos/showadv_en.php", lemma_only),
    }

    endpoint = endpoints.get(PoS)
    if endpoint is None:
        return None

    path, query = endpoint
    try:
        # Passing the query via `params` lets requests escape characters such
        # as '&' or '#'; the timeout keeps a stalled server from blocking the run.
        response = requests.get(
            f"https://latin.cactus2000.de/{path}", params=query, timeout=10
        )
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.content, 'html.parser')


def get_translation(word, PoS):
    """
    Look up a translation for a Latin word on its Cactus page.

    The English entry is preferred; nouns, pronouns, adjectives, verbs and
    numerals fall back to the German entry when no English one is present.

    Parameters:
    word (str): The word to look up on Cactus.
    PoS (str): The part of speech of the word; it selects both the page and
               the markup pattern used to locate the translation.

    Returns:
    str or None: None when PoS is None. Otherwise the translation text, a
    "No page for ..." message when no page could be fetched, or
    "%%% Nothing found %%%" when no translation could be located.
    """
    if PoS is None:
        return None

    soup = search_cactus(word, PoS)
    if soup is None:
        return f"No page for {word} found on cactus"

    def after_bold(label):
        # <b>label</b> followed somewhere later by a <font> holding the text
        marker = soup.find('b', string=label)
        target = marker.find_next('font') if marker else None
        return target.get_text(strip=True) if target else None

    def table_cell(label, strip):
        # <td class="tablink">label</td> followed by a sibling cell with the text
        marker = soup.find('td', class_="tablink", string=label)
        target = marker.find_next_sibling('td', class_="tablink") if marker else None
        return target.get_text(strip=strip) if target else None

    def indented_div(css_class, label):
        # <div class=css_class>label</div> followed by an indented sibling <div>
        marker = soup.find('div', class_=css_class, string=label)
        target = marker.find_next_sibling('div', style="text-indent:2em;") if marker else None
        return target.get_text(strip=True) if target else None

    # Each branch yields candidate translations in order of preference; the
    # generators stay lazy so a fallback is only looked up when needed.
    if PoS in ("Noun", "Pronoun"):
        candidates = (after_bold(label) for label in ("English:", "deutsch:"))
    elif PoS in ("Adjective", "Verb"):
        # Surrounding whitespace of the cell text is kept for these two.
        candidates = (table_cell(label, False) for label in ("English", "German"))
    elif PoS == "Numeral":
        candidates = (table_cell(label, True) for label in ("English", "German"))
    elif PoS == "Conjunction":
        candidates = (indented_div("timp1", "Conjunction:"),)
    elif PoS == "Adverb":
        candidates = (indented_div("timz1", "Adverb:"),)
    elif PoS == "Preposition":
        governed_cases = (
            ("(+acc)", "tima1", "Preposition with accusative:"),
            ("(+abl)", "tima2", "Preposition with ablative:"),
        )
        per_case = []
        for prefix, css_class, label in governed_cases:
            meaning = indented_div(css_class, label)
            if meaning is not None:
                per_case.append(f"{prefix} {meaning}")
        candidates = ("; ".join(per_case),) if per_case else ()
    else:
        candidates = ()

    for text in candidates:
        if text is not None:
            return text

    # If no translation is found
    return "%%% Nothing found %%%"

def get_gender(word):
    """
    Read a noun's gender from the lemma heading of its Cactus noun page.

    The heading text is split on commas: three items mean the last item is
    returned as the gender, four items are reported as "c".

    Parameters:
    word (str): The noun to look up.

    Returns:
    str: The gender marker, a "No page for ..." message when the page could
    not be fetched, or "%%% Nothing found %%%" otherwise.
    """
    page = search_cactus(word, "Noun")
    if page is None:
        return f"No page for {word} found on cactus"

    heading = page.find('h1', {'translate': 'no'})
    if heading:
        items = [piece.strip() for piece in heading.get_text(strip=True).split(',')]
        item_count = len(items)
        if item_count == 4:
            # Four items: e.g. both masculine and feminine are listed
            return "c"
        if item_count == 3:
            # Three items: a single gender in last position
            return items[-1]

    return "%%% Nothing found %%%"


def get_declension_noun(word):
    """
    Read a noun's declension number from its Cactus noun page.

    The first word of the <h3 translate="no"> heading (lower-cased) is
    matched against the ordinals "first" .. "fifth".

    Parameters:
    word (str): The noun to look up.

    Returns:
    int or str: 1-5 for a recognised ordinal, a "No page for ..." message
    when the page could not be fetched, or "%%% Nothing found %%%" otherwise.
    """
    page = search_cactus(word, "Noun")
    if page is None:
        return f"No page for {word} found on cactus"

    not_found = "%%% Nothing found %%%"

    heading = page.find('h3', {'translate': 'no'})
    if not heading:
        return not_found

    # e.g. "first declension" -> "first"
    leading_word = heading.get_text(strip=True).lower().split(' ')[0]

    ordinals = ("first", "second", "third", "fourth", "fifth")
    if leading_word in ordinals:
        return ordinals.index(leading_word) + 1
    return not_found



def get_declension_adjective(word):
    """
    Read an adjective's declension class from its Cactus adjective page.

    Parameters:
    word (str): The adjective to look up.

    Returns:
    int or str: 2 when the <h4 translate="no"> heading mentions
    "1st & 2nd declension", 3 when it mentions "3rd declension", a
    "No page for ..." message when the page could not be fetched, or
    "%%% Nothing found %%%" otherwise.
    """
    page = search_cactus(word, "Adjective")
    if page is None:
        return f"No page for {word} found on cactus"

    heading = page.find('h4', {'translate': 'no'})
    if heading:
        heading_text = heading.get_text(strip=True).lower()
        # Checked in this order; the first phrase found decides.
        phrase_to_class = (
            ("1st & 2nd declension", 2),
            ("3rd declension", 3),
        )
        for phrase, declension_class in phrase_to_class:
            if phrase in heading_text:
                return declension_class

    return "%%% Nothing found %%%"


def get_genitive_noun(word):
    """
    Read a noun's genitive form from the lemma heading of its Cactus page.

    The <h1 translate="no"> heading is split on commas and the second item
    is taken as the genitive.

    Parameters:
    word (str): The noun to look up.

    Returns:
    str: The genitive form, a "No page for ..." message when the page could
    not be fetched, or "%%% Nothing found %%%" otherwise.
    """
    page = search_cactus(word, "Noun")
    if page is None:
        return f"No page for '{word}' found on cactus"

    heading = page.find('h1', {'translate': 'no'})
    if heading:
        items = [piece.strip() for piece in heading.get_text(strip=True).split(',')]
        # The genitive is expected right after the lemma itself
        if len(items) >= 2:
            return items[1]

    return "%%% Nothing found %%%"


def get_genitive_adjective(word):
    """
    Read an adjective's genitive form from the declension table of its
    Cactus adjective page.

    The first table row whose first cell reads "Gen." and that has at least
    one further cell supplies the result (the text of its second cell).

    Parameters:
    word (str): The adjective to look up.

    Returns:
    str: The genitive form, a "No page for ..." message when the page could
    not be fetched, or "%%% Nothing found %%%" otherwise.
    """
    # Fetch Cactus page
    soup = search_cactus(word, "Adjective")
    if soup is None:
        return f"No page for '{word}' found on cactus"

    # Locate the table containing the declension information
    table = soup.find('table', {'width': '90%', 'translate': 'no'})
    if table:
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            # Require a second cell: a lone "Gen." label cell would otherwise
            # raise IndexError instead of reporting "nothing found".
            if len(cells) > 1 and cells[0].get_text(strip=True) == "Gen.":
                return cells[1].get_text(strip=True)

    # If nothing is found
    return "%%% Nothing found %%%"


def get_verbinfo(word):
    """
    Read a verb's principal parts and conjugation from its Cactus verb page.

    The text of the <h5 translate="no"> tag is split on commas and
    whitespace; the last token is taken as the conjugation and the remaining
    tokens are joined with single spaces as the principal parts.

    Parameters:
    word (str): The verb to look up.

    Returns:
    list or str: [principal_parts, conjugation] on success. On failure a
    message string is returned instead ("No page for ..." or
    "No principal parts found for ..."), so callers must check the type
    before indexing.
    """
    # Fetch Cactus page
    soup = search_cactus(word, "Verb")
    if soup is None:
        return f"No page for '{word}' found on cactus"

    # Locate the <h5> tag with principal parts and conjugation info
    h5_tag = soup.find('h5', {'translate': 'no'})
    if h5_tag is None:
        return f"No principal parts found for '{word}'"

    # Commas and whitespace both act as separators between the tokens.
    tokens = h5_tag.get_text(strip=True).replace(',', ' ').split()

    # An empty tag carries no tokens; report it rather than fail on [-1].
    if not tokens:
        return f"No principal parts found for '{word}'"

    # The last token is the conjugation, everything before it the parts.
    *parts, conjugation = tokens
    return [' '.join(parts), conjugation]


In [None]:
# Smoke test: fetch the Wiktionary page for "amo" and print the parsed HTML
# (prints None if search_wiktionary did not get a 200 response).
print(search_wiktionary("amo"))

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-disabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>amo - Wiktionary, the free dictionary</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabl

In [None]:
# Playground to test functions above
word = 'acer'
#word_list_2 = ["mors", "manus", "bellum", "nauta", "dies", "vehemens", "admodum", "et", "ad", "sub", "decem"]

# Disabled batch variant: look up PoS and translation for each word in word_list_2
#for word in word_list_2:
#    POS = get_pos(word)
#    print(get_translation(word, POS))
#PoS = get_pos(word)
# As the last expression of the cell, the result is displayed by the notebook:
# a string for one part of speech, a list when exactly two are found.
get_pos(word)

['Adjective', 'Noun']

--------------------------------------------------------------------------------

--------------------------------------------------------------------------------

Read an Excel file, look up the part of speech and related data for every word, and write the results to a new Excel file

In [None]:
# Mount Google Drive in Colab; the input/output paths used in the cells below
# all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Assumed word information functions defined here (search_wiktionary, get_pos, search_cactus, etc.)

def scan_excel(input_file_path):
    """
    Read words from the first column of a header-less Excel file.

    Blank cells are skipped and surrounding whitespace is removed, so every
    returned entry is a non-empty string that can be used in a lookup URL.

    Parameters:
    input_file_path (str): Path to the input Excel file.

    Returns:
    list: A list of words (str) from the Excel file; empty if the sheet
    contains no data.
    """
    df = pd.read_excel(input_file_path, header=None)

    # An empty sheet has no first column to read from.
    if df.empty:
        return []

    # Words are assumed to be in the first column; NaN marks blank cells.
    first_column = df.iloc[:, 0].dropna()
    cleaned = (str(cell).strip() for cell in first_column)
    return [word for word in cleaned if word]


def generate_word_data(word):
    """
    Gather part of speech, declension, genitive, gender, translation,
    conjugation and principal parts for a word.

    One dictionary is produced per part of speech reported by get_pos; a
    word without a recognised part of speech yields a single dictionary in
    which every field except 'Word' is None.

    Parameters:
    word (str): The word to gather information for.

    Returns:
    list: A list of dictionaries containing the word data for each part of speech.
    """
    pos = get_pos(word)
    if pos is None:
        return [{
            'Part of Speech': None,
            'Declension': None,
            'Word': word,
            'Genitive': None,
            'Gender': None,
            'Translation': None,
            'Conjugation': None,
            'Principal Parts': None,
        }]

    # Ensure pos is a list for uniform handling
    if isinstance(pos, str):
        pos = [pos]

    word_data_list = []

    # Iterate through each part of speech in the list
    for pos_entry in pos:
        declension, genitive, gender, conjugation, pp = None, None, None, None, None

        if pos_entry == "Noun":
            declension = get_declension_noun(word)
            genitive = get_genitive_noun(word)
            gender = get_gender(word)
        elif pos_entry == "Adjective":
            declension = get_declension_adjective(word)
            genitive = get_genitive_adjective(word)
        elif pos_entry == "Verb":
            # Fetch once. get_verbinfo returns a two-item list on success but
            # a message string on failure; indexing that string would store
            # single characters in the table, so both fields then stay None.
            verb_info = get_verbinfo(word)
            if isinstance(verb_info, list):
                pp, conjugation = verb_info
        # For other parts of speech, no declension, genitive, or gender is needed

        translation = get_translation(word, pos_entry)

        word_data_list.append({
            'Part of Speech': pos_entry,
            'Declension': declension,
            'Word': word,
            'Genitive': genitive,
            'Gender': gender,
            'Translation': translation,
            'Conjugation': conjugation,
            'Principal Parts': pp,
        })

    return word_data_list



def categorize_words(words):
    """
    Collect the word data dictionaries of all words into one flat list.

    Parameters:
    words (list of str): A list of words to gather information for.

    Returns:
    list: A list of dictionaries containing word data; a word with several
    parts of speech contributes several consecutive entries.
    """
    # Flatten the per-word lists returned by generate_word_data
    return [
        entry
        for word in words
        for entry in generate_word_data(word)
    ]


def write_to_excel(flattened_words, output_file_path):
    """
    Write word data to an Excel file with a fixed column order.

    Parameters:
    flattened_words (list of dict): A list of word data dictionaries.
    output_file_path (str): The output Excel file path.
    """
    column_order = [
        'Part of Speech', 'Declension', 'Word', 'Genitive',
        'Gender', 'Translation', 'Conjugation', 'Principal Parts',
    ]
    # Passing the columns to the constructor fixes the order and also works
    # for an empty list (header-only sheet) or entries lacking a key (blank
    # cell), where selecting the columns afterwards would raise KeyError.
    df = pd.DataFrame(flattened_words, columns=column_order)
    df.to_excel(output_file_path, index=False)



In [None]:
# Main execution: look up every word of one input workbook and store the results
input_file_path = '/content/drive/MyDrive/Colab_Notebooks/LatinVocab/Alphabet/Z.xlsx'
output_excel_path = '/content/drive/MyDrive/Colab_Notebooks/LatinVocab/Alphabet_sorted/Z_sorted.xlsx'
#output_pdf_path = '/content/drive/MyDrive/Colab_Notebooks/LatinVocab/input_words_sorted.pdf'

# Test with local word list
#word_list_1 = ["qui", "acer", "amare", "videre"]
#categorized_words = categorize_words(word_list_1)
#write_to_excel(categorized_words, '/content/drive/MyDrive/Colab_Notebooks/LatinVocab/test_1.xlsx')

# Step 1: Read words from Excel
words = scan_excel(input_file_path)

# Step 2: Gather the word data for every word (one entry per part of speech);
# no sorting happens here, entries keep the input order
flattened_words = categorize_words(words)

# Step 3: Write the gathered word data to Excel
write_to_excel(flattened_words, output_excel_path)

print("File generated successfully.")

File generated successfully.


--------------------------------------------------------------------------------

--------------------------------------------------------------------------------

Write to PDF

In [None]:
# Install fpdf, which provides the FPDF base class imported for PDF output below.
!pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=9d88aa556ae9ee9ab1af20ee9f72b0c869843ca9ee7580934c68de56fcd75f50
  Stored in directory: /root/.cache/pip/wheels/65/4f/66/bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [None]:
# Install XlsxWriter. NOTE(review): not imported anywhere in the visible cells;
# presumably intended as an Excel writer engine for pandas - confirm it is needed.
!pip install XlsxWriter

Collecting XlsxWriter
  Downloading XlsxWriter-3.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/159.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/159.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: XlsxWriter
Successfully installed XlsxWriter-3.2.0


In [None]:
import pandas as pd
from fpdf import FPDF

# Class for PDF generation using fpdf
class PDF(FPDF):
    """FPDF subclass with the page header and row helpers of the vocabulary sheet.

    All text is set in the "DejaVu" font family, which has to be registered
    with add_font() on the instance before the first page is added.
    """

    def _vocab_heading(self, size, text, align):
        # One bold full-width line followed by 5 units of vertical space
        self.set_font("DejaVu", "B", size)
        self.cell(0, 10, text, 0, 1, align)
        self.ln(5)

    def header(self):
        """Draw the centred document title at the top of a page."""
        self._vocab_heading(12, "Vocab Kennedy", "C")

    def chapter_title(self, title):
        """Draw a left-aligned section title."""
        self._vocab_heading(14, title, "L")

    def chapter_body(self, body):
        """Draw a block of wrapped body text."""
        self.set_font("DejaVu", "", 10)
        self.multi_cell(0, 10, body)
        self.ln(5)

    def add_table_row(self, col1, col2):
        """Draw one table row made of two bordered cells of width 95."""
        self.set_font("DejaVu", "", 10)
        for cell_text in (col1, col2):
            self.cell(95, 7, cell_text, 1)
        self.ln()

def _rank_within_pos(df, column, order, pos):
    """Rank each row by where its *column* value sits in *order*; rows of
    another part of speech or with an unlisted value get len(order)."""
    positions = {value: index for index, value in enumerate(order)}
    fallback = len(order)
    return [
        positions.get(value, fallback) if part == pos else fallback
        for part, value in zip(df['Part of Speech'], df[column])
    ]


def sort_words(input_file_path):
    """
    Read word data and sort it by part of speech and grammatical class.

    Order: part of speech (Noun, Adjective, Adverb, Conjunction, Preposition,
    Numeral, Verb, then everything else); nouns by declension and gender
    (m, c, f, n); adjectives by declension; verbs by conjugation
    (irr, dep, 1-4); finally by word.

    'Declension' is normalised to a digit string ('' when blank or not a
    number, e.g. a lookup error message) and 'Conjugation' is stripped of
    parentheses and a trailing dot.

    Parameters:
    input_file_path (str or DataFrame): Path to the input Excel file, or an
        already loaded DataFrame with the same columns (it is not modified).

    Returns:
    DataFrame: Sorted DataFrame containing word data.

    Raises:
    KeyError: If a column needed for sorting is missing.
    """
    if isinstance(input_file_path, pd.DataFrame):
        df = input_file_path.copy()
    else:
        df = pd.read_excel(input_file_path)

    # Fail early with a message naming the missing columns.
    required = ['Part of Speech', 'Declension', 'Word', 'Gender', 'Conjugation']
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise KeyError(f"Word table is missing required column(s): {', '.join(missing)}")

    # Normalise 'Declension' to a digit string; non-numeric cells (blank or
    # holding a lookup error message) become '' instead of raising ValueError.
    declension_numbers = pd.to_numeric(df['Declension'], errors='coerce')
    df['Declension'] = [
        str(int(number)) if pd.notna(number) else ''
        for number in declension_numbers
    ]
    df['Conjugation'] = df['Conjugation'].apply(lambda x: str(x).strip() if pd.notna(x) else '')

    # Remove parentheses and trailing dots from 'Conjugation' column values
    df['Conjugation'] = df['Conjugation'].replace(r'\((.*?)\)\.?', r'\1', regex=True)
    df['Conjugation'] = df['Conjugation'].replace(r'\.$', '', regex=True)

    # Helper columns, in sort priority order. They are built with plain
    # Python loops so that empty tables and blank cells need no special care.
    pos_order = [
        'Noun', 'Adjective', 'Adverb', 'Conjunction',
        'Preposition', 'Numeral', 'Verb'
    ]
    pos_positions = {name: index for index, name in enumerate(pos_order)}
    df['POS Rank'] = [
        pos_positions.get(part, len(pos_order)) for part in df['Part of Speech']
    ]
    df['Noun Declension Rank'] = _rank_within_pos(
        df, 'Declension', ['1', '2', '3', '4', '5'], 'Noun')
    df['Gender Rank'] = _rank_within_pos(
        df, 'Gender', ['m', 'c', 'f', 'n'], 'Noun')
    df['Adjective Declension Rank'] = _rank_within_pos(
        df, 'Declension', ['2', '3'], 'Adjective')
    df['Verb Conjugation Rank'] = _rank_within_pos(
        df, 'Conjugation', ['irr', 'dep', '1', '2', '3', '4'], 'Verb')

    rank_columns = [
        'POS Rank', 'Noun Declension Rank', 'Gender Rank',
        'Adjective Declension Rank', 'Verb Conjugation Rank',
    ]
    df_sorted = df.sort_values(by=rank_columns + ['Word'])

    # Drop the helper columns used for sorting
    return df_sorted.drop(columns=rank_columns)


def _cell_text(value):
    """Return *value* as display text, mapping None and NaN (blank cell) to ''."""
    import math

    if value is None:
        return ''
    if isinstance(value, float) and math.isnan(value):
        return ''
    return str(value)


def write_to_pdf(flattened_words, output_file_path,
                 font_dir='/content/drive/MyDrive/Colab_Notebooks/LatinVocab/fonts'):
    """
    Write word data to a PDF with one two-column table per part of speech.

    The left column shows the grammatical information of a word, the right
    column at most the first three comma-separated items of its translation.
    Blank cells (None or NaN) are rendered as empty text, and entries without
    a part of speech are collected under the title "Unknown".

    Parameters:
    flattened_words (list of dict): Word data dictionaries, already in the
        order in which they should be printed.
    output_file_path (str): The output PDF file path.
    font_dir (str): Directory containing DejaVuSans.ttf and
        DejaVuSans-Bold.ttf.
    """
    pdf = PDF()

    # Load DejaVu fonts to support special characters
    pdf.add_font('DejaVu', '', f'{font_dir}/DejaVuSans.ttf', uni=True)
    pdf.add_font('DejaVu', 'B', f'{font_dir}/DejaVuSans-Bold.ttf', uni=True)

    pdf.add_page()

    # Group words by Part of Speech, keeping first-appearance order
    pos_groups = {}
    for word_info in flattened_words:
        pos = _cell_text(word_info.get("Part of Speech")) or "Unknown"
        pos_groups.setdefault(pos, []).append(word_info)

    # Iterate through each Part of Speech and create a table for it
    for pos, words in pos_groups.items():
        pdf.chapter_title(pos)

        current_declension = None
        for word_info in words:
            word = _cell_text(word_info.get("Word"))
            declension = _cell_text(word_info.get("Declension"))
            genitive = _cell_text(word_info.get("Genitive"))
            gender = _cell_text(word_info.get("Gender"))
            principal_parts = _cell_text(word_info.get('Principal Parts'))
            conjugation = _cell_text(word_info.get('Conjugation'))

            # Take only the first three items of the translation string
            translation_parts = _cell_text(word_info.get("Translation")).split(',')
            translation = ','.join(translation_parts[:3])

            # Insert a blank line between declensions for Nouns and Adjectives
            if pos in ["Noun", "Adjective"] and declension != current_declension:
                if current_declension is not None:
                    pdf.ln(3)
                current_declension = declension

            # Format row content based on Part of Speech
            if pos == "Noun":
                if declension in ['3', '4', '5']:
                    row = f"{declension} | {gender} | {word}, {genitive} "
                else:
                    row = f"{declension} | {gender} | {word} "
            elif pos == "Adjective":
                if declension == '3':
                    row = f"{declension} | {word}, {genitive}"
                else:
                    row = f"{declension} | {word}"
            elif pos == "Verb":
                row = f"{conjugation} | {principal_parts}"
            else:
                # Prepositions and all remaining parts of speech show the word only
                row = word
            pdf.add_table_row(row, translation)

        pdf.ln(10)

    pdf.output(output_file_path)



In [None]:
# Example usage: turn one looked-up workbook into a sorted PDF
input_file = '/content/drive/MyDrive/Colab_Notebooks/LatinVocab/Alphabet_sorted/S_sorted.xlsx'
output_file = '/content/drive/MyDrive/Colab_Notebooks/LatinVocab/Alphabet_sorted/S_sorted.pdf'


# Sort the rows, convert them to one dict per row, and render the PDF
sorted_df = sort_words(input_file)
flattened_words = sorted_df.to_dict(orient='records')
write_to_pdf(flattened_words, output_file)

In [None]:
# Batch execution: render one PDF per letter workbook

alph_1 = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "L", "M"]
alph_2 = ["N", "O", "P", "Q", "R", "S", "T", "U", "V", "X", "Z"]

# Letters whose workbook could not be processed, with the error that occurred
failed = {}

for letter in alph_2:
    input_file = f'/content/drive/MyDrive/Colab_Notebooks/LatinVocab/Alphabet_sorted/{letter}_sorted.xlsx'
    output_file = f'/content/drive/MyDrive/Colab_Notebooks/LatinVocab/Alphabet_sorted/{letter}_sorted.pdf'

    try:
        sorted_df = sort_words(input_file)
        flattened_words = sorted_df.to_dict(orient='records')
        write_to_pdf(flattened_words, output_file)
    except (FileNotFoundError, KeyError, ValueError) as error:
        # A missing or malformed workbook (e.g. one without the expected
        # columns) should not abort the remaining letters; report and go on.
        failed[letter] = error
        print(f"Skipped {letter}: {type(error).__name__}: {error}")

if failed:
    print(f"Letters not processed: {', '.join(failed)}")

KeyError: 'Part of Speech'