In [18]:
import nltk
import re
from nltk.corpus import cmudict

# Fetch the "cmudict" corpus into the local nltk_data directory.
# NLTK prints a short status message; when the package is already present it
# just reports that it is up to date.
nltk.download('cmudict')

# Load the CMU Pronouncing Dictionary into memory.
# count_syllables() treats this as: word -> list of pronunciations, where each
# pronunciation is a sequence of phoneme strings and vowel phonemes end in a digit.
cmu_dict = cmudict.dict()

# ---------------------------------------------------------------------------
# Regex patterns consumed by count_syllables().
# Words are lower-cased before matching, so the patterns only use lowercase.
# ---------------------------------------------------------------------------

# Rule 1: word ends in <vowel> + "r" + <zero or more vowels> + "e" (e.g., fire, hire).
# NOTE(review): a word such as "hour" does not end in "e" and is therefore NOT
# matched here; it is caught by the "ou" branch of Rule 2 instead.
silent_e_general = re.compile(r'.*[aeiou]r[aeiou]*e$')  # Anchored at the end of the word via '$'

# Rule 2: letter sequences treated as "irregular": "oi", "ou", "lo" followed by
# anything except "n", or "ea" followed by anything except "s" (e.g., choir, hour).
# NOTE(review): "colonel" contains "lon", so the `lo[^n]` branch does not match it.
irregular_diphthongs = re.compile(r'(oi|ou|lo[^n]|ea[^s])')  # Used with .search(), so it matches anywhere in the word

# Rule 3: specific words with dropped sounds (e.g., wednesday, clothes, every).
# "wed.*day" and "ev.*ry" allow arbitrary characters in the middle.
silent_consonants = re.compile(r'(wed.*day|clothes|ev.*ry)')  # Used with .match(), so it must start at the first character

# Two-letter vowel pairs; count_syllables() subtracts one syllable per occurrence.
diphthong_patterns = re.compile(r'(ai|au|ea|ei|ou|ie|ue|oa)')  # Non-overlapping occurrences are counted with .findall()

# Final "le" preceded by a character that is not a, e, i, o, u or y (e.g., table, bottle).
suffix_le_pattern = re.compile(r'(.*[^aeiouy]le$)')  # One syllable is added back for such words

# Any word containing a run of two or more of the letters a, e, i, o, u (e.g., poetry, chaos).
vowel_group_exceptions = re.compile(r'(.*[aeiou]{2,})')  # Only acts as a gate; the real check is a word list in count_syllables()

# Hard-coded compound words whose prefix vowel is pronounced separately (reenter, preexisting).
compound_word_pattern = re.compile(r'(reenter|preexisting)')  # Used with .search(), so it matches anywhere in the word



[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\Lardex\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [19]:
def count_syllables(word):
    """Estimate the number of syllables in an English word.

    Lookup order:

    1. CMU Pronouncing Dictionary: if the word is listed, the number of vowel
       phonemes in its first pronunciation is returned. This is the most
       reliable source, so it is consulted first. (Previously the dictionary
       was only reached when the spelling heuristic produced a count <= 0,
       which let the broad regex rules override it, e.g. every word
       containing "ou" or "oi" was reported as one syllable.)
    2. Hand-written regex overrides for words the dictionary does not cover.
    3. A generic vowel-group heuristic with a few corrections.

    For words that are not in the dictionary the result is the same as
    before, except that the "wednesday" override now yields 2 instead of 1.

    :param word: the word to analyse; letter case is ignored
    :return: estimated syllable count as an int, never less than 1
    """
    word = word.lower()

    ### STEP 1: Prefer the CMU Pronouncing Dictionary ###

    pronunciations = cmu_dict.get(word)
    if pronunciations:
        # Vowel phonemes carry a trailing stress digit (e.g. "AY1"), so counting
        # them counts syllables. Only the first pronunciation is used.
        dictionary_count = sum(1 for phoneme in pronunciations[0] if phoneme[-1].isdigit())
        # A transcription without any vowel phoneme falls through to the
        # spelling rules so the "at least 1" guarantee still holds.
        if dictionary_count > 0:
            return dictionary_count

    ### STEP 2: Regex overrides for words missing from the dictionary ###
    # NOTE(review): Rules 1 and 2 are very broad (any word containing "oi"/"ou",
    # any word ending in vowel + "r" + "e") and will under-count long
    # out-of-dictionary words; they are kept as-is for those words.

    # Rule 1: Handle silent "e" or silent vowel patterns
    if silent_e_general.match(word):
        return 1

    # Rule 2: Handle irregular diphthongs or vowel combinations
    if irregular_diphthongs.search(word):
        return 1

    # Rule 3: Handle words with silent consonants or dropped vowels
    if silent_consonants.match(word):
        if 'wednesday' in word:
            return 2  # "WENZ-day": two syllables (the original returned 1)
        elif 'clothes' in word:
            return 1
        elif 'every' in word:
            return 2

    # Handle compound words like "reenter" and "preexisting"
    if compound_word_pattern.search(word):
        if 'reenter' in word:
            return 3
        elif 'preexisting' in word:
            return 4

    ### STEP 3: Generic spelling heuristic ###

    # Start from the number of maximal runs of vowel letters
    syllable_count = len(re.findall(r'[aeiouy]+', word))

    # Subtract one for each diphthong occurrence (ai, ea, etc.)
    syllable_count -= len(diphthong_patterns.findall(word))

    # A final "e" is usually silent, unless the word ends in
    # vowel + three non-vowel letters + "e"
    if word.endswith('e') and not re.match(r'.*[aeiou][^aeiouy][^aeiou][^aeiou]e$', word):
        syllable_count -= 1

    # Consonant + "le" at the end forms its own syllable (table, bottle)
    if suffix_le_pattern.match(word):
        syllable_count += 1

    # Adjacent vowels that are pronounced separately (chaos, poetry)
    if vowel_group_exceptions.match(word):
        if 'chaos' in word or 'poetry' in word:
            syllable_count += 1

    # The corrections above can drive the estimate to zero or below;
    # every word has at least one syllable.
    return max(1, syllable_count)
# Sample words for manually checking count_syllables(), grouped by the kind of
# difficulty each one poses. The groups are concatenated in order into one list.
words_to_test = (
    # Silent "e" and irregular endings
    ["fire", "hire", "acre", "table"]
    # Diphthongs and triphthongs
    + ["audio", "queue", "chaos", "poetry"]
    # Consonant clusters
    + ["rural", "squirrel"]
    # Uncommon vowel combinations
    + ["colonel", "choir", "hour"]
    # Words with uncommon endings
    + ["people", "apple", "ballet"]
    # Prefix and suffix issues
    + ["reenter", "preexisting", "misunderstood"]
    # Short irregular words
    + ["eye", "i", "you"]
    # Hyphenated and compound words
    + ["mother-in-law", "check-in", "high-school"]
    # Foreign borrowings
    + ["genre", "debris", "faux", "bureau"]
    # Special cases and exceptions
    + ["one", "wednesday", "clothes", "every"]
)

# for word in words_to_test:
#     print(f"Word: {word}, Syllables: {count_syllables(word)}")

In [21]:
import pandas as pd

def process_words_file(input_file, output_file):
    """
    Reads a file of words, applies the count_syllables function to each word,
    and saves the results as a CSV file with the columns ``word`` and
    ``syllable_count``.

    Surrounding whitespace is stripped from every line. Blank lines are
    skipped, so they no longer produce an empty "word" row with a syllable
    count of 1.

    :param input_file: Path to the file containing words (one word per line),
        read as UTF-8
    :param output_file: Path to save the resulting CSV file (overwritten if
        it already exists)
    :return: None
    """
    # An explicit encoding keeps the result independent of the platform's
    # default locale encoding.
    with open(input_file, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        words = [word for word in stripped_lines if word]

    # One row per word, in input order
    df = pd.DataFrame({
        'word': words,
        'syllable_count': [count_syllables(word) for word in words],
    })

    # index=False: do not write the DataFrame's row numbers as an extra column
    df.to_csv(output_file, index=False)

# Run the pipeline: read the word list one directory above the working
# directory and write the per-word syllable counts to syllable_data.csv
# in the working directory.
process_words_file('../words_alpha.txt', 'syllable_data.csv')

In [None]:
import pyodbc as odbc
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()
# Connection target comes from the environment. os.getenv returns None for an
# unset variable, which would be rendered as the literal text "None" in the
# connection string below.
SERVER = os.getenv('SERVER')
DATABASE = os.getenv('DATABASE')
# Username/password login is disabled (kept for reference only):
# PASS = os.getenv('PASSWORD')
# USER = os.getenv('USER')

# ODBC connection string. The doubled braces are f-string escapes and render as
# a literal "{SQL Server}" driver name.
# Trusted_Connection=yes presumably selects Windows integrated authentication,
# which is why no uid/pwd is supplied; confirm against the driver documentation.
connectionString = f"""
DRIVER={{SQL Server}};
SERVER={SERVER};
DATABASE={DATABASE};
Trusted_Connection=yes;
"""
# Disabled SQL-login fields that would go inside the string above:
# pwd={PASS};
# uid={USER};
# Open the connection to the SQL Server database. It is not closed in this
# cell, so it stays open under the name `conn`.
conn = odbc.connect(connectionString)
# Show the connection object's repr as a quick check that connecting succeeded.
print(conn)