-
Notifications
You must be signed in to change notification settings - Fork 0
Add count_syllables to ch08 #14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1fd6af7
87fb3b1
7486627
3876c4f
941fe04
4cc5d4e
a1edcc5
3dc409a
56576bb
2a3ac2a
e65fac8
e0a9d50
321281e
8519175
ae8fde9
34b6b61
6f37d7c
bd92303
52c96a0
fc3663a
d2c3f56
b99cc36
4a73bd4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
src.ch08 package | ||
================ | ||
|
||
Submodules | ||
---------- | ||
|
||
src.ch08.p1\_count\_syllables module | ||
------------------------------------ | ||
|
||
.. automodule:: src.ch08.p1_count_syllables | ||
:members: | ||
:undoc-members: | ||
:show-inheritance: | ||
|
||
|
||
Module contents | ||
--------------- | ||
|
||
.. automodule:: src.ch08 | ||
:members: | ||
:undoc-members: | ||
:show-inheritance: |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ Subpackages | |
src.ch05 | ||
src.ch06 | ||
src.ch07 | ||
src.ch08 | ||
|
||
Module contents | ||
--------------- | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
python-docx==0.8.10 | ||
python-docx==0.8.10 | ||
nltk==3.4.5 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"""Chapter 8.""" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
"""Test count_syllables with a word dictionary file. | ||
Randomly select words from a word dictionary file and pass them through | ||
:func:`count_syllables` to find their syllable counts. Output each word with | ||
their respective syllable count. | ||
Attributes: | ||
CMUDICT (dict): Dictionary of CMUdict's phonemes with the word as a key | ||
and its phonemes as a list of lists. | ||
MISSING_WORDS (dict): Dictionary with syllable counts of words | ||
missing from CMUdict's phoneme list where the word is the key and | ||
its syllable count as an integer value. | ||
""" | ||
import json | ||
import os | ||
from random import sample | ||
from string import punctuation | ||
|
||
import nltk | ||
from nltk.corpus import cmudict | ||
|
||
from src.ch02 import DICTIONARY_FILE_PATH | ||
from src.ch02.p1_cleanup_dictionary import cleanup_dict | ||
|
||
if not os.path.exists(
        os.path.expanduser('~/nltk_data/corpora/cmudict/cmudict')):
    # pylint: disable=fixme
    # FIXME: This is nearly impossible to test.
    # Patching os affects every use of os in the module.
    # Download the CMUdict corpus when it is not already present under
    # ~/nltk_data.
    nltk.download('cmudict')

# Convert CMUdict into a dictionary.
CMUDICT = cmudict.dict()

# JSON text is UTF-8; state the encoding explicitly instead of relying on
# the platform's default locale encoding.
with open(os.path.join(os.path.dirname(__file__),
                       'p1files/missing_words.json'),
          encoding='utf-8') as in_file:
    # Load local dictionary of words with syllable counts.
    # Words as strings are keys and integers are values.
    MISSING_WORDS = json.load(in_file)
|
||
|
||
def format_words(words: str) -> list:
    """Format words for processing.

    Replace hyphens with spaces, convert to lowercase, and strip both
    punctuation and possessives from word or phrase. Tokens that are empty
    after stripping (for example a lone ``!``) are dropped so that callers
    never receive an empty string to look up.

    Args:
        words (str): Word or phrase to format for processing.

    Returns:
        List of strings containing processed words. Empty if **words**
        holds no word characters.

    """
    word_list = []
    for word in words.replace('-', ' ').lower().split():
        word = word.strip(punctuation)
        # A possessive may be written with a straight or a curly apostrophe.
        if word.endswith(("'s", "’s")):
            word = word[:-2]
        if word:
            word_list.append(word)
    return word_list
|
||
|
||
def count_syllables(words: list) -> int:
    """Use CMUdict to count syllables in English words.

    Calculate sum of syllable counts for each word in **words**. Each word
    is looked up in the local dictionary of syllable counts first; if it is
    not found there, the :py:mod:`nltk.corpus` CMUdict phoneme list is used.

    Args:
        words (list): List of strings to sum number of syllables.

    Returns:
        Integer representing number of syllables in **words**.

    Raises:
        KeyError: If a word is in neither the local dictionary nor CMUdict.

    Note:
        Defaults to first element in CMUdict phoneme list. So, multiple
        syllable counts are ignored.

    """
    syllables = 0
    for word in words:
        if word in MISSING_WORDS:
            syllables += MISSING_WORDS[word]
        else:
            # CMUdict marks every vowel phoneme with a trailing stress
            # digit, so each phoneme ending in a digit is one syllable.
            syllables += sum(
                1 for phoneme in CMUDICT[word][0] if phoneme[-1].isdigit())
    return syllables
|
||
|
||
def main():
    """Print syllable counts for a random sample of dictionary words.

    Draws 15 words from the cleaned-up word dictionary file and prints each
    one followed by its syllable count. A word found in neither syllable
    dictionary is reported as not found instead.

    """
    for word in sample(cleanup_dict(DICTIONARY_FILE_PATH), 15):
        try:
            syllables = count_syllables(format_words(word))
        except KeyError:
            # Word is in neither dictionary; report it and move on.
            print(f'Not found: {word}')
        else:
            print(f'{word} {syllables}')
Comment on lines
+100
to
+109
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. After reviewing the book's solution, I agree that using |
||
|
||
|
||
if __name__ == '__main__': | ||
main() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"househusband": 3, "ibices": 3, "smooching": 2, "handpicking": 3, "tuxes": 2} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
a | ||
aardvark | ||
abracadabra | ||
b | ||
bee | ||
boson | ||
c | ||
cat | ||
catatonic | ||
d | ||
dog | ||
dirge | ||
e | ||
echo | ||
ebeneezer | ||
f | ||
fox | ||
finicky | ||
g | ||
gecko | ||
gopher | ||
h | ||
hemoglobin | ||
hermit | ||
i | ||
imp | ||
indigo | ||
j | ||
jack-o-lantern | ||
journey | ||
k | ||
kangaroo | ||
kilometer | ||
l | ||
lemon | ||
lime | ||
m | ||
mesolithic | ||
moonlight | ||
n | ||
none | ||
night | ||
o | ||
opaque | ||
opulent | ||
p | ||
penny | ||
pepper | ||
q | ||
quasar | ||
quark | ||
r | ||
riddle | ||
rubber | ||
s | ||
slight | ||
swift | ||
t | ||
tonberry | ||
tomato | ||
u | ||
ultraviolet | ||
umbra | ||
v | ||
venus | ||
vertiginous | ||
w | ||
whip | ||
whirl | ||
x | ||
xena | ||
xenon | ||
y | ||
yacht | ||
yggdrasil | ||
z | ||
zen | ||
zero |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
Not found: yggdrasil | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd love to see a phoneme list for |
||
dog 1 | ||
hermit 2 | ||
jack-o-lantern 4 | ||
journey 2 | ||
bee 1 | ||
abracadabra 5 | ||
penny 2 | ||
hemoglobin 4 | ||
opaque 2 | ||
venus 2 | ||
umbra 2 | ||
cat 1 | ||
whirl 1 | ||
zen 1 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
"""Test Chapter 8.""" | ||
import unittest.mock | ||
import os | ||
from random import Random | ||
from io import StringIO | ||
|
||
import src.ch08.p1_count_syllables as count_syllables | ||
|
||
|
||
class TestCountSyllables(unittest.TestCase):
    """Unit tests for the count_syllables module."""

    @classmethod
    def setUpClass(cls):
        """Create one random number generator shared by this class."""
        cls.random = Random()

    def test_format_words(self):
        """Check each clean-up step performed by format_words."""
        format_words = count_syllables.format_words
        # Any capitalization is converted to lowercase.
        spellings = ['YOU', 'You', 'yOu', 'yoU', 'yOU', 'YOu', 'YoU', 'you']
        for spelling in spellings:
            self.assertEqual(format_words(spelling), ['you'])
        # Hyphens split a word into its parts.
        self.assertEqual(format_words('nit-pick'), ['nit', 'pick'])
        # Surrounding punctuation is removed.
        self.assertEqual(format_words('nit-pick!'), ['nit', 'pick'])
        # Possessives with a curly or a straight apostrophe are removed.
        for possessive in ['test’s', 'test\'s']:
            self.assertEqual(format_words(possessive), ['test'])
        # A phrase combines all of the above.
        self.assertEqual(
            format_words('TEST nit-pick'), ['test', 'nit', 'pick'])

    def test_count_syllables(self):
        """Check counts from both the local dictionary and CMUdict."""
        counter = count_syllables.count_syllables
        # Word not in CMUdict.
        self.assertEqual(counter(['tuxes']), 2)
        # Word in CMUdict.
        self.assertEqual(counter(['test']), 1)

    @unittest.mock.patch(
        'src.ch08.p1_count_syllables.DICTIONARY_FILE_PATH',
        'tests/data/ch08/dictionary.txt')
    @unittest.mock.patch('sys.stdout', new_callable=StringIO)
    @unittest.mock.patch('src.ch08.p1_count_syllables.sample')
    def test_main(self, mock_sample, mock_stdout):
        """Compare the demo's printed output with the stored expectation."""
        # Route sampling through a seeded generator so the output is
        # reproducible.
        self.random.seed(222)
        mock_sample.side_effect = self.random.sample

        count_syllables.main()

        # Test sys.stdout output.
        expected_path = os.path.normpath(
            'tests/data/ch08/main/count_syllables.txt')
        with open(expected_path, 'r') as file:
            expected = file.read()
        self.assertEqual(mock_stdout.getvalue(), expected)
|
||
|
||
if __name__ == '__main__': | ||
unittest.main() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Caching the
nltk
corpora should relieve load on the nltk servers.