In [1]:
import re

Counting word frequencies is a simple operation that is often used in Natural Language Processing.

In [4]:
"""Count words."""
def count_words(text):
    """Count how many times each unique word occurs in text.

    The text is lower-cased and every character that is neither a word
    character nor whitespace is deleted (so "don't" is counted as "dont").
    The words are then the maximal runs of word characters that remain.

    Args:
        text: The string to analyse.

    Returns:
        A dict of { <word>: <count> } pairs. Empty or whitespace-only
        input yields an empty dict; the empty string is never a key.
    """
    counts = dict()  # dictionary of { <word>: <count> } pairs to return

    # Convert to lowercase so "The" and "the" are the same word
    text_lower = text.lower()

    # Remove punctuation (anything that is not a word character or whitespace)
    text_no_pun = re.sub(r'[^\w\s]', '', text_lower)

    # Extract tokens as runs of word characters. Splitting on single non-word
    # characters would emit empty strings for repeated, leading or trailing
    # whitespace, and those would be miscounted as a word.
    tokens = re.findall(r'\w+', text_no_pun)

    # Aggregate word counts using a dictionary
    for word in tokens:
        counts[word] = counts.get(word, 0) + 1
    return counts


def test_run():
    """Report the most and least frequent words found in input.txt.

    Reads the whole of "input.txt" from the current working directory,
    counts its words with count_words, sorts the (word, count) pairs by
    descending count, and prints the first ten and the last ten pairs as
    tab-separated tables.
    """
    with open("input.txt", "r") as source:
        contents = source.read()

    frequencies = count_words(contents)

    # Stable sort on the count only, largest first; ties keep dict order.
    ranked = list(frequencies.items())
    ranked.sort(key=lambda item: item[1], reverse=True)

    # Each section is a heading followed by the rows to list under it.
    sections = (
        ("10 most common words:\nWord\tCount", ranked[:10]),
        ("\n10 least common words:\nWord\tCount", ranked[-10:]),
    )
    for heading, rows in sections:
        print(heading)
        for word, count in rows:
            print("{}\t{}".format(word, count))


# Run the demo only when this code executes as the top-level module
# (__name__ == "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    test_run()

10 most common words:
Word	Count
a	9
he	6
the	6
and	5
as	4
was	4
with	3
i	2
of	2
his	2

10 least common words:
Word	Count
tables	1
merry	1
word	1
or	1
slap	1
on	1
for	1
more	1
favoured	1
guests	1
