In [1]:
from timeit import timeit

sentence = "Python is fun, yet complex ! " \
           "Using libraries is a way to ease your life around some problems."

"""
The most commonly used method to count words in a sentence is pourly optimized.
On top of beeing slow, it does not allow proper counting without further operations.
"""
timer = timeit()
words_split = sentence.split()
print(f"it took {timer}s to split the sentence.\n" \
      f"If we count words : {len(words_split)}," \
      f"We can see the count is not accurate, since '!' is considered a letter here.\n {words_split}")

it took 0.01084689999999977s to split the sentence.
If we count words : 18,We can see the count is not accurate, since '!' is considered a letter here.
 ['Python', 'is', 'fun,', 'yet', 'complex', '!', 'Using', 'libraries', 'is', 'a', 'way', 'to', 'ease', 'your', 'life', 'around', 'some', 'problems.']


In [2]:
import re

"""
In regex, \w+ is [a-zA-Z-]+ meaning it will only match ascii_letters
Where + means 'find as much as you can in a row'
"""
timer = timeit()
words = re.findall(r"\w+", sentence)  
print(f"it took {timer}s to regex the sentence. if we count words : {len(words)}\n"\
       "We can see the count is accurate and pretty fast: {words}")

it took 0.020138599999999673s to regex the sentence. if we count words : 17
We can see the count is accurate and pretty fast: {words}


In [3]:
from collections import Counter

"""
Now, if we want to count each words or letters used in the text, we want to use Counter.
It is part of collections built-in libraries and allow fast, optimized traversal of hashable objects.
"""
timer = timeit()
# Since we know regex will get for us plain words in an optimized fashion way,
# we are going to time it all, to observe how fast the process is.
words = re.findall(r"\w+", sentence)
words_counter = Counter(words)
letters_counter = Counter()

# I chose to use index traversal, since it does not create an object for each iteration
# Therefore, it is memory friendly and faster.
for i in range(len(words)):
    letters_counter.update(words[i])

print(f"{words_counter}")
print(f"{letters_counter}")
print(f"Counter come with many usefull methods")
print(f"which word is the most used in sentence : {words_counter.most_common(1)}")
print(f"which 3 letters are most used in sentence : {letters_counter.most_common(3)}")
print(f"it took {timer}s to use regex, count plain words and count letters occurences")

Counter({'is': 2, 'Python': 1, 'fun': 1, 'yet': 1, 'complex': 1, 'Using': 1, 'libraries': 1, 'a': 1, 'way': 1, 'to': 1, 'ease': 1, 'your': 1, 'life': 1, 'around': 1, 'some': 1, 'problems': 1})
Counter({'e': 8, 'o': 7, 's': 7, 'i': 6, 'r': 5, 'a': 5, 'y': 4, 'n': 4, 'l': 4, 't': 3, 'u': 3, 'm': 3, 'f': 2, 'p': 2, 'b': 2, 'P': 1, 'h': 1, 'c': 1, 'x': 1, 'U': 1, 'g': 1, 'w': 1, 'd': 1})
Counter come with many usefull methods
which word is the most used in sentence : [('is', 2)]
which 3 letters are most used in sentence : [('e', 8), ('o', 7), ('s', 7)]
it took 0.01479879999999989s to use regex, count plain words and count letters occurences
