In [30]:
# Sample paragraph used by the tokenization examples: four sentences, one per line.
# Built from explicit pieces so the trailing space before each line break stays
# visible in the source (a triple-quoted literal hides it, and editors may strip it).
corpus = "\n".join(
    [
        "In the heart of the city, amidst skyscrapers and bustling streets, lies a cozy café. ",
        "It's a place where friends gather for coffee, pastries, and good conversation. ",
        "As the aroma of freshly brewed coffee fills the air, patrons relax and unwind. ",
        "Outside, the city's rhythm pulses on, but inside, time slows down, offering a moment of tranquility.",
    ]
)

In [31]:
# Use print() so the embedded line breaks are rendered; a bare `corpus`
# expression would display the string's repr with escaped "\n" instead.
print(corpus)

In the heart of the city, amidst skyscrapers and bustling streets, lies a cozy café. 
It's a place where friends gather for coffee, pastries, and good conversation. 
As the aroma of freshly brewed coffee fills the air, patrons relax and unwind. 
Outside, the city's rhythm pulses on, but inside, time slows down, offering a moment of tranquility.


In [32]:
# Sentence tokenization: paragraph --> list of sentence strings.
# In the output below, each sentence comes back without the trailing
# space/line break that separated it from the next one in `corpus`.
from nltk.tokenize import sent_tokenize

documents = sent_tokenize(corpus)
print(documents)

['In the heart of the city, amidst skyscrapers and bustling streets, lies a cozy café.', "It's a place where friends gather for coffee, pastries, and good conversation.", 'As the aroma of freshly brewed coffee fills the air, patrons relax and unwind.', "Outside, the city's rhythm pulses on, but inside, time slows down, offering a moment of tranquility."]


In [33]:
# Print one sentence per line so the detected sentence boundaries are easy to check.
for sentence in documents:
    print(sentence)

In the heart of the city, amidst skyscrapers and bustling streets, lies a cozy café.
It's a place where friends gather for coffee, pastries, and good conversation.
As the aroma of freshly brewed coffee fills the air, patrons relax and unwind.
Outside, the city's rhythm pulses on, but inside, time slows down, offering a moment of tranquility.


In [34]:
# word_tokenize accepts any string -- a whole paragraph (as here) or a single
# sentence -- and returns one flat list of tokens.
# In the output below, every comma and full stop is its own token, and the
# clitic "'s" is split from its word but kept in one piece ("It's" -> 'It', "'s").
from nltk.tokenize import word_tokenize

words = word_tokenize(corpus)
print(words)

['In', 'the', 'heart', 'of', 'the', 'city', ',', 'amidst', 'skyscrapers', 'and', 'bustling', 'streets', ',', 'lies', 'a', 'cozy', 'café', '.', 'It', "'s", 'a', 'place', 'where', 'friends', 'gather', 'for', 'coffee', ',', 'pastries', ',', 'and', 'good', 'conversation', '.', 'As', 'the', 'aroma', 'of', 'freshly', 'brewed', 'coffee', 'fills', 'the', 'air', ',', 'patrons', 'relax', 'and', 'unwind', '.', 'Outside', ',', 'the', 'city', "'s", 'rhythm', 'pulses', 'on', ',', 'but', 'inside', ',', 'time', 'slows', 'down', ',', 'offering', 'a', 'moment', 'of', 'tranquility', '.']


In [35]:
# Tokenize sentence by sentence: prints one token list per sentence
# instead of a single flat list for the whole paragraph.
for sentence in documents:
    print(word_tokenize(sentence))

['In', 'the', 'heart', 'of', 'the', 'city', ',', 'amidst', 'skyscrapers', 'and', 'bustling', 'streets', ',', 'lies', 'a', 'cozy', 'café', '.']
['It', "'s", 'a', 'place', 'where', 'friends', 'gather', 'for', 'coffee', ',', 'pastries', ',', 'and', 'good', 'conversation', '.']
['As', 'the', 'aroma', 'of', 'freshly', 'brewed', 'coffee', 'fills', 'the', 'air', ',', 'patrons', 'relax', 'and', 'unwind', '.']
['Outside', ',', 'the', 'city', "'s", 'rhythm', 'pulses', 'on', ',', 'but', 'inside', ',', 'time', 'slows', 'down', ',', 'offering', 'a', 'moment', 'of', 'tranquility', '.']


In [36]:
# wordpunct_tokenize splits the text into runs of word characters and runs of
# punctuation (per the NLTK docs), so every punctuation mark becomes a separate token.
# The visible difference from word_tokenize above is the apostrophe:
# "It's" becomes 'It', "'", 's' here, whereas word_tokenize produced 'It', "'s".
from nltk.tokenize import wordpunct_tokenize

wordpunct_tokenize(corpus)

['In',
 'the',
 'heart',
 'of',
 'the',
 'city',
 ',',
 'amidst',
 'skyscrapers',
 'and',
 'bustling',
 'streets',
 ',',
 'lies',
 'a',
 'cozy',
 'café',
 '.',
 'It',
 "'",
 's',
 'a',
 'place',
 'where',
 'friends',
 'gather',
 'for',
 'coffee',
 ',',
 'pastries',
 ',',
 'and',
 'good',
 'conversation',
 '.',
 'As',
 'the',
 'aroma',
 'of',
 'freshly',
 'brewed',
 'coffee',
 'fills',
 'the',
 'air',
 ',',
 'patrons',
 'relax',
 'and',
 'unwind',
 '.',
 'Outside',
 ',',
 'the',
 'city',
 "'",
 's',
 'rhythm',
 'pulses',
 'on',
 ',',
 'but',
 'inside',
 ',',
 'time',
 'slows',
 'down',
 ',',
 'offering',
 'a',
 'moment',
 'of',
 'tranquility',
 '.']

In [37]:
# TreebankWordTokenizer tokenizes a single string with the Penn Treebank rules.
# In the output below, only the final full stop of the input is split off as its
# own token ('tranquility', '.'); full stops in the middle of the text stay attached
# to the preceding word ('café.', 'conversation.', 'unwind.').
# NOTE(review): this looks like the tokenizer expecting one sentence per call -- to get
# every sentence-final full stop as a separate token, feed it the sentences from
# sent_tokenize rather than the whole paragraph; confirm against the NLTK docs.
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(corpus)

['In',
 'the',
 'heart',
 'of',
 'the',
 'city',
 ',',
 'amidst',
 'skyscrapers',
 'and',
 'bustling',
 'streets',
 ',',
 'lies',
 'a',
 'cozy',
 'café.',
 'It',
 "'s",
 'a',
 'place',
 'where',
 'friends',
 'gather',
 'for',
 'coffee',
 ',',
 'pastries',
 ',',
 'and',
 'good',
 'conversation.',
 'As',
 'the',
 'aroma',
 'of',
 'freshly',
 'brewed',
 'coffee',
 'fills',
 'the',
 'air',
 ',',
 'patrons',
 'relax',
 'and',
 'unwind.',
 'Outside',
 ',',
 'the',
 'city',
 "'s",
 'rhythm',
 'pulses',
 'on',
 ',',
 'but',
 'inside',
 ',',
 'time',
 'slows',
 'down',
 ',',
 'offering',
 'a',
 'moment',
 'of',
 'tranquility',
 '.']