<h1> Word Tokenization </h1>

In [4]:
# Word tokenization
text = """Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet 
species by building a self-sustaining city on Mars. In 2008, SpaceX’s Falcon 1 became the first privately developed 
liquid-fuel launch vehicle to orbit the Earth."""
# str.split() with no separator splits on any run of whitespace (spaces and
# newlines alike). Punctuation stays attached to the neighbouring word, which
# is why tokens such as '2002,' and 'Mars.' appear in the result.
text.split() 

['Founded',
 'in',
 '2002,',
 'SpaceX’s',
 'mission',
 'is',
 'to',
 'enable',
 'humans',
 'to',
 'become',
 'a',
 'spacefaring',
 'civilization',
 'and',
 'a',
 'multi-planet',
 'species',
 'by',
 'building',
 'a',
 'self-sustaining',
 'city',
 'on',
 'Mars.',
 'In',
 '2008,',
 'SpaceX’s',
 'Falcon',
 '1',
 'became',
 'the',
 'first',
 'privately',
 'developed',
 'liquid-fuel',
 'launch',
 'vehicle',
 'to',
 'orbit',
 'the',
 'Earth.']

<h1>Sentence Tokenization</h1>

<h5>This is similar to word tokenization. Here, we study the structure of sentences in the analysis. A sentence usually ends with a full stop (.), so we can use ". " (a full stop followed by a space) as a separator to break the string:</h5>

In [5]:
# Sentence tokenization
text = """Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet 
species by building a self-sustaining city on Mars. In 2008, SpaceX’s Falcon 1 became the first privately developed 
liquid-fuel launch vehicle to orbit the Earth."""
# Splits on '. ' (a full stop followed by a space). The separator is consumed,
# so the first sentence loses its full stop; the last one keeps it because no
# space follows it.
text.split('. ') 

['Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet \nspecies by building a self-sustaining city on Mars',
 'In 2008, SpaceX’s Falcon 1 became the first privately developed \nliquid-fuel launch vehicle to orbit the Earth.']

<h1> Tokenization using Regular Expression (RegEx) </h1>


In [10]:
# Word tokenization with a regular expression
import re
text = """Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet 
species by building a self-sustaining city on Mars. In 2008, SpaceX’s Falcon 1 became the first privately developed 
liquid-fuel launch vehicle to orbit the Earth."""
# Use a raw string so the backslash in \w is passed to the regex engine as-is;
# in a normal string literal "\w" is an invalid escape sequence that newer
# Python versions warn about. \w+ matches runs of letters, digits and
# underscores, so punctuation (including the apostrophe and hyphens) acts as a
# token boundary.
tokens = re.findall(r"\w+", text)
print(tokens)

['Founded', 'in', '2002', 'SpaceX', 's', 'mission', 'is', 'to', 'enable', 'humans', 'to', 'become', 'a', 'spacefaring', 'civilization', 'and', 'a', 'multi', 'planet', 'species', 'by', 'building', 'a', 'self', 'sustaining', 'city', 'on', 'Mars', 'In', '2008', 'SpaceX', 's', 'Falcon', '1', 'became', 'the', 'first', 'privately', 'developed', 'liquid', 'fuel', 'launch', 'vehicle', 'to', 'orbit', 'the', 'Earth']


In [11]:
# Sentence tokenization with a regular expression
import re
text = """Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet 
species by building a self-sustaining city on, Mars. In 2008, SpaceX’s Falcon 1 became the first privately developed 
liquid-fuel launch vehicle to orbit the Earth."""
# Break the text after any sentence-ending mark (., ! or ?) that is followed
# by a space; the matched mark and space are not kept in the pieces.
sentences = re.split('[.!?] ', text)
sentences

['Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet \nspecies by building a self-sustaining city on, Mars',
 'In 2008, SpaceX’s Falcon 1 became the first privately developed \nliquid-fuel launch vehicle to orbit the Earth.']

<h1> Natural language toolkit (NLTK)</h1>

In [9]:
pip install nltk==3.9.1

Note: you may need to restart the kernel to use updated packages.


In [3]:
import nltk
# Fetch the Punkt tokenizer data used by NLTK's word and sentence tokenizers.
# NOTE(review): NLTK 3.9.x appears to load 'punkt_tab' rather than the older
# 'punkt' package — confirm whether the 'punkt' download is still needed.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hoangvulinh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/hoangvulinh/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
# Word tokenization
from nltk.tokenize import word_tokenize
text = """Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet 
species by building a self-sustaining city on Mars. In 2008, SpaceX’s Falcon 1 became the first privately developed 
liquid-fuel launch vehicle to orbit the Earth."""
# word_tokenize emits punctuation (',', '.', '’') as separate tokens while
# keeping hyphenated words such as 'multi-planet' in one piece.
word_tokenize(text) 

['Founded',
 'in',
 '2002',
 ',',
 'SpaceX',
 '’',
 's',
 'mission',
 'is',
 'to',
 'enable',
 'humans',
 'to',
 'become',
 'a',
 'spacefaring',
 'civilization',
 'and',
 'a',
 'multi-planet',
 'species',
 'by',
 'building',
 'a',
 'self-sustaining',
 'city',
 'on',
 'Mars',
 '.',
 'In',
 '2008',
 ',',
 'SpaceX',
 '’',
 's',
 'Falcon',
 '1',
 'became',
 'the',
 'first',
 'privately',
 'developed',
 'liquid-fuel',
 'launch',
 'vehicle',
 'to',
 'orbit',
 'the',
 'Earth',
 '.']

In [6]:
# Sentence tokenization
from nltk.tokenize import sent_tokenize
text = """Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet 
species by building a self-sustaining city on Mars. In 2008, SpaceX’s Falcon 1 became the first privately developed 
liquid-fuel launch vehicle to orbit the Earth."""
# sent_tokenize returns one string per sentence and keeps the terminating
# full stop on each of them; newlines inside a sentence are preserved.
sent_tokenize(text)

['Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet \nspecies by building a self-sustaining city on Mars.',
 'In 2008, SpaceX’s Falcon 1 became the first privately developed \nliquid-fuel launch vehicle to orbit the Earth.']

<h1> Tokenization using the spaCy library </h1>

In [8]:
!pip install -U spacy
!python -m spacy download en

Collecting spacy
  Downloading spacy-3.8.3-cp310-cp310-macosx_11_0_arm64.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting thinc<8.4.0,>=8.3.0
  Downloading thinc-8.3.3-cp310-cp310-macosx_11_0_arm64.whl (779 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.4/779.4 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.10-cp310-cp310-macosx_11_0_arm64.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.6/41.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting langcodes<4.0.0,>=3.2.0
  Using cached langcodes-3.5.0-py3-none-any.whl (182 kB)
Collecting wasabi<1.2.0,>=0.9.1
  Using cached wasabi-1.1.3-py3-none-any.whl (27 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.9-cp310-cp310-macosx_11_0_arm64.whl (127 kB)
[2K     [90m━━━━━━━━━━━━

In [9]:
#Word tokenization

In [10]:
from spacy.lang.en import English
# Create a blank English pipeline. English() provides the language's tokenizer
# rules only; no tagger, parser, NER or word vectors are loaded here.
nlp = English()

In [11]:
text = """Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet 
species by building a self-sustaining city on Mars. In 2008, SpaceX’s Falcon 1 became the first privately developed 
liquid-fuel launch vehicle to orbit the Earth."""
# Calling nlp on the raw string returns a document that can be iterated
# token by token.
my_doc = nlp(text)

# Collect the surface text of every token in document order
token_list = [token.text for token in my_doc]

token_list

['Founded',
 'in',
 '2002',
 ',',
 'SpaceX',
 '’s',
 'mission',
 'is',
 'to',
 'enable',
 'humans',
 'to',
 'become',
 'a',
 'spacefaring',
 'civilization',
 'and',
 'a',
 'multi',
 '-',
 'planet',
 '\n',
 'species',
 'by',
 'building',
 'a',
 'self',
 '-',
 'sustaining',
 'city',
 'on',
 'Mars',
 '.',
 'In',
 '2008',
 ',',
 'SpaceX',
 '’s',
 'Falcon',
 '1',
 'became',
 'the',
 'first',
 'privately',
 'developed',
 '\n',
 'liquid',
 '-',
 'fuel',
 'launch',
 'vehicle',
 'to',
 'orbit',
 'the',
 'Earth',
 '.']

In [13]:
# Sentence tokenization
from spacy.lang.en import English

# Start from a blank English pipeline and attach the "sentencizer" component,
# which is what makes doc.sents available below.
nlp = English()
nlp.add_pipe("sentencizer")

text = """Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet 
species by building a self-sustaining city on Mars. In 2008, SpaceX’s Falcon 1 became the first privately developed 
liquid-fuel launch vehicle to orbit the Earth."""

# Process the text, then gather the text of each detected sentence
doc = nlp(text)
sents_list = [sent.text for sent in doc.sents]

sents_list

['Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet \nspecies by building a self-sustaining city on Mars.',
 'In 2008, SpaceX’s Falcon 1 became the first privately developed \nliquid-fuel launch vehicle to orbit the Earth.']

<h1> Tokenization using Keras </h1>

In [14]:
!pip install Keras

Collecting Keras
  Downloading keras-3.8.0-py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hCollecting optree
  Downloading optree-0.13.1-cp310-cp310-macosx_11_0_arm64.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.8/311.8 kB[0m [31m377.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting h5py
  Downloading h5py-3.12.1-cp310-cp310-macosx_11_0_arm64.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m347.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting absl-py
  Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.7/133.7 kB[0m [31m823.3 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting ml-dtypes
  Downloading ml_dtypes-0.5.1-cp310-cp310-macosx_10_9_universal2.whl (671 kB)
[2K     [90

In [20]:
!pip install tensorflow



In [19]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# Define text
text = """Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization 
and a multi-planet species by building a self-sustaining city on Mars. In 2008, SpaceX’s Falcon 1 
became the first privately developed liquid-fuel launch vehicle to orbit the Earth."""

# Tokenize text.
# With its default arguments text_to_word_sequence lower-cases the input and
# drops punctuation such as ',', '.' and '-' (so 'multi-planet' becomes
# 'multi', 'planet'); the typographic apostrophe in 'spacex’s' is kept.
tokens = text_to_word_sequence(text)

print(tokens)


['founded', 'in', '2002', 'spacex’s', 'mission', 'is', 'to', 'enable', 'humans', 'to', 'become', 'a', 'spacefaring', 'civilization', 'and', 'a', 'multi', 'planet', 'species', 'by', 'building', 'a', 'self', 'sustaining', 'city', 'on', 'mars', 'in', '2008', 'spacex’s', 'falcon', '1', 'became', 'the', 'first', 'privately', 'developed', 'liquid', 'fuel', 'launch', 'vehicle', 'to', 'orbit', 'the', 'earth']


<h1> Tokenization using Gensim </h1>

In [21]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp310-cp310-macosx_11_0_arm64.whl (24.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting scipy<1.14.0,>=1.7.0
  Downloading scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl (30.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.3/30.3 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
Installing collected packages: scipy, gensim
  Attempting uninstall: scipy
    Found existing installation: scipy 1.15.0
    Uninstalling scipy-1.15.0:
      Successfully uninstalled scipy-1.15.0
Successfully installed gensim-4.3.3 scipy-1.13.1


In [22]:
# Word tokenization
from gensim.utils import tokenize
text = """Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet 
species by building a self-sustaining city on Mars. In 2008, SpaceX’s Falcon 1 became the first privately developed 
liquid-fuel launch vehicle to orbit the Earth."""
# tokenize() is lazy, hence the list() call. It yields alphabetic tokens only:
# punctuation is dropped and so are the numbers '2002', '2008' and '1'.
list(tokenize(text))

['Founded',
 'in',
 'SpaceX',
 's',
 'mission',
 'is',
 'to',
 'enable',
 'humans',
 'to',
 'become',
 'a',
 'spacefaring',
 'civilization',
 'and',
 'a',
 'multi',
 'planet',
 'species',
 'by',
 'building',
 'a',
 'self',
 'sustaining',
 'city',
 'on',
 'Mars',
 'In',
 'SpaceX',
 's',
 'Falcon',
 'became',
 'the',
 'first',
 'privately',
 'developed',
 'liquid',
 'fuel',
 'launch',
 'vehicle',
 'to',
 'orbit',
 'the',
 'Earth']

In [31]:
# Sentence tokenization
# NOTE: gensim.summarization was removed in gensim 4.0, so this import only
# works with gensim 3.x. The pinned `gensim==3.4.0` install cell must be run
# (and the kernel restarted) before this cell.
from gensim.summarization.textcleaner import split_sentences
text = """Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet 
species by building a self-sustaining city on Mars. In 2008, SpaceX’s Falcon 1 became the first privately developed 
liquid-fuel launch vehicle to orbit the Earth."""
# split_sentences also breaks at the newlines inside the text, so the two
# sentences come back as four pieces.
result = split_sentences(text)
result

['Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet ',
 'species by building a self-sustaining city on Mars.',
 'In 2008, SpaceX’s Falcon 1 became the first privately developed ',
 'liquid-fuel launch vehicle to orbit the Earth.']

In [30]:
!pip install gensim==3.4.0

Collecting gensim==3.4.0
  Downloading gensim-3.4.0.tar.gz (22.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.2/22.2 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: gensim
  Building wheel for gensim (setup.py) ... [?25ldone
[?25h  Created wheel for gensim: filename=gensim-3.4.0-cp310-cp310-macosx_11_0_arm64.whl size=22417427 sha256=e7b6c91661ce63274d0e52c98bb0a290f3617ede20c3bb87d1b6815181b4962e
  Stored in directory: /Users/hoangvulinh/Library/Caches/pip/wheels/16/56/72/c8bcd3a4035940aebd17219a71ba4d53e28da1323103b05614
Successfully built gensim
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.8.2
    Uninstalling gensim-3.8.2:
      Successfully uninstalled gensim-3.8.2
Successfully installed gensim-3.4.0
