In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Tokenization is a common task when working with text data.**

**It consists of splitting an entire text into small units, also known as tokens.**

**Processing (NLP) projects have tokenization as the first step because it helps better understand the text we have.**

# **Table of Contents**
1. Simple tokenization with .split
2. Tokenization with NLTK
3. Convert a corpus to a vector of token counts with Count Vectorizer (sklearn)
4. Tokenize text using TextBlob

In [2]:
# 1.  Simple tokenization with .split
text = """Here’s to the crazy ones, the misfits, the rebels, the troublemakers, 
the round pegs in the square holes. The ones who see things differently — they’re not fond of 
rules. You can quote them, disagree with them, glorify
or vilify them, but the only thing you can’t do is ignore them because they
change things. They push the human race forward, and while some may see them
as the crazy ones, we see genius, because the ones who are crazy enough to think
that they can change the world, are the ones who do."""
print(text.split())

['Here’s', 'to', 'the', 'crazy', 'ones,', 'the', 'misfits,', 'the', 'rebels,', 'the', 'troublemakers,', 'the', 'round', 'pegs', 'in', 'the', 'square', 'holes.', 'The', 'ones', 'who', 'see', 'things', 'differently', '—', 'they’re', 'not', 'fond', 'of', 'rules.', 'You', 'can', 'quote', 'them,', 'disagree', 'with', 'them,', 'glorify', 'or', 'vilify', 'them,', 'but', 'the', 'only', 'thing', 'you', 'can’t', 'do', 'is', 'ignore', 'them', 'because', 'they', 'change', 'things.', 'They', 'push', 'the', 'human', 'race', 'forward,', 'and', 'while', 'some', 'may', 'see', 'them', 'as', 'the', 'crazy', 'ones,', 'we', 'see', 'genius,', 'because', 'the', 'ones', 'who', 'are', 'crazy', 'enough', 'to', 'think', 'that', 'they', 'can', 'change', 'the', 'world,', 'are', 'the', 'ones', 'who', 'do.']


In [3]:
# 2. Tokenization with NLTK
# output is slightly different from the .split method showed above.
#  the apostrophe (‘) in “here’s” and the comma (,) in “ones,” were considered as tokens.
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
tokens= word_tokenize(text)
print(tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
['Here', '’', 's', 'to', 'the', 'crazy', 'ones', ',', 'the', 'misfits', ',', 'the', 'rebels', ',', 'the', 'troublemakers', ',', 'the', 'round', 'pegs', 'in', 'the', 'square', 'holes', '.', 'The', 'ones', 'who', 'see', 'things', 'differently', '—', 'they', '’', 're', 'not', 'fond', 'of', 'rules', '.', 'You', 'can', 'quote', 'them', ',', 'disagree', 'with', 'them', ',', 'glorify', 'or', 'vilify', 'them', ',', 'but', 'the', 'only', 'thing', 'you', 'can', '’', 't', 'do', 'is', 'ignore', 'them', 'because', 'they', 'change', 'things', '.', 'They', 'push', 'the', 'human', 'race', 'forward', ',', 'and', 'while', 'some', 'may', 'see', 'them', 'as', 'the', 'crazy', 'ones', ',', 'we', 'see', 'genius', ',', 'because', 'the', 'ones', 'who', 'are', 'crazy', 'enough', 'to', 'think', 'that', 'they', 'can', 'change', 'the', 'world', ',', 'are', 'the', 'ones', 'who', 'do', '.']


In [None]:
# another example
from nltk.tokenize import sent_tokenize, word_tokenize
  
text = ("Natural language processing (NLP) is a field " + 
       "of computer science, artificial intelligence " + 
       "and computational linguistics concerned with " +  
       "the interactions between computers and human " +  
       "(natural) languages, and, in particular, " +  
       "concerned with programming computers to " + 
       "fruitfully process large natural language " +  
       "corpora. Challenges in natural language " +  
       "processing frequently involve natural " + 
       "language understanding, natural language" +  
       "generation frequently from formal, machine" +  
       "-readable logical forms), connecting language " +  
       "and machine perception, managing human-" + 
       "computer dialog systems, or some combination " +  
       "thereof.")
  
print(sent_tokenize(text))
print(word_tokenize(text))

['Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora.', 'Challenges in natural language processing frequently involve natural language understanding, natural languagegeneration frequently from formal, machine-readable logical forms), connecting language and machine perception, managing human-computer dialog systems, or some combination thereof.']
['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'computer', 'science', ',', 'artificial', 'intelligence', 'and', 'computational', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'languages', ',', 'and', ',', 'in', 'particular', ',', 'concerned', 'with', 'programming', 'computers', 'to'

In [None]:
# 3. Convert a corpus to a vector of token counts with Count Vectorizer (sklearn)
# This becomes extremely useful when the dataframe contains a large corpus
# because it provides a matrix with words encoded as integers values, which are used as inputs in machine learning algorithms.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
texts = ["""Here’s to the crazy ones, the misfits, the rebels, the troublemakers, 
            the round pegs in the square holes. The ones who see things differently — they’re not fond of 
            rules. You can quote them, disagree with them, glorify
            or vilify them, but the only thing you can’t do is ignore them because they
            change things. They push the human race forward, and while some may see them
            as the crazy ones, we see genius, because the ones who are crazy enough to think
            that they can change the world, are the ones who do.""" ,
         'I choose a lazy person to do a hard job. Because a lazy person will find an easy way to do it.']

df = pd.DataFrame({'author': ['jobs', 'gates'], 'text':texts})
df

Unnamed: 0,author,text
0,jobs,"Here’s to the crazy ones, the misfits, the reb..."
1,gates,I choose a lazy person to do a hard job. Becau...


In [None]:
cv = CountVectorizer(stop_words='english')
cv_matrix = cv.fit_transform(df['text'])
df['text']

0    Here’s to the crazy ones, the misfits, the reb...
1    I choose a lazy person to do a hard job. Becau...
Name: text, dtype: object

In [None]:
cv_matrix=cv_matrix.toarray()
cv_matrix

array([[2, 0, 3, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 5, 1, 0, 1, 1,
        1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 0, 1],
       [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0, 2, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]])

In [None]:
df_tokens = pd.DataFrame(cv_matrix, index=df['author'].values, columns=cv.get_feature_names())
df_tokens



Unnamed: 0,change,choose,crazy,differently,disagree,easy,fond,forward,genius,glorify,hard,holes,human,ignore,job,lazy,misfits,ones,pegs,person,push,quote,race,rebels,round,rules,square,thing,things,think,troublemakers,vilify,way,world
jobs,2,0,3,1,1,0,1,1,1,1,0,1,1,1,0,0,1,5,1,0,1,1,1,1,1,1,1,1,2,1,1,1,0,1
gates,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [None]:
# 4. Tokenize text using TextBlob

In [None]:
text = ("Natural language processing (NLP) is a field " + 
       "of computer science, artificial intelligence " + 
       "and computational linguistics concerned with " +  
       "the interactions between computers and human " +  
       "(natural) languages, and, in particular, " +  
       "concerned with programming computers to " + 
       "fruitfully process large natural language " +  
       "corpora. Challenges in natural language " +  
       "processing frequently involve natural " + 
       "language understanding, natural language" +  
       "generation frequently from formal, machine" +  
       "-readable logical forms), connecting language " +  
       "and machine perception, managing human-" + 
       "computer dialog systems, or some combination " +  
       "thereof.")

from textblob import TextBlob
# create a TextBlob object
blob_object = TextBlob(text)

# tokenize paragraph into words.
print(" Word Tokenize :\n", blob_object.words)
  
# tokenize paragraph into sentences.
print("\n Sentence Tokenize :\n", blob_object.sentences)
    

 Word Tokenize :
 ['Natural', 'language', 'processing', 'NLP', 'is', 'a', 'field', 'of', 'computer', 'science', 'artificial', 'intelligence', 'and', 'computational', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'natural', 'languages', 'and', 'in', 'particular', 'concerned', 'with', 'programming', 'computers', 'to', 'fruitfully', 'process', 'large', 'natural', 'language', 'corpora', 'Challenges', 'in', 'natural', 'language', 'processing', 'frequently', 'involve', 'natural', 'language', 'understanding', 'natural', 'languagegeneration', 'frequently', 'from', 'formal', 'machine-readable', 'logical', 'forms', 'connecting', 'language', 'and', 'machine', 'perception', 'managing', 'human-computer', 'dialog', 'systems', 'or', 'some', 'combination', 'thereof']

 Sentence Tokenize :
 [Sentence("Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions

# **Examples**




In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp('THE NLP IS SUBBRANCH OF AI FEILD')


for token in doc:
  print(token.text, token.tag_, token.pos_, token.is_stop )
  


THE DT DET True
NLP NNP PROPN False
IS VBZ VERB True
SUBBRANCH NNP PROPN False
OF IN ADP True
AI NNP PROPN False
FEILD NN NOUN False


In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
doc= nlp('هذه هي الجملة الأولي و هذه هي الجملة الثانية والجملة الثالثة')
for i in doc.sents:
  print(i)

from nltk.tokenize import sent_tokenize,word_tokenize
sents= sent_tokenize('هذه هي الجملة الأولي ., هذه هي الجملة الثانية , والجملة الثالثة')
words= word_tokenize('هذه هي الجملة الأولي ., هذه هي الجملة الثانية , والجملة الثالثة')
print(sents,'\n',words)

هذه هي الجملة الأولي و
هذه هي
الجملة
الثانية
والجملة الثالثة
['هذه هي الجملة الأولي ., هذه هي الجملة الثانية , والجملة الثالثة'] 
 ['هذه', 'هي', 'الجملة', 'الأولي', '.', ',', 'هذه', 'هي', 'الجملة', 'الثانية', ',', 'والجملة', 'الثالثة']
