In [2]:
# import nltk
# nltk.download('all')

In [3]:
import nltk

# 1.1 Sentences tokenization
- Dividing paragraph to sentences

In [4]:
text_paragraph = "Move Forward now represents the bulk of the parliamentary opposition \
but will retain significant political clout having won the majority of seats in \
and around the capital Bangkok and taken over key urban centres and some conservative strongholds. \
The Harvard-educated Pita, 43, was twice denied by parliament in his efforts to become prime minister \
as military-appointed senators closed ranks to stop Move Forward, some over its controversial plans \
to amend a law that insulates the monarchy from criticism."

In [5]:
sent_tokenize_list = nltk.sent_tokenize(text_paragraph)

len(sent_tokenize_list)

2

# 1.2 Word tokenization
- Segment sentences to the list of words

In [6]:
sentence = "Let's see how it's working."
list_of_words = nltk.word_tokenize(sentence)

print(F"List of words :\n\n{ list_of_words }\n\nnumber of words : { len(list_of_words) }")

List of words :

['Let', "'s", 'see', 'how', 'it', "'s", 'working', '.']

number of words : 8


In [7]:
# This method will include apostrophe
nltk.RegexpTokenizer("[\w']+").tokenize(sentence)

# "TabTokenizer" is a word tokenizer by tab or \t
# "LineTokenizer" is a word tokenizer by line or \n

["Let's", 'see', 'how', "it's", 'working']

In [8]:
tweet = "This is a coool #Dummysmiley: :-) :-P <3 and some arrows <- ->"

# This method will count hashtag as a word
nltk.TweetTokenizer().tokenize(tweet)

['This',
 'is',
 'a',
 'coool',
 '#Dummysmiley',
 ':',
 ':-)',
 ':-P',
 '<3',
 'and',
 'some',
 'arrows',
 '<-',
 '->']

# Probability
- Frequency

In [9]:
fcount = nltk.FreqDist(sentence)

print( fcount )
print( F"The most frequency word : { fcount.most_common(2) }" )

<FreqDist with 15 samples and 27 outcomes>
The most frequency word : [(' ', 4), ('e', 3)]


# 2.1 Stopword

In [10]:
# Unrequire word in english
stop_word = set( nltk.corpus.stopwords.words("english") )

# stop_word

In [33]:
# filter_sen = []
# for w in nltk.word_tokenize(sentence) :
#     if w not in stop_word :
#         filter_sen.append(w)

# or use this method

filter_sen = [ w for w in nltk.word_tokenize(sentence) if w not in stop_word ]

print( F"Before using stop word :\n{nltk.word_tokenize(sentence)}\n" )
print( F"After using stop word :\n{filter_sen}\n" )

Before using stop word :
['Let', "'s", 'see', 'how', 'it', "'s", 'working', '.']

After using stop word :
['Let', "'s", 'see', "'s", 'working', '.']



# 3.Normalization
- STEM
- LEMMA

- Porter

In [12]:
ps = nltk.stem.PorterStemmer()

sentence1 = ['love','loved','loving','loves']
sentence2 = nltk.word_tokenize("Programmers program with programming languages")
sentence3 = ['fly','flying','corpus','corpura','study','studying','studies']

for w in sentence1 :
    print( w," : ",ps.stem(w) )

print(" ")

for w in sentence2 :
    print( w," : ",ps.stem(w) )

print(" ")

for w in sentence3 :
    print( w," : ",ps.stem(w) )

love  :  love
loved  :  love
loving  :  love
loves  :  love
 
Programmers  :  programm
program  :  program
with  :  with
programming  :  program
languages  :  languag
 
fly  :  fli
flying  :  fli
corpus  :  corpu
corpura  :  corpura
study  :  studi
studying  :  studi
studies  :  studi


- Snowball

In [13]:
snow = nltk.stem.SnowballStemmer("english")

for w in sentence1 :
    print( w," : ",snow.stem(w) )

print(" ")

for w in sentence2 :
    print( w," : ",snow.stem(w) )

print(" ")

for w in sentence3 :
    print( w," : ",snow.stem(w) )

love  :  love
loved  :  love
loving  :  love
loves  :  love
 
Programmers  :  programm
program  :  program
with  :  with
programming  :  program
languages  :  languag
 
fly  :  fli
flying  :  fli
corpus  :  corpus
corpura  :  corpura
study  :  studi
studying  :  studi
studies  :  studi


# Annotation
Named entity tagging

- Penn treebank POS tagset 

In [14]:
# grammar classification
# for more details : https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

for sen in nltk.word_tokenize(sentence) :
    print( nltk.pos_tag( nltk.word_tokenize( sen ) ) )

# or nltk.tokenize.PunktSentenceTokenizer

[('Let', 'VB')]
[("'s", 'POS')]
[('see', 'VB')]
[('how', 'WRB')]
[('it', 'PRP')]
[("'s", 'POS')]
[('working', 'VBG')]
[('.', '.')]


# Semantic
Giving a meaning of words

In [16]:
syn = nltk.corpus.wordnet.synsets('bank')

syn

[Synset('bank.n.01'),
 Synset('depository_financial_institution.n.01'),
 Synset('bank.n.03'),
 Synset('bank.n.04'),
 Synset('bank.n.05'),
 Synset('bank.n.06'),
 Synset('bank.n.07'),
 Synset('savings_bank.n.02'),
 Synset('bank.n.09'),
 Synset('bank.n.10'),
 Synset('bank.v.01'),
 Synset('bank.v.02'),
 Synset('bank.v.03'),
 Synset('bank.v.04'),
 Synset('bank.v.05'),
 Synset('deposit.v.02'),
 Synset('bank.v.07'),
 Synset('trust.v.01')]

In [17]:
syn[0].examples()

['they pulled the canoe up on the bank',
 'he sat on the bank of the river and watched the currents']

In [18]:
syn[0].definition()

'sloping land (especially the slope beside a body of water)'

In [23]:
myword = 'good'

syn = []
ant = []

for synset in nltk.corpus.wordnet.synsets(myword) :
    for lemm in synset.lemmas():
        syn.append(lemm.name())
        
        if lemm.antonyms() :
            ant.append( lemm.antonyms()[0].name() )

print( F"Synonym : {syn}\nAntonym : {ant}" )

Synonym : ['good', 'good', 'goodness', 'good', 'goodness', 'commodity', 'trade_good', 'good', 'good', 'full', 'good', 'good', 'estimable', 'good', 'honorable', 'respectable', 'beneficial', 'good', 'good', 'good', 'just', 'upright', 'adept', 'expert', 'good', 'practiced', 'proficient', 'skillful', 'skilful', 'good', 'dear', 'good', 'near', 'dependable', 'good', 'safe', 'secure', 'good', 'right', 'ripe', 'good', 'well', 'effective', 'good', 'in_effect', 'in_force', 'good', 'good', 'serious', 'good', 'sound', 'good', 'salutary', 'good', 'honest', 'good', 'undecomposed', 'unspoiled', 'unspoilt', 'good', 'well', 'good', 'thoroughly', 'soundly', 'good']
Antonym : ['evil', 'evilness', 'bad', 'badness', 'bad', 'evil', 'ill']


- To chunk the given list of tagged token

In [32]:
parse_tree = nltk.ne_chunk( nltk.pos_tag(text_paragraph.split()), binary=True )

# binary = True --> identify NE
# binary = False --> identify NE categories

# print( parse_tree )

In [31]:
# name_entities = list()
# for t in parse_tree.subtrees():
#     if t.label() == 'NE' :
#         name_entities.append(t)

# or

name_entities = [ t for t in parse_tree.subtrees() if t.label() == 'NE' ]

print(name_entities)

[Tree('NE', [('Move', 'NNP'), ('Forward', 'NNP')]), Tree('NE', [('Bangkok', 'NNP')]), Tree('NE', [('Move', 'NNP')])]


In [2]:
import gensim

print(gensim.__version__)

4.3.0
