## **TP 3 - Analyse Lexicale et PreProcessing avec NLTK.**

### **Installer et importer les bibliothèques et modules**

In [None]:
!pip install nltk

In [1]:
import nltk

In [None]:
# Punkt models back sent_tokenize / word_tokenize.
# Recent NLTK releases (3.9+) load the 'punkt_tab' resource instead of the
# pickled 'punkt' one, so fetch both to keep the notebook working on either.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Stop-word lists, read later in the notebook through nltk.corpus.stopwords
nltk.download('stopwords')

In [None]:
# WordNet data, used later by WordNetLemmatizer and wordnet.synsets
nltk.download('wordnet')

In [6]:
# importing tokenization 
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer

# importing the stopwords
from nltk.corpus import stopwords

# importing Porter and Lancaster stemmers from nltk
from nltk.stem import PorterStemmer, LancasterStemmer

# importing WordNetLemmatizer
from nltk.stem import WordNetLemmatizer

# importing wordnet
from nltk.corpus import wordnet

### Sentence Tokenization

In [7]:
# Sentence segmentation of an English paragraph; the abbreviation "U.S."
# contains periods that must not be taken for sentence boundaries.
text = 'This is a text written. It uses U.S. english to illustrate sentence tokenization.'
sents = sent_tokenize(text, language='english')
print(sents)

['This is a text written.', 'It uses U.S. english to illustrate sentence tokenization.']


In [8]:
# Same segmentation, this time with the French Punkt model.
fr_text = (
    "Ce texte est écrit. "
    "Il a comme but d'illustrer la segmentation d'un texte en français."
)
fr_sents = sent_tokenize(fr_text, 'french')
print(fr_sents)

['Ce texte est écrit.', "Il a comme but d'illustrer la segmentation d'un texte en français."]


### Word Tokenization - different algorithms

In [9]:
# Default NLTK word tokenizer on English text; the possessive "'s" comes out
# as its own token (see the output below).
text = "This is a text written. It uses U.S. english to illustrate word's tokenization."
words = word_tokenize(text, language='english')
print(words)

['This', 'is', 'a', 'text', 'written', '.', 'It', 'uses', 'U.S.', 'english', 'to', 'illustrate', 'word', "'s", 'tokenization', '.']


In [10]:
# Word tokenization with language='french'. Elided forms such as "d'illustrer"
# are kept as a single token (see the output below).
fr_text = (
    "Ce texte est écrit. "
    "Il a comme but d'illustrer la segmentation d'un texte en français."
)
fr_words = word_tokenize(fr_text, 'french')
print(fr_words)

['Ce', 'texte', 'est', 'écrit', '.', 'Il', 'a', 'comme', 'but', "d'illustrer", 'la', 'segmentation', "d'un", 'texte', 'en', 'français', '.']


In [11]:
# Penn Treebank-style tokenization, implemented with regular expressions.
# Applied to multi-sentence text, only the very last period is split off:
# 'York.' and 'them.' keep theirs in the output below.
treebank = TreebankWordTokenizer()
s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
tokens = treebank.tokenize(s)
print(tokens)

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']


In [12]:
s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."

# Raw string: \w, \$, \d and \S are regex escapes, not Python string escapes;
# a plain literal triggers "invalid escape sequence" warnings.
# Alternatives are tried left to right: a word, a dollar amount, or any other
# run of non-whitespace characters.
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
tokens = tokenizer.tokenize(s)

print(tokens)

['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']


In [13]:
s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."

# Function form of the tokenizer above, with the same pattern.
# Raw string so that \w, \$, \d and \S reach the regex engine untouched instead
# of being parsed (with a warning) as invalid Python string escapes.
tokens = regexp_tokenize(s, pattern=r'\w+|\$[\d\.]+|\S+')

print(tokens)

['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']


In [14]:
# Tokenizer aimed at tweets: hashtags, emoticons and ASCII arrows each come out
# as a single token (see the output below).
s = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
tknzr = TweetTokenizer()
tokens = tknzr.tokenize(s)
print(tokens)

['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']


### Stop-word removal

In [15]:
# List the file ids of the stop-word corpus: one entry per available language
stopwords.fileids()

['arabic',
 'azerbaijani',
 'bengali',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

In [16]:
# English stop-word list; preview the first ten entries.
esw = stopwords.words('english')
esw[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [17]:
# French stop-word list; preview the first ten entries.
fsw = stopwords.words('french')
fsw[:10]

['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle']

In [18]:
words = ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.', 
         'Please', 'buy', 'me', 'two', 'of', 'them',  '.', 'Thanks', '.']

# Build a set once so each membership test is O(1) instead of a list scan.
esw_set = set(esw)

# The stop-word list is lowercase, so compare on the lowercased token while
# keeping the original casing in the result.
filtered = [w for w in words if w.lower() not in esw_set]

print(filtered)

['Good', 'muffins', 'cost', '$3.88', 'New', 'York', '.', 'Please', 'buy', 'two', '.', 'Thanks', '.']


### **Stemming** - Porter and Lancaster

In [19]:
# Porter vs. Lancaster on a handful of words.
# The Lancaster stemmer is simple but aggressive: it applies its rules
# iteratively, so it tends to over-stem and return shorter stems than Porter.
# Over-stemmed results may not be real words or carry any meaning.

# One instance of each stemmer, reused by the cells below
porter = PorterStemmer()
lancaster = LancasterStemmer()

sample_words = ("cats", "trouble", "troubling", "troubled", "probably")

# - Stemming single words - PorterStemmer
print("- Porter Stemmer")
for sample in sample_words:
    print(porter.stem(sample))

print()

# - Stemming single words - LancasterStemmer
print("- Lancaster Stemmer")
for sample in sample_words:
    print(lancaster.stem(sample))

- Porter Stemmer
cat
troubl
troubl
troubl
probabl

- Lancaster Stemmer
cat
troubl
troubl
troubl
prob


In [20]:
# Inspect the snowball module: the *Stemmer names in the listing are the
# language-specific stemmers it provides (the listing also includes private
# and dunder names).
from nltk.stem import snowball
dir(snowball)

['ArabicStemmer',
 'DanishStemmer',
 'DutchStemmer',
 'EnglishStemmer',
 'FinnishStemmer',
 'FrenchStemmer',
 'GermanStemmer',
 'HungarianStemmer',
 'ItalianStemmer',
 'NorwegianStemmer',
 'PorterStemmer',
 'PortugueseStemmer',
 'RomanianStemmer',
 'RussianStemmer',
 'SnowballStemmer',
 'SpanishStemmer',
 'StemmerI',
 'SwedishStemmer',
 '_LanguageSpecificStemmer',
 '_ScandinavianStemmer',
 '_StandardStemmer',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'demo',
 'porter',
 'prefix_replace',
 're',
 'stopwords',
 'suffix_replace']

In [21]:
# - Stemming a list of words, Porter and Lancaster side by side
word_list = [
    "friend", "friendship", "friends", "friendships", "stabil",
    "destabilize", "misunderstanding", "railroad", "moonlight", "football",
]

# Shared row layout: two 20-character columns followed by a free-width one
row_fmt = "{0:20}{1:20}{2}"

print(row_fmt.format("Word", "Porter Stemmer", "Lancaster Stemmer"))
for word in word_list:
    porter_stem = porter.stem(word)
    lancaster_stem = lancaster.stem(word)
    print(row_fmt.format(word, porter_stem, lancaster_stem))

Word                Porter Stemmer      Lancaster Stemmer
friend              friend              friend
friendship          friendship          friend
friends             friend              friend
friendships         friendship          friend
stabil              stabil              stabl
destabilize         destabil            dest
misunderstanding    misunderstand       misunderstand
railroad            railroad            railroad
moonlight           moonlight           moonlight
football            footbal             footbal


In [22]:
# - Stemming a sentence with word tokenization (word_tokenize keeps punctuation as tokens)

sentence = "Pythoners are very intelligent, and work very pythonly and now they are pythoning their way to success."

# Tokenization
token_words = word_tokenize(sentence)
print('Tokens:', token_words)

# Stemming: one Porter stem per token, in order
stem_sentence = [porter.stem(token) for token in token_words]
print('Stems: ', stem_sentence)

Tokens: ['Pythoners', 'are', 'very', 'intelligent', ',', 'and', 'work', 'very', 'pythonly', 'and', 'now', 'they', 'are', 'pythoning', 'their', 'way', 'to', 'success', '.']
Stems:  ['python', 'are', 'veri', 'intellig', ',', 'and', 'work', 'veri', 'pythonli', 'and', 'now', 'they', 'are', 'python', 'their', 'way', 'to', 'success', '.']


In [31]:
# - Stemming a sentence with tokenization, without stopwords
# (the 'stopwords' corpus is already downloaded in the setup cells at the top
# of the notebook, so it is not fetched again here)

# English stop words, plus a set view for O(1) membership tests
esw = stopwords.words('english')
esw_set = set(esw)

sentence = "Pythoners are very intelligent, and work very pythonly and now they are pythoning their way to success."

# Tokenization
token_words = word_tokenize(sentence)
print('Tokens - before:', token_words)

# Eliminate the stop words from the tokens.
# The stop-word list is lowercase, so compare on the lowercased token;
# otherwise capitalised stop words ("The", "And", ...) would slip through.
clean_tokens = [token for token in token_words if token.lower() not in esw_set]
print('Tokens - after:', clean_tokens)

# Stemming
stem_sentence = [porter.stem(token) for token in clean_tokens]
print('Stems: ', stem_sentence)

Tokens - before: ['Pythoners', 'are', 'very', 'intelligent', ',', 'and', 'work', 'very', 'pythonly', 'and', 'now', 'they', 'are', 'pythoning', 'their', 'way', 'to', 'success', '.']
Tokens - after: ['Pythoners', 'intelligent', ',', 'work', 'pythonly', 'pythoning', 'way', 'success', '.']
Stems:  ['python', 'intellig', ',', 'work', 'pythonli', 'python', 'way', 'success', '.']


In [32]:
# - Stemming a sentence with word tokenization (punctuation removed with RegexpTokenizer)
# (the 'stopwords' corpus is already downloaded in the setup cells at the top
# of the notebook, so it is not fetched again here)

# English stop words, plus a set view for O(1) membership tests
esw = stopwords.words('english')
esw_set = set(esw)

sentence = "Pythoners are very intelligent, and work very pythonly and now they are pythoning their way to success."

# Tokenization: r'\w+' keeps only runs of word characters, which drops punctuation
tokenizer = RegexpTokenizer(r'\w+')
token_words = tokenizer.tokenize(sentence)
print('Tokens - before:', token_words)

# Eliminate the stop words from the tokens.
# The stop-word list is lowercase, so compare on the lowercased token;
# otherwise capitalised stop words ("The", "And", ...) would slip through.
clean_tokens = [token for token in token_words if token.lower() not in esw_set]
print('Tokens - after:', clean_tokens)

# Stemming
stem_sentence = [porter.stem(token) for token in clean_tokens]
print('Stems: ', stem_sentence)

Tokens - before: ['Pythoners', 'are', 'very', 'intelligent', 'and', 'work', 'very', 'pythonly', 'and', 'now', 'they', 'are', 'pythoning', 'their', 'way', 'to', 'success']
Tokens - after: ['Pythoners', 'intelligent', 'work', 'pythonly', 'pythoning', 'way', 'success']
Stems:  ['python', 'intellig', 'work', 'pythonli', 'python', 'way', 'success']


### **Lemmatization**  - WordNet Lemmatizer

In [33]:
# - Lemmatization - WordNet Lemmatizer, without and with a part-of-speech hint

# download wordnet
nltk.download('wordnet')

# instantiate the lemmatizer object
lemmatizer = WordNetLemmatizer()

# Without context: no POS is given, so the lemmatizer falls back to its
# default (noun), which is why "are" comes back unchanged.
for token in ("bats", "feet", "are"):
    print(lemmatizer.lemmatize(token))

# With context: the same surface form can yield different lemmas per POS tag
tagged_tokens = [
    ("are", 'v'),
    ("swimming", 'v'),
    ("swimming", 'n'),
    ("stripes", 'v'),
    ("stripes", 'n'),
]
for token, tag in tagged_tokens:
    print(lemmatizer.lemmatize(token, pos=tag))

bat
foot
are
be
swim
swimming
strip
stripe


In [26]:
# Difference between stemming and lemmatization on one irregular plural:
# the stem is a truncated form, the lemma is a dictionary word (see output).

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

word = 'wolves'
stem, lemma = stemmer.stem(word), lemmatizer.lemmatize(word)

stem, lemma

('wolv', 'wolf')

In [27]:
# - Lemmatize a sentence
sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."

# Split the sentence into word tokens; r'\w+' drops the punctuation
tokenizer = RegexpTokenizer(r'\w+')
sentence_words = tokenizer.tokenize(sentence)

# Without context: no POS tag is passed, so verb forms such as "running" are
# left untouched and "was"/"has" lose their final "s" (see the output below).
lemma_row = "{0:20}{1}"
print(lemma_row.format("Word", "Lemma"))
for word in sentence_words:
    lemma = lemmatizer.lemmatize(word)
    print(lemma_row.format(word, lemma))

Word                Lemma
He                  He
was                 wa
running             running
and                 and
eating              eating
at                  at
same                same
time                time
He                  He
has                 ha
bad                 bad
habit               habit
of                  of
swimming            swimming
after               after
playing             playing
long                long
hours               hour
in                  in
the                 the
Sun                 Sun


### Distance - Word similarity

In [28]:
# Take the first synset WordNet returns for 'dog'
dog = wordnet.synsets('dog')[0]

dog

Synset('dog.n.01')

In [29]:
# Take the first synset WordNet returns for 'cat'
cat = wordnet.synsets('cat')[0]

cat

Synset('cat.n.01')

In [30]:
# Wu-Palmer Similarity
# Wu-Palmer similarity between the 'dog' and 'cat' synsets selected in the cells above
dog.wup_similarity(cat)

0.8571428571428571