# Stemming

In [1]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()

In [2]:
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]

In [3]:
for w in example_words:
    print(ps.stem(w))

python
python
python
python
pythonli


In [4]:
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."

In [6]:
words = word_tokenize(new_text)
stemmed_text = ""
for w in words:
    print(ps.stem(w))
    stemmed_text += " " + ps.stem(w)

It
is
import
to
by
veri
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc
.


In [7]:
stemmed_text

' It is import to by veri pythonli while you are python with python . all python have python poorli at least onc .'

In [8]:
print(ps.stem('getting'))     # get
print(ps.stem('rabbits'))     # rabbit
print(ps.stem('xyzing'))       # xyze - it even works on non words!
print(ps.stem('quickly'))     # quick
print(ps.stem('slowly'))      # slowli

get
rabbit
xyze
quickli
slowli


# Lemmatizing

In [9]:
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
 
wnl = WordNetLemmatizer()

# A few more examples
print(wnl.lemmatize('getting', wn.VERB))          # get
print(wnl.lemmatize('rabbits', wn.NOUN))          # rabbit
print(wnl.lemmatize('xyzing', wn.NOUN))            # KeyError! - Doesn't work on non-words!
print(wnl.lemmatize('quickly', wn.ADV))          # quickly
print(wnl.lemmatize('puppy', wn.NOUN))           # slowly
print(wnl.lemmatize('better', wn.ADJ))         # good
print(wnl.lemmatize('oxen', wn.NOUN))        # ox
print(wnl.lemmatize('geese', wn.NOUN))      # goose
print(wnl.lemmatize('automobile', wn.NOUN))      # goose


get
rabbit
xyzing
quickly
puppy
good
ox
goose
automobile


In [17]:
car_synsets = wn.synsets('car')
car_synsets

[Synset('car.n.01'),
 Synset('car.n.02'),
 Synset('car.n.03'),
 Synset('car.n.04'),
 Synset('cable_car.n.01')]

In [16]:
car_synsets = wn.synsets('date')
car_synsets

[Synset('date.n.01'),
 Synset('date.n.02'),
 Synset('date.n.03'),
 Synset('date.n.04'),
 Synset('date.n.05'),
 Synset('date.n.06'),
 Synset('date.n.07'),
 Synset('date.n.08'),
 Synset('date.v.01'),
 Synset('date.v.02'),
 Synset('date.v.03'),
 Synset('go_steady.v.01'),
 Synset('date.v.05')]

In [13]:
for car in car_synsets:
    print("lemmas: ", car.lemmas())
    print("definition: ", car.definition())
    print("hypernyms:", car.hypernyms())
    print("hyponyms:", car.hyponyms())
    print("antonyms:", car.lemmas()[0].antonyms())
    print('-' * 40, '\n\n')

lemmas:  [Lemma('record.n.01.record')]
definition:  anything (such as a document or a phonograph record or a photograph) providing permanent evidence of or information about past events
hypernyms: [Synset('evidence.n.02')]
hyponyms: [Synset('file.n.01'), Synset('history.n.02'), Synset('memorabilia.n.01'), Synset('stub.n.04'), Synset('working_papers.n.01'), Synset('written_record.n.01')]
antonyms: []
---------------------------------------- 


lemmas:  [Lemma('phonograph_record.n.01.phonograph_record'), Lemma('phonograph_record.n.01.phonograph_recording'), Lemma('phonograph_record.n.01.record'), Lemma('phonograph_record.n.01.disk'), Lemma('phonograph_record.n.01.disc'), Lemma('phonograph_record.n.01.platter')]
definition:  sound recording consisting of a disk with a continuous groove; used to reproduce music by rotating while a phonograph needle tracks in the groove
hypernyms: [Synset('sound_recording.n.01')]
hyponyms: [Synset('lp.n.01'), Synset('seventy-eight.n.02')]
antonyms: []
-----

In [19]:
import nltk 
def max_sim(w1, w2):
    msim = 0
    for s1, s2 in zip(wn.synsets(w1), wn.synsets(w2)):
        sim = s1.path_similarity(s2)
        if sim != None and sim > msim:
            msim = sim
    return msim

words = [("car", "automobile"), 
         ( "gem", "jewel"), 
         ( "journey", "voyage"), 
         ( "boy", "lad"), 
         ( "coast", "shore"), 
         ( "asylum", "madhouse"), 
         ( "magician", "wizard"), 
         ( "midday", "noon"), 
         ( "furnace", "stove"), 
         ( "food", "fruit"), 
         ( "bird", "crane"), 
         ( "tool", "implement"), 
         ( "brother", "monk"), 
         ( "lad", "brother"), 
         ( "crane", "implement"), 
         ( "journey", "car"), 
         ( "monk", "oracle"), 
         ( "cemetery", "woodland"), 
         ( "food", "rooster"), 
         ( "coast", "hill"), 
         ( "forest", "graveyard"), 
         ( "shore", "woodland"), 
         ( "monk", "slave"), 
         ( "coast", "forest"), 
         ( "lad", "wizard"), 
         ( "chord", "smile"), 
         ( "glass", "magician"), 
         ( "rooster", "voyage"), 
         ( "noon", "string")]

paths = {}
for w1, w2 in words:
    paths[(w1, w2)] = max_sim(w1, w2)
    #paths[w1, w2] = max([ s1.path_similarity(s2) for s1, s2 in zip(wn.synsets(w1), wn.synsets(w2)) ])
    
print("\nSimilarity by path_similarity")
for item in nltk.FreqDist(paths).items():
    print(item)



Similarity by path_similarity
(('car', 'automobile'), 1.0)
(('gem', 'jewel'), 0.125)
(('journey', 'voyage'), 0.25)
(('boy', 'lad'), 0.3333333333333333)
(('coast', 'shore'), 0.5)
(('asylum', 'madhouse'), 0.125)
(('magician', 'wizard'), 1.0)
(('midday', 'noon'), 1.0)
(('furnace', 'stove'), 0.07692307692307693)
(('food', 'fruit'), 0.1)
(('bird', 'crane'), 0.1111111111111111)
(('tool', 'implement'), 0.5)
(('brother', 'monk'), 0.125)
(('lad', 'brother'), 0.14285714285714285)
(('crane', 'implement'), 0.1)
(('journey', 'car'), 0.08333333333333333)
(('monk', 'oracle'), 0.125)
(('cemetery', 'woodland'), 0.1111111111111111)
(('food', 'rooster'), 0.0625)
(('coast', 'hill'), 0.2)
(('forest', 'graveyard'), 0.07142857142857142)
(('shore', 'woodland'), 0.2)
(('monk', 'slave'), 0.2)
(('coast', 'forest'), 0.16666666666666666)
(('lad', 'wizard'), 0.2)
(('chord', 'smile'), 0.09090909090909091)
(('glass', 'magician'), 0.1111111111111111)
(('rooster', 'voyage'), 0.041666666666666664)
(('noon', 'string'), 