In [144]:
import nltk

Stemming

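Stemming is a text-processing technique that strips affixes (in practice mostly suffixes) from words to reduce them to a common root form, called the stem. The stem is not guaranteed to be a real dictionary word: for example, the Porter stemmer below maps `swiftly` to `swiftli`.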

In [145]:
from nltk.stem.porter import *

In [149]:
# English sample input for the Porter stemmer: several forms built on "play"
# plus one "-ly" adverb.
words = [
    'play',
    'plays',
    'playing',
    'player',
    'played',
    'swiftly',
]

In [150]:
# Porter stemmer from NLTK; it is applied to the English `words` list below.
pstemmer = PorterStemmer()

In [151]:
# Print every sample word next to its Porter stem, separated by two tabs.
for word in words:
    print(word, pstemmer.stem(word), sep="\t\t")

play		play
plays		play
playing		play
player		player
played		play
swiftly		swiftli


In [152]:
from nltk.stem.snowball import SnowballStemmer

In [154]:
# German sample input for the Snowball stemmer: verbs in their "-en" infinitive form.
words2 = [
    'machen',
    'suchen',
    'brauchen',
    'kaufen',
    'arbeiten',
]

In [155]:
# Snowball stemmer configured for German; it is applied to `words2` below.
sstemmer = SnowballStemmer(language = 'german')

In [156]:
# Print every German word next to its Snowball stem, separated by two tabs.
for word in words2:
    print(word, sstemmer.stem(word), sep="\t\t")

machen		mach
suchen		such
brauchen		brauch
kaufen		kauf
arbeiten		arbeit


Lemmatization

In [157]:
import spacy
# English spaCy pipeline. The `en_core_web_sm` model package has to be installed
# in the environment beforehand, otherwise this load fails.
nlp1 = spacy.load('en_core_web_sm')

In [158]:
# Run the English pipeline on two short sentences and list, per token:
# text, POS tag, integer lemma ID (`lemma`), lemma string (`lemma_`)
# and dependency label.
doc1 = nlp1("I'm searching for my shoes. Have you seen them?")
for token in doc1:
    # sep reproduces the " \t\t " spacing between columns.
    print(token.text, token.pos_, token.lemma, token.lemma_, token.dep_, sep=" \t\t ")

I 		 PRON 		 4690420944186131903 		 I 		 nsubj
'm 		 AUX 		 10382539506755952630 		 be 		 aux
searching 		 VERB 		 295895373269394349 		 search 		 ROOT
for 		 ADP 		 16037325823156266367 		 for 		 prep
my 		 PRON 		 227504873216781231 		 my 		 poss
shoes 		 NOUN 		 12623266062479156681 		 shoe 		 pobj
. 		 PUNCT 		 12646065887601541794 		 . 		 punct
Have 		 AUX 		 14692702688101715474 		 have 		 aux
you 		 PRON 		 7624161793554793053 		 you 		 nsubj
seen 		 VERB 		 11925638236994514241 		 see 		 ROOT
them 		 PRON 		 16875582379069451158 		 they 		 dobj
? 		 PUNCT 		 8205403955989537350 		 ? 		 punct


German Language

In [159]:
# German spaCy pipeline. Note the naming: `nlp` is the German model, while the
# English model loaded earlier is `nlp1`.
nlp = spacy.load('de_core_news_sm')

In [160]:
doc = nlp("Ich suche meine Schuhe. Hast du sie gesehen?")
# English translation: "I'm searching for my shoes. Have you seen them?"

In [161]:
# Per token of the German sentence: text, POS tag, integer lemma ID, lemma string.
for token in doc:
    # sep reproduces the " \t\t " spacing between columns.
    print(token.text, token.pos_, token.lemma, token.lemma_, sep=" \t\t ")

Ich 		 PRON 		 5864527961345014045 		 ich
suche 		 VERB 		 18313823129771624139 		 suchen
meine 		 DET 		 7570793064135359215 		 mein
Schuhe 		 NOUN 		 3240750755112541786 		 Schuh
. 		 PUNCT 		 10501404726543969396 		 --
Hast 		 VERB 		 13575293068407610524 		 Hast
du 		 PRON 		 17166863280368009634 		 du
sie 		 PRON 		 13323500956662843128 		 sie
gesehen 		 VERB 		 5513153705242160378 		 sehen
? 		 PUNCT 		 10501404726543969396 		 --


In [163]:
def show_lemmas(doc3):
    """Print one aligned row per token of ``doc3``.

    Columns are separated by a single space and padded on the right:
    ``text`` (width 14), ``pos_`` (width 8), ``lemma`` (width 22),
    ``lemma_`` (width 14) and ``dep_`` (width 14). The last column is padded
    as well, so rows shorter than the full width end in trailing spaces.

    Args:
        doc3: Iterable of token-like objects exposing the five attributes
            listed above, e.g. a parsed spaCy document.

    Returns:
        None. Rows are written to stdout; an empty iterable prints nothing.
    """
    row_template = "{:14} {:8} {:<22} {:14} {:14}"
    for tok in doc3:
        print(row_template.format(tok.text, tok.pos_, tok.lemma, tok.lemma_, tok.dep_))

In [83]:
doc2 = nlp("Der schnelle braune Fuchs springt über den faulen Hund.")
# English translation: "The quick brown fox jumps over the lazy dog."

In [164]:
# Tabulate text, POS, lemma ID, lemma and dependency label for each token of doc2.
show_lemmas(doc2)

Der            DET      9250722957692387333    der            nk            
schnelle       ADJ      12679834086485348841   schnell        nk            
braune         ADJ      16884188031653587216   braun          nk            
Fuchs          PROPN    7428125815056104837    Fuchs          sb            
springt        VERB     10557456600538100282   springen       ROOT          
über           ADP      1502415808165053963    über           mo            
den            DET      9250722957692387333    der            nk            
faulen         ADJ      12760680308645965638   faul           nk            
Hund           NOUN     4759838677326765497    Hund           nk            
.              PUNCT    10501404726543969396   --             punct         


Lemmatization using WordNetLemmatizer

In [165]:
from nltk.stem import WordNetLemmatizer

In [166]:
# Lemmatize a single word with NLTK's WordNet lemmatizer.
# pos='a' requests adjective lemmatization; the recorded output below shows
# "faster" being reduced to "fast".
lemmatizer = WordNetLemmatizer()
text = "faster"
result = lemmatizer.lemmatize(text, pos='a')
print(text, result, sep="----")

faster----fast


Stop Words

In [167]:
# German stop words: `nlp` is the de_core_news_sm pipeline loaded earlier.
# The value is printed as a set, so the order of the words is not meaningful.
print(nlp.Defaults.stop_words)

{'daher', 'gleich', 'ihren', 'indem', 'endlich', 'gibt', 'drin', 'sondern', 'diesen', 'ihn', 'jeden', 'das', 'beispiel', 'vom', 'ehrlich', 'müssen', 'zehn', 'für', 'großen', 'ihr', 'offen', 'oder', 'nicht', 'ausser', 'durften', 'siebentes', 'drittes', 'ins', 'dieses', 'erstes', 'lange', 'vierten', 'rechtes', 'grossen', 'ging', 'mit', 'her', 'drei', 'auch', 'wo', 'darf', 'darüber', 'seines', 'eigenen', 'gegen', 'sowie', 'neunter', 'machte', 'sehr', 'können', 'keiner', 'lang', 'des', 'na', 'meines', 'eigen', 'jenes', 'solcher', 'en', 'lieber', 'darauf', 'ganze', 'willst', 'wem', 'dass', 'diese', 'sagte', 'eigene', 'manchen', 'solchen', 'dahin', 'konnten', 'weiteres', 'großer', 'daselbst', 'achte', 'wird', 'davon', 'seinem', 'dagegen', 'eben', 'niemanden', 'an', 'ihm', 'gern', 'zehnte', 'seit', 'demgegenüber', 'dementsprechend', 'auf', 'bald', 'sollte', 'muß', 'gross', 'hätten', 'statt', 'will', 'anderem', 'währenddem', 'wann', 'da', 'nur', 'magst', 'zusammen', 'welcher', 'allerdings', 'd