In [71]:
import pandas as pd

import nltk

from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
from nltk.stem.lancaster import LancasterStemmer
lancaster_stemmer = LancasterStemmer()
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()


from nltk.corpus import stopwords
stops = stopwords.words('english')
from string import punctuation

In [76]:
testStr = "This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts."

tokens = nltk.word_tokenize(testStr)
print(tokens)
tokens = nltk.wordpunct_tokenize(testStr)
print(tokens)

['This', 'value', 'is', 'also', 'called', 'cut-off', 'in', 'the', 'literature', '.', 'If', 'float', ',', 'the', 'parameter', 'represents', 'a', 'proportion', 'of', 'documents', ',', 'integer', 'absolute', 'counts', '.']
['This', 'value', 'is', 'also', 'called', 'cut', '-', 'off', 'in', 'the', 'literature', '.', 'If', 'float', ',', 'the', 'parameter', 'represents', 'a', 'proportion', 'of', 'documents', ',', 'integer', 'absolute', 'counts', '.']


In [77]:
df = pd.DataFrame(index = tokens)
df['porter_stemmer'] = [porter_stemmer.stem(t) for t in tokens]
df['lancaster_stemmer'] = [lancaster_stemmer.stem(t) for t in tokens]
df['snowball_stemmer'] = [snowball_stemmer.stem(t) for t in tokens]
df['wordnet_lemmatizer'] = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
df

Unnamed: 0,porter_stemmer,lancaster_stemmer,snowball_stemmer,wordnet_lemmatizer
This,Thi,thi,this,This
value,valu,valu,valu,value
is,is,is,is,is
also,also,also,also,also
called,call,cal,call,called
cut,cut,cut,cut,cut
-,-,-,-,-
off,off,off,off,off
in,in,in,in,in
the,the,the,the,the


In [83]:
count = 15
num = len(stops)//count
for i in range(num):
    print(", ".join(stops[i*num : (i+1)*num]))

i, me, my, myself, we, our, ours, ourselves, you, your
yours, yourself, yourselves, he, him, his, himself, she, her, hers
herself, it, its, itself, they, them, their, theirs, themselves, what
which, who, whom, this, that, these, those, am, is, are
was, were, be, been, being, have, has, had, having, do
does, did, doing, a, an, the, and, but, if, or
because, as, until, while, of, at, by, for, with, about
against, between, into, through, during, before, after, above, below, to
from, up, down, in, out, on, off, over, under, again
further, then, once, here, there, when, where, why, how, all


In [84]:
df = pd.DataFrame(index = [t for t in tokens if t not in stops and t not in punctuation])
df['porter_stemmer'] = [porter_stemmer.stem(t.lower()) for t in tokens if t not in stops and t not in punctuation]
df['lancaster_stemmer'] = [lancaster_stemmer.stem(t.lower()) for t in tokens if t not in stops and t not in punctuation]
df['snowball_stemmer'] = [snowball_stemmer.stem(t.lower()) for t in tokens if t not in stops and t not in punctuation]
df['wordnet_lemmatizer'] = [wordnet_lemmatizer.lemmatize(t.lower()) for t in tokens if t not in stops and t not in punctuation]
df

Unnamed: 0,porter_stemmer,lancaster_stemmer,snowball_stemmer,wordnet_lemmatizer
This,thi,thi,this,this
value,valu,valu,valu,value
also,also,also,also,also
called,call,cal,call,called
cut,cut,cut,cut,cut
literature,literatur,lit,literatur,literature
If,if,if,if,if
float,float,flo,float,float
parameter,paramet,paramet,paramet,parameter
represents,repres,repres,repres,represents


In [89]:
df_tag = pd.DataFrame(index = tokens)
df_tag['default'] = [tag for term, tag in nltk.pos_tag(tokens)]
df_tag['universal'] = [tag for term, tag in nltk.pos_tag(tokens, tagset='universal')]
df_tag

Unnamed: 0,default,universal
This,DT,DET
value,NN,NOUN
is,VBZ,VERB
also,RB,ADV
called,VBN,VERB
cut,VBN,VERB
-,:,.
off,RB,ADV
in,IN,ADP
the,DT,DET
