In [1]:
import nltk

### Tokenizer

In [2]:
from nltk.tokenize import word_tokenize
text="Hey, I am Hemant. I am from computer department"
print(word_tokenize(text))

['Hey', ',', 'I', 'am', 'Hemant', '.', 'I', 'am', 'from', 'computer', 'department']


In [4]:
from nltk.tokenize import sent_tokenize
text="Hey, I am Hemant. I am from computer department"
print(sent_tokenize(text))

['Hey, I am Hemant.', 'I am from computer department']


### Stemming

In [7]:
from nltk.stem import PorterStemmer
words=["wait","waiting","waited","waits"]
ps=PorterStemmer()
for i in words:
    rootword=ps.stem(i)
    print(rootword)

wait
wait
wait
wait


### Limmatisation

Limmatisation is better than stemming

#### Stemming code

In [9]:
import nltk
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
text="studies studying cries crying"
tokenization=nltk.word_tokenize(text)
for i in tokenization:
    print(ps.stem(i))

studi
studi
cri
cri


#### Lemmiatisation code

In [10]:
from nltk.stem import WordNetLemmatizer
wl=WordNetLemmatizer()
text="studies studying cries crying"
tokenization=nltk.word_tokenize(text)
for i in tokenization:
    print(wl.lemmatize(i))

study
studying
cry
cry


### Stop words

stop words : the, is, but, are .....

In [11]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

data="Hey, I am Hemant. I am from computer department"
stWords=set(stopwords.words('english'))
words = word_tokenize(data)
wordFiltered=[]

for i in words:
    if i not in stWords:
        wordFiltered.append(i)

print(wordFiltered)

['Hey', ',', 'I', 'Hemant', '.', 'I', 'computer', 'department']


In [13]:
print(len(stWords))
print(stWords)

179
{"shouldn't", 'we', 'here', 'most', 'have', 'after', 'do', 'while', 'himself', 'until', 'with', 'aren', "needn't", 'ma', 'y', 'that', 'their', 'then', 'up', 'where', 'very', 't', "mightn't", 'when', 'above', "doesn't", 'out', 'each', 'hasn', 'from', "hasn't", 'through', 'both', "won't", 'our', 'm', 'can', 'more', "you've", 're', "you'll", "should've", "wasn't", 'and', 'needn', 'mightn', 'has', 'how', 'yours', 'was', 'as', 'there', 'were', 'myself', 'off', 'those', 'not', 'been', "that'll", 'be', 'which', 'such', 'your', 'why', 'o', 'ourselves', "couldn't", 'couldn', 'me', "hadn't", 'a', 'wouldn', 'my', 'further', 'being', 'they', 'so', "you're", 'same', 'did', 'between', 'an', "haven't", 'haven', "mustn't", 'will', 'during', 'ain', 'her', 'below', 'only', "she's", 'hers', 'having', 'on', 'than', 'some', 'is', 'hadn', "aren't", 'doing', 'had', 'because', 'what', 'yourself', 'it', 'under', 'isn', 'them', 'its', 'shouldn', 'he', 'll', 'shan', 'this', 'for', 'other', 'by', 'into', 'now

### POS Tagging (Part Of Speech Tagging) or POST

It is a process of converting a sentence to forms â€“ list of words, list of tuples 
(where each tuple is having a form (word, tag)). The tag in case of is a part-of-speech tag, 
and signifies whether the word is a noun, adjective, verb, and so on.

In [14]:
from nltk.tag import DefaultTagger
tagging = DefaultTagger("NN")
tagging.tag(["Hello", "World"])

[('Hello', 'NN'), ('World', 'NN')]

In [16]:
from nltk.tag import DefaultTagger
tagging = DefaultTagger("NN")
tagging.tag_sents([["Hello", "World"], ["hey", "there"]])

[[('Hello', 'NN'), ('World', 'NN')], [('hey', 'NN'), ('there', 'NN')]]

In [18]:
from nltk.tag import untag
untag([('Hello', 'NN'), ('World', 'NN'), ('hey', 'NN'), ('there', 'NN')])

['Hello', 'World', 'hey', 'there']

### Term Frequency and Inverse Document Frequency(Tf-Idf)

term frequency:
t-->word
d-->document
tf(t,d) = count of t in d / number of words in d

document frequency:
df(t) = occurrence of t in documents

Inverse Document Frequency:
df(t) = N(t)
where
df(t) = Document frequency of a term t
N(t) = Number of documents containing the term t

idf(t) = N/ df(t) = N/N(t)

tf-idf(t, d) = tf(t, d) * idf(t)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# assign documents
d0 = 'Geeks for geeks'
d1 = 'Geeks'
d2 = 'r2j'

# merge documents into a single corpus
string = [d0, d1, d2]

# create object
tfidf = TfidfVectorizer()

# get tf-df values
result = tfidf.fit_transform(string)

# get idf values
print('\nidf values:')
for ele1, ele2 in zip(tfidf.get_feature_names(), tfidf.idf_):
    print(ele1, ':', ele2)


idf values:
for : 1.6931471805599454
geeks : 1.2876820724517808
r2j : 1.6931471805599454




In [20]:
# get indexing
print('\nWord indexes:')
print(tfidf.vocabulary_)
 
# display tf-idf values
print('\ntf-idf value:')
print(result)
 
# in matrix form
print('\ntf-idf values in matrix form:')
print(result.toarray())


Word indexes:
{'geeks': 1, 'for': 0, 'r2j': 2}

tf-idf value:
  (0, 0)	0.5493512310263033
  (0, 1)	0.8355915419449176
  (1, 1)	1.0
  (2, 2)	1.0

tf-idf values in matrix form:
[[0.54935123 0.83559154 0.        ]
 [0.         1.         0.        ]
 [0.         0.         1.        ]]
