In [1]:
import nltk
import re

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/anuj/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /Users/anuj/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/anuj/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/anuj/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt_tab to /Users/anuj/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## **1.Sentence Tokenization And Word Tokenization** ##

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize
text = "Hello my name is anuj. And i'm currently pursuig BE Computer Degree from MESWCOE."
words = word_tokenize(text)
sentences = sent_tokenize(text)

print("Word Tokenization:", words)
print("Sentence Tokenization:", sentences)

Word Tokenization: ['Hello', 'my', 'name', 'is', 'anuj', '.', 'And', 'i', "'m", 'currently', 'pursuig', 'BE', 'Computer', 'Degree', 'from', 'MESWCOE', '.']
Sentence Tokenization: ['Hello my name is anuj.', "And i'm currently pursuig BE Computer Degree from MESWCOE."]


## **2.Stop Words** ##
- Contain a common list of words
- With the help of this list we can remove the unimportant words(stop words) from the sentences

In [50]:
from nltk.corpus import stopwords
import re

# Load the common words and store it on the 'stop_words' variable
stop_words = set(stopwords.words("english"))
# print("Stop Words:", stop_words)

text = "Hello my name is anuj. And i'm currently pursuing BE Computer Degree from MESWCOE college"

# Remove anything that's not a letter and replaces it with a space
text = re.sub('[^a-zA-Z]', ' ', text)

words = word_tokenize(text)
filtered_text = []

for word in words:
    if word not in stop_words:
        filtered_text.append(word)

print("Tokenize Sentence:", words)
print("Filtered Sentence:", filtered_text)

Tokenize Sentence: ['Hello', 'my', 'name', 'is', 'anuj', 'And', 'i', 'm', 'currently', 'pursuing', 'BE', 'Computer', 'Degree', 'from', 'MESWCOE', 'college']
Filtered Sentence: ['Hello', 'name', 'anuj', 'And', 'currently', 'pursuing', 'BE', 'Computer', 'Degree', 'MESWCOE', 'college']


## **3.Stemming** ##
- Help to reduced words to their root form


In [30]:
from nltk.stem import PorterStemmer

words = ["Run", "Demured","Jumping"]
ps = PorterStemmer()

for word in words:
    rootWord = ps.stem(word)
    print(rootWord)

run
demur
jump


## **4.Lemmatization** ##

In [46]:
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)

print(text)

for w in tokenization:
    print("Lemma for {} is {} ".format(w, wordnet_lemmatizer.lemmatize(w)))

studies studying cries cry
Lemma for studies is study 
Lemma for studying is studying 
Lemma for cries is cry 
Lemma for cry is cry 


## **5.POS Tagging** ##
- Label words with their Part Of Speech (POS)
  

In [42]:
import nltk
# uncomment this 
# nltk.download('averaged_perceptron_tagger_eng')

data = "Anuj is learning AI with the help of the book. But when he read the half book. The concept of the book baffled Him"
words = word_tokenize(data)

for word in words:
    print(nltk.pos_tag([word]))

[('Anuj', 'NN')]
[('is', 'VBZ')]
[('learning', 'VBG')]
[('AI', 'NN')]
[('with', 'IN')]
[('the', 'DT')]
[('help', 'NN')]
[('of', 'IN')]
[('the', 'DT')]
[('book.But', 'NN')]
[('when', 'WRB')]
[('he', 'PRP')]
[('read', 'NN')]
[('the', 'DT')]
[('half', 'NN')]
[('book', 'NN')]
[('.', '.')]
[('The', 'DT')]
[('concept', 'NN')]
[('of', 'IN')]
[('the', 'DT')]
[('book', 'NN')]
[('baffled', 'VBN')]
[('Him', 'NN')]


## **6.Term Frequency(TF)** ##

In [52]:
import math

docA = "WADIA COE is one of the best college in pune"
docB = "WADIA COE is one of the best"

# Split into words
wordsA = docA.split()
wordsB = docB.split()

# Unique words from both
uniqueWords = set(wordsA + wordsB)
print("Unique words:", uniqueWords)

# Create word count dictionary
def makeDict(words):
    word_count = dict.fromkeys(words, 0)  # use uniqueWords here
    for word in words:
        word_count[word] += 1
    return word_count

# TF calculation
def compute_TF(word_count, words):
    tf = {}
    total_words = len(words)
    for word, count in word_count.items():
        tf[word] = count / total_words
    return tf

# Calculate
numOfWordsA = makeDict(wordsA)
numOfWordsB = makeDict(wordsB)

tfA = compute_TF(numOfWordsA, wordsA)
tfB = compute_TF(numOfWordsB, wordsB)

# Output
print("\nTF for Document A:\n", tfA)
print("\nTF for Document B:\n", tfB)


Unique words: {'college', 'pune', 'of', 'best', 'in', 'WADIA', 'one', 'is', 'COE', 'the'}

TF for Document A:
 {'WADIA': 0.1, 'COE': 0.1, 'is': 0.1, 'one': 0.1, 'of': 0.1, 'the': 0.1, 'best': 0.1, 'college': 0.1, 'in': 0.1, 'pune': 0.1}

TF for Document B:
 {'WADIA': 0.14285714285714285, 'COE': 0.14285714285714285, 'is': 0.14285714285714285, 'one': 0.14285714285714285, 'of': 0.14285714285714285, 'the': 0.14285714285714285, 'best': 0.14285714285714285}


## **7.IDF** ##

In [30]:
def computeIDF(documents):

  # Length of the document
  N = len(documents)

  # IDF dictionary
  idfDict = dict.fromkeys(documents[0].keys(), 0)

  for document in documents:

    for word, val in document.items():
      if val > 0:
        idfDict[word] += 1

  for word, val in idfDict.items():
    idfDict[word] = math.log(N / float(val))

  return idfDict


idf = computeIDF([numOfWordsA, numOfWordsB])
idf

{'WADIA': 0.0,
 'COE': 0.0,
 'is': 0.0,
 'one': 0.0,
 'of': 0.0,
 'the': 0.0,
 'best': 0.0,
 'college': 0.6931471805599453,
 'in': 0.6931471805599453,
 'pune': 0.6931471805599453}