In [30]:
# ! pip install nltk

In [31]:
# ! pip show nltk

In [32]:
import nltk
import re
import pandas as pd
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

In [33]:
text="Natural Language Processing is an exciting area. Huge budget have been allocated for this."

### Tokenization

In [34]:
# sent tokenize
print(sent_tokenize(text))

#word tokenize
print(word_tokenize(text))

['Natural Language Processing is an exciting area.', 'Huge budget have been allocated for this.']
['Natural', 'Language', 'Processing', 'is', 'an', 'exciting', 'area', '.', 'Huge', 'budget', 'have', 'been', 'allocated', 'for', 'this', '.']


### Lower Casing

In [35]:
text = re.sub(r"[^a-zA-Z0-9]"," ",text.lower())
words = text.split()
print(words)

['natural', 'language', 'processing', 'is', 'an', 'exciting', 'area', 'huge', 'budget', 'have', 'been', 'allocated', 'for', 'this']


### Stop Word Removal

In [36]:
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [37]:
final_words=[]
for w in words:
    if w not in stopwords.words("english"):
        final_words.append(w)
final_words

['natural',
 'language',
 'processing',
 'exciting',
 'area',
 'huge',
 'budget',
 'allocated']

In [38]:
words=[w for w in words if w not in stopwords.words("english")]
print(words)

['natural', 'language', 'processing', 'exciting', 'area', 'huge', 'budget', 'allocated']


### Stemming

In [39]:
stemmer = PorterStemmer()
stemmer.stem('huge')

'huge'

In [40]:
stemmed=[PorterStemmer().stem(w) for w in  words]
print(stemmed)

['natur', 'languag', 'process', 'excit', 'area', 'huge', 'budget', 'alloc']


### Lemmatization

In [41]:
# Reduce words to their root form
lemmed = [WordNetLemmatizer().lemmatize(w) for w in words]
print(lemmed)

['natural', 'language', 'processing', 'exciting', 'area', 'huge', 'budget', 'allocated']


### Executing In A Single Cell

In [42]:
final_text="Natural Language Processing is an exciting area. Huge budget have been allocated for this."
text = re.sub(r"[^a-zA-Z0-9]"," ",text.lower())
words = text.split()  
stop_rem_words = [ w  for w in words if w not in stopwords.words("english")]
final_words = [WordNetLemmatizer().lemmatize(w) for w in  stop_rem_words]
print(final_words)

['natural', 'language', 'processing', 'exciting', 'area', 'huge', 'budget', 'allocated']


### Parts Of Speech

In [43]:
text="Natural Language Processing is an exciting area. Huge budget have been allocated for this."
# nltk.download('averaged_perceptron_tagger')

In [44]:
txt = "Natural language processing is an exciting area.Huge budget have been allocated for this."
# sent_tokenize is one of instances of
# PunktSentenceTokenizer from the nltk.tokenize.punkt module
tokenized = sent_tokenize(txt)
for i in tokenized:
  # Word tokenizers is used to find the words
  # and punctuation in a string
  wordsList = nltk.word_tokenize(i)
  # removing stop words from wordList
  wordsList = [ w  for w in words if w not in stopwords.words("english")]
  # Using a Tagger. Which is part-of-speech
  # tagger or POS-tagger.
  tagged = nltk.pos_tag(wordsList)
  print(tagged)

[('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('exciting', 'VBG'), ('area', 'NN'), ('huge', 'JJ'), ('budget', 'NN'), ('allocated', 'VBD')]


## Bag Of Words

In [45]:
doc1 = 'Game of Thrones is an amazing tv series!'
doc2 = 'Game of Thrones is the best tv series!'
doc3 = 'Game of Thrones is so great'

In [46]:
l_doc1=re.sub(r"[^a-zA-Z0-9]"," ",doc1.lower()).split()
l_doc2=re.sub(r"[^a-zA-Z0-9]"," ",doc2.lower()).split()
l_doc3=re.sub(r"[^a-zA-Z0-9]"," ",doc3.lower()).split()

In [47]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,3))  #ngram=to retain the meaning of sentence 
x = vectorizer.fit_transform([doc1,doc2,doc3])
dfBow= pd.DataFrame(x.toarray(),columns=vectorizer.get_feature_names_out())
dfBow.head()

Unnamed: 0,amazing,amazing tv,amazing tv series,best,best tv,best tv series,game,game thrones,game thrones amazing,game thrones best,...,great,series,thrones,thrones amazing,thrones amazing tv,thrones best,thrones best tv,thrones great,tv,tv series
0,1,1,1,0,0,0,1,1,1,0,...,0,1,1,1,1,0,0,0,1,1
1,0,0,0,1,1,1,1,1,0,1,...,0,1,1,0,0,1,1,0,1,1
2,0,0,0,0,0,0,1,1,0,0,...,1,0,1,0,0,0,0,1,0,0


## TF-IDF

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
doc1="petrol cars are cheaper than diesel cars"
doc2="diesel is cheaper than petrol"
doc_corpus=[doc1,doc2]
print(doc_corpus)

['petrol cars are cheaper than diesel cars', 'diesel is cheaper than petrol']


In [59]:
tfVec=TfidfVectorizer(stop_words='english',ngram_range=(1,3),max_features=10)
matrix=tfVec.fit_transform(doc_corpus)
print("Feature Names==",tfVec.get_feature_names_out())

Feature Names== ['cars' 'cars cheaper' 'cars cheaper diesel' 'cheaper' 'cheaper diesel'
 'cheaper diesel cars' 'cheaper petrol' 'diesel' 'diesel cars' 'petrol']


In [60]:
dfBow= pd.DataFrame(matrix.toarray(),columns=tfVec.get_feature_names_out())
dfBow.head()

Unnamed: 0,cars,cars cheaper,cars cheaper diesel,cheaper,cheaper diesel,cheaper diesel cars,cheaper petrol,diesel,diesel cars,petrol
0,0.616664,0.308332,0.308332,0.219381,0.308332,0.308332,0.0,0.219381,0.308332,0.219381
1,0.0,0.0,0.0,0.448321,0.0,0.0,0.630099,0.448321,0.0,0.448321


In [61]:
print("Sparse Matrix n",matrix.shape,"n",matrix.toarray())

Sparse Matrix n (2, 10) n [[0.61666358 0.30833179 0.30833179 0.21938061 0.30833179 0.30833179
  0.         0.21938061 0.30833179 0.21938061]
 [0.         0.         0.         0.44832087 0.         0.
  0.63009934 0.44832087 0.         0.44832087]]


### Cosine SImilarity -Bag of Words

In [66]:
import numpy as np
from numpy.linalg import norm 

d1="The Food Is good and great"
d2="The Food is not good"

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')  #ngram=to retain the meaning of sentence 
x = vectorizer.fit_transform([d1,d2])
dfBow= pd.DataFrame(x.toarray(),columns=vectorizer.get_feature_names_out())
dfBow.head()

Unnamed: 0,food,good,great
0,1,1,1
1,1,1,0


In [69]:
d1_count=[1,1,1]
d2_count=[1,1,0]

cosine=np.dot(d1_count,d2_count)/(norm(d1_count)*norm(d2_count))
print(f"Cosine Similarity Is:{cosine}")

Cosine Similarity Is:0.8164965809277259


In [72]:
import numpy as np
from numpy.linalg import norm 

from sklearn.feature_extraction.text import TfidfVectorizer
d1="The Food Is good and great"
d2="The Food is not good"
doc_corpus=[d1,d2]

tfVec=TfidfVectorizer(stop_words='english',ngram_range=(1,1),max_features=10)
matrix=tfVec.fit_transform(doc_corpus)

dfBow= pd.DataFrame(matrix.toarray(),columns=tfVec.get_feature_names_out())
dfBow.head()

Unnamed: 0,food,good,great
0,0.501549,0.501549,0.704909
1,0.707107,0.707107,0.0


In [73]:
d1_count=[0.501549,0.501549,0.704909]
d2_count=[0.707107 ,0.707107,0.000000]

cosine=np.dot(d1_count,d2_count)/(norm(d1_count)*norm(d2_count))
print(f"Cosine Similarity Is:{cosine}")

Cosine Similarity Is:0.7092975763535904
