In [1]:
#importing libraries

import nltk
import re    #regular expression
import pandas as pd
# nltk.download("punkt")     # used for Tokenization
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')                  

from nltk.corpus import stopwords  #used for stopwords
from nltk.stem.porter import PorterStemmer # used for stemming
from nltk.stem.wordnet import WordNetLemmatizer # Used for Lemmatization
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
## INPUT TEXT

# Two-sentence sample that the tokenization and clean-up cells below operate on.
text = (
    "Natural Language Processing is an exciting area. "
    "Huge budget have been allocated for this."
)

### 1. Tokenization

In [3]:
# Show the sample text split first into sentences, then into word/punctuation tokens.
for tokenize in (sent_tokenize, word_tokenize):
    print(tokenize(text))

['Natural Language Processing is an exciting area.', 'Huge budget have been allocated for this.']
['Natural', 'Language', 'Processing', 'is', 'an', 'exciting', 'area', '.', 'Huge', 'budget', 'have', 'been', 'allocated', 'for', 'this', '.']


#### Lower case conversion and punctuation removal

In [4]:
# Lowercase the text, then replace every character that is not a letter or digit
# with a space so punctuation cannot stay attached to a word.
text = re.compile(r"[^a-zA-Z0-9]").sub(" ", text.lower())
# Splitting on whitespace is used here as an alternative to word_tokenize.
words = text.split()
print(words)


['natural', 'language', 'processing', 'is', 'an', 'exciting', 'area', 'huge', 'budget', 'have', 'been', 'allocated', 'for', 'this']


### 2. Removing stopwords

In [5]:
# Print the English stop-word list that ships with NLTK's stopwords corpus.
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
# Keep the English stop-word list in a variable so the filtering cell can reuse it.
stopw = stopwords.words("english")

In [7]:
# Explicit-loop form of the stop-word filter, kept for reference only (not executed);
# the list comprehension in the following cell applies the same filter.
# final_words = []
# for w in words:
#     if w not in stopw:
#         final_words.append(w)

# print(final_words)

In [8]:
# Keep only the tokens that are not in the stop-word list; `words` is rebound to the result.
words = list(filter(lambda token: token not in stopw, words))
words

['natural',
 'language',
 'processing',
 'exciting',
 'area',
 'huge',
 'budget',
 'allocated']

### 3. Stemming

In [9]:
# Create a Porter stemmer and stem a single word as a quick demonstration.
stemmer = PorterStemmer()
stemmer.stem('allocated')

'alloc'

In [10]:
# Stem every remaining token.
# A single PorterStemmer is created and its bound `stem` method is mapped over the
# tokens, instead of constructing a new stemmer object for each word.
stemmed = list(map(PorterStemmer().stem, words))
stemmed

['natur', 'languag', 'process', 'excit', 'area', 'huge', 'budget', 'alloc']

### 4. Lemmatization

In [11]:
# Create a WordNet lemmatizer and lemmatize a single word as a quick demonstration.
# NOTE(review): no part-of-speech argument is passed and the displayed result is the
# unchanged word 'allocated' — presumably the word is treated as a noun by default; confirm.
lemm = WordNetLemmatizer()
lemm.lemmatize('allocated')

'allocated'

In [12]:
# Lemmatize every remaining token.
# A single WordNetLemmatizer is created and its bound `lemmatize` method is mapped over
# the tokens, instead of constructing a new lemmatizer object for each word.
lemmatized = list(map(WordNetLemmatizer().lemmatize, words))
lemmatized

['natural',
 'language',
 'processing',
 'exciting',
 'area',
 'huge',
 'budget',
 'allocated']

### EXECUTING IN A SINGLE CELL


In [13]:



# Complete clean-up pipeline in one cell:
# normalise -> tokenize -> remove stop words -> lemmatize.
text="Natural Language Processing is an exciting area. Huge budget have been allocated for this."
text = re.sub(r"[^a-zA-Z0-9]"," ",text.lower())  # lowercase, then replace non-alphanumerics with spaces
words = text.split()  # whitespace tokenization

# Fetch the stop-word list once, as a set, rather than calling stopwords.words()
# again for every token inside the comprehension.
english_stopwords = set(stopwords.words("english"))
stop_rem_words = [w for w in words if w not in english_stopwords]   # stop words removal

# One lemmatizer instance is shared by all tokens.
lemmatizer = WordNetLemmatizer()
final_words = [lemmatizer.lemmatize(w) for w in stop_rem_words]      # lemmatization
final_words




['natural',
 'language',
 'processing',
 'exciting',
 'area',
 'huge',
 'budget',
 'allocated']

### Part-of-speech (POS) tagging

In [14]:
# One-time setup, left commented out: presumably fetches the tagger model that
# nltk.pos_tag needs in the next cell — uncomment on a fresh environment.
# nltk.download('averaged_perceptron_tagger')

In [15]:
text="Natural Language Processing is an exciting area. Huge budget have been allocated for this."

# Fetch the stop-word list once, as a set, instead of calling stopwords.words()
# for every token of every sentence.
english_stopwords = set(stopwords.words("english"))

tokenized = sent_tokenize(text)
for i in tokenized:
    wordList = word_tokenize(i)
    # word_tokenize keeps the original casing while the stop-word list is lowercase,
    # so compare the lowercased token; otherwise a capitalised stop word
    # (e.g. a sentence-initial "This") would not be removed.
    wordList = [w for w in wordList if w.lower() not in english_stopwords]
    # Tag what is left of the sentence and print the (token, tag) pairs.
    tagged = nltk.pos_tag(wordList)
    print(tagged)

[('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('exciting', 'VBG'), ('area', 'NN'), ('.', '.')]
[('Huge', 'NNP'), ('budget', 'NN'), ('allocated', 'VBD'), ('.', '.')]


### Bag of words

In [16]:
# Three short documents that make up the bag-of-words corpus.
doc1, doc2, doc3 = (
    'Game of Thrones is an amazing tv series!.',
    'Game of Thrones is the best tv series!.',
    'Game of Thrones is so great',
)

In [17]:
# Manual tokenization of each document: lowercase, replace non-alphanumerics with
# spaces, then split on whitespace.
non_alnum = re.compile(r'[^a-zA-Z0-9]')
l_doc1, l_doc2, l_doc3 = [
    non_alnum.sub(' ', document.lower()).split()
    for document in (doc1, doc2, doc3)
]

In [18]:
# Inspect the token list produced for the first document.
l_doc1

['game', 'of', 'thrones', 'is', 'an', 'amazing', 'tv', 'series']

In [19]:
# Unigram bag of words over the three raw documents, with English stop words removed.
# CountVectorizer is already imported in the notebook's first cell, so the
# redundant re-import has been dropped from this cell.
vectorizer = CountVectorizer(stop_words = 'english')
x = vectorizer.fit_transform([doc1, doc2, doc3])
x

<3x7 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [20]:
# Convert the sparse count matrix to a dense array so the per-document counts are visible.
x.toarray()

array([[1, 0, 1, 0, 1, 1, 1],
       [0, 1, 1, 0, 1, 1, 1],
       [0, 0, 1, 1, 0, 1, 0]], dtype=int64)

In [21]:
# Terms learned by the vectorizer, in the same order as the matrix columns.
vectorizer.get_feature_names_out()

array(['amazing', 'best', 'game', 'great', 'series', 'thrones', 'tv'],
      dtype=object)

In [22]:
# Put the dense counts in a DataFrame whose columns are labelled with the vocabulary terms.
bow_terms = vectorizer.get_feature_names_out()
dfBoW = pd.DataFrame(x.toarray(), columns=bow_terms)
dfBoW

Unnamed: 0,amazing,best,game,great,series,thrones,tv
0,1,0,1,0,1,1,1
1,0,1,1,0,1,1,1
2,0,0,1,1,0,1,0


In [23]:
# Mapping from each term to its column index in the count matrix.
vectorizer.vocabulary_

{'game': 2,
 'thrones': 5,
 'amazing': 0,
 'tv': 6,
 'series': 4,
 'best': 1,
 'great': 3}

In [24]:
# Unigrams + bigrams: ngram_range=(1,2) adds two-word terms to the vocabulary.
# CountVectorizer is already imported in the notebook's first cell, so the
# redundant re-import has been dropped from this cell.
vectorizer = CountVectorizer(stop_words = 'english',ngram_range=(1,2))
x = vectorizer.fit_transform([doc1, doc2, doc3])

# One row per document, one column per unigram/bigram term.
dfBoW_bigram = pd.DataFrame(x.toarray(),columns = vectorizer.get_feature_names_out())
dfBoW_bigram

Unnamed: 0,amazing,amazing tv,best,best tv,game,game thrones,great,series,thrones,thrones amazing,thrones best,thrones great,tv,tv series
0,1,1,0,0,1,1,0,1,1,1,0,0,1,1
1,0,0,1,1,1,1,0,1,1,0,1,0,1,1
2,0,0,0,0,1,1,1,0,1,0,0,1,0,0


In [25]:
# Term-to-column-index mapping for the unigram + bigram vectorizer.
vectorizer.vocabulary_

{'game': 4,
 'thrones': 8,
 'amazing': 0,
 'tv': 12,
 'series': 7,
 'game thrones': 5,
 'thrones amazing': 9,
 'amazing tv': 1,
 'tv series': 13,
 'best': 2,
 'thrones best': 10,
 'best tv': 3,
 'great': 6,
 'thrones great': 11}

In [26]:
# Bigrams only: ngram_range=(2,2) keeps two-word terms and drops single words.
# CountVectorizer is already imported in the notebook's first cell, so the
# redundant re-import has been dropped from this cell.
vectorizer = CountVectorizer(stop_words = 'english',ngram_range=(2,2))
x = vectorizer.fit_transform([doc1, doc2, doc3])

# One row per document, one column per bigram term.
dfBoW_only_bigram = pd.DataFrame(x.toarray(),columns = vectorizer.get_feature_names_out())
dfBoW_only_bigram


Unnamed: 0,amazing tv,best tv,game thrones,thrones amazing,thrones best,thrones great,tv series
0,1,0,1,1,0,0,1
1,0,1,1,0,1,0,1
2,0,0,1,0,0,1,0


In [27]:
# Term-to-column-index mapping for the bigram-only vectorizer.
vectorizer.vocabulary_

{'game thrones': 2,
 'thrones amazing': 3,
 'amazing tv': 0,
 'tv series': 6,
 'thrones best': 4,
 'best tv': 1,
 'thrones great': 5}

### TF-IDF

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
# Two short documents used for the TF-IDF example.
d1, d2 = (
    'Petrol cars are cheaper than diesel cars',
    'diesel is cheaper than petrol',
)

In [30]:
# Collect the two documents into the list that the vectorizer will be fitted on.
corpus = [d1,d2]

In [34]:
# TF-IDF over 1- to 3-word terms with English stop words removed;
# max_features=10 limits the vocabulary to ten terms.
tfvec = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1,3),
    max_features=10,
)
tfvec_fit = tfvec.fit_transform(corpus)
print(f'Feature names found : {tfvec.get_feature_names_out()}')

Feature names found : ['cars' 'cars cheaper' 'cars cheaper diesel' 'cheaper' 'cheaper diesel'
 'cheaper diesel cars' 'cheaper petrol' 'diesel' 'diesel cars' 'petrol']


In [37]:
# Show the TF-IDF weights as a table: one row per document, one column per term.
tfidf_columns = tfvec.get_feature_names_out()
df_tfIdf = pd.DataFrame(tfvec_fit.toarray(), columns=tfidf_columns)
df_tfIdf

Unnamed: 0,cars,cars cheaper,cars cheaper diesel,cheaper,cheaper diesel,cheaper diesel cars,cheaper petrol,diesel,diesel cars,petrol
0,0.616664,0.308332,0.308332,0.219381,0.308332,0.308332,0.0,0.219381,0.308332,0.219381
1,0.0,0.0,0.0,0.448321,0.0,0.0,0.630099,0.448321,0.0,0.448321


### Cosine similarity

In [39]:
import numpy as np
from numpy.linalg import norm


In [54]:
# Two short reviews to compare; count vectors are built with English stop words removed.
d1, d2 = 'the food is good and great', 'the food is not good'

vectorizer = CountVectorizer(stop_words='english')
x = vectorizer.fit_transform([d1, d2])
count_terms = vectorizer.get_feature_names_out()
dfBoW = pd.DataFrame(x.toarray(), columns=count_terms)
dfBoW

Unnamed: 0,food,good,great
0,1,1,1
1,1,1,0


In [57]:
# Take the two document vectors directly from the fitted count matrix instead of
# re-typing the displayed values by hand, so they cannot drift from the data.
# .tolist() keeps them as plain Python lists, as before.
d1_count, d2_count = x.toarray().tolist()

In [58]:
# Cosine similarity: dot product of the two vectors divided by the product of their norms.
dot_product = np.dot(d1_count, d2_count)
norm_product = norm(d1_count) * norm(d2_count)
cosine = dot_product / norm_product
print(f'cosine similarity = {cosine}')

cosine similarity = 0.8164965809277259


In [66]:
# Same two reviews as before, this time vectorized with TF-IDF weights instead of raw counts.
d1, d2 = 'the food is good and great', 'the food is not good'

vectorizer = TfidfVectorizer(stop_words='english')
x = vectorizer.fit_transform([d1, d2])
tfidf_terms = vectorizer.get_feature_names_out()
dftfidf = pd.DataFrame(x.toarray(), columns=tfidf_terms)
dftfidf

Unnamed: 0,food,good,great
0,0.501549,0.501549,0.704909
1,0.707107,0.707107,0.0


In [67]:
# Take both document vectors directly from the fitted TF-IDF matrix instead of
# re-typing the displayed values, which are rounded to six decimals.
# The *_count names are kept so the cosine cell that follows works unchanged;
# .tolist() keeps them as plain Python lists, as before.
d1_count, d2_count = x.toarray().tolist()

In [68]:
# Cosine similarity of the two TF-IDF vectors: dot product over the product of the norms.
numerator = np.dot(d1_count, d2_count)
denominator = norm(d1_count) * norm(d2_count)
cosine = numerator / denominator
print(f'cosine similarity = {cosine}')

cosine similarity = 0.7092975763535904
