## **Compute Levenshtein Distance between Text and Key Words for Spam Classification**

In [40]:
!pip install nltk
!pip install python-Levenshtein

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [41]:
from Levenshtein import distance as lev
import nltk
# Fetch the NLTK resources used later in the notebook: the tokenizer models
# behind word_tokenize and the English stop-word list.
nltk.download('punkt')
# NLTK is installed unpinned above; releases from 3.8.2 on load 'punkt_tab'
# instead of the pickled 'punkt' models, so request it too. On older releases
# this call just reports that the package is unknown and returns False.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Define Keywords and Corpus

In [49]:
# Keyword lists, one per task, used as comparison targets for the
# Levenshtein distance computation.

# Vocabulary shared by AST_Task2 and AST_Task3 (each task gets its own copy).
_vaccine_terms = [
    "covid", "vaccine", "production", "mrna", "manufacturing", "chain",
    "management", "operations", "rd", "research", "development",
    "pfizer", "moderna", "biontech", "lonza", "sanofi", "curevac",
    "csl", "luniabio", "knowledge", "gap",
]

# Vocabulary for AST_Task4.
_letter_terms = [
    "alpha", "sights", "mr", "ms", "dr", "prof", "sir", "madam",
    "alphasights", "dear", "concern", "yours", "truly", "faithfully",
    "sincerely", "regards", "project", "covid", "manufacture", "mrna",
    "vaccine", "consultation", "call", "chain", "management",
    "operations", "rd", "research",
]

keywords = {
    "AST_Task2": list(_vaccine_terms),
    "AST_Task3": list(_vaccine_terms),
    "AST_Task4": _letter_terms,
}

In [50]:
# Sample text to run the keyword comparison against.
# Adjacent string literals are concatenated into one string.
corpus = (
    'CountVectorizer is a great tool provided by the scikit-learn library in Python. '
    'It is used to transform a given text into a vector on the basis of the frequency (count) '
    'of each word that occurs in the entire text. '
    'This is helpful when we have multiple such texts, and we wish to convert each word '
    'in each text into vectors (for using in further text analysis). '
    'Let us consider a few sample texts from a document (each as a list element):'
)

### Preprocessing - Remove Punctuation and Stop Words, then Stem

In [51]:
import string
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# Convert the text to lower case
corpus = corpus.lower()

# Delete every character listed in string.punctuation in a single pass
corpus_alpha = corpus.translate(str.maketrans("", "", string.punctuation))

# Split the cleaned text into word tokens
words = word_tokenize(corpus_alpha)

# Drop English stop words
stop_words = set(stopwords.words('english'))
filteredWords = [token for token in words if token not in stop_words]

# Reduce each remaining token to its Porter stem
# (optional step - results can be compared with and without stemming)
stemmer = PorterStemmer()
stemmedWords = list(map(stemmer.stem, filteredWords))

In [52]:
# Show both token lists, each preceded by its label
for label, tokens in (("FilteredWords", filteredWords),
                      ("StemmedWords", stemmedWords)):
    print(label)
    print(tokens)

FilteredWords
['countvectorizer', 'great', 'tool', 'provided', 'scikitlearn', 'library', 'python', 'used', 'transform', 'given', 'text', 'vector', 'basis', 'frequency', 'count', 'word', 'occurs', 'entire', 'text', 'helpful', 'multiple', 'texts', 'wish', 'convert', 'word', 'text', 'vectors', 'using', 'text', 'analysis', 'let', 'us', 'consider', 'sample', 'texts', 'document', 'list', 'element']
StemmedWords
['countvector', 'great', 'tool', 'provid', 'scikitlearn', 'librari', 'python', 'use', 'transform', 'given', 'text', 'vector', 'basi', 'frequenc', 'count', 'word', 'occur', 'entir', 'text', 'help', 'multipl', 'text', 'wish', 'convert', 'word', 'text', 'vector', 'use', 'text', 'analysi', 'let', 'us', 'consid', 'sampl', 'text', 'document', 'list', 'element']


## Generate Levenshtein Matrix

In [53]:
def levenshtein_matrix(text_words, key_words, max_counted=3):
    """Compute pairwise Levenshtein distances between text words and keywords.

    Parameters
    ----------
    text_words : sequence of str
        Words taken from the text; one matrix row per word.
    key_words : sequence of str
        Keywords to compare against; one matrix column per keyword.
    max_counted : int, default 3
        Distances from 0 up to and including this value are tallied.

    Returns
    -------
    matrix : numpy.ndarray
        Array of shape (len(text_words), len(key_words)) holding the distances.
    tally : dict
        Maps each distance in 0..max_counted to the number of
        word/keyword pairs found at exactly that distance.
    """
    matrix = np.zeros((len(text_words), len(key_words)))
    tally = {dist: 0 for dist in range(max_counted + 1)}
    for row, text_word in enumerate(text_words):
        for col, key_word in enumerate(key_words):
            lev_dist = lev(text_word, key_word)
            matrix[row, col] = lev_dist
            # Larger distances are stored in the matrix but not tallied.
            if lev_dist in tally:
                tally[lev_dist] += 1
    return matrix, tally


# Keyword list the text is compared against in this cell.
task_keywords = keywords['AST_Task2']

print("FilteredWords")
levMatrixFiltered, filtered_tally = levenshtein_matrix(filteredWords, task_keywords)
count0, count1, count2, count3 = (filtered_tally[dist] for dist in range(4))
print(levMatrixFiltered)
print(".......................................................................")

print("StemmedWords")
levMatrixStemmed, stemmed_tally = levenshtein_matrix(stemmedWords, task_keywords)
counts0, counts1, counts2, counts3 = (stemmed_tally[dist] for dist in range(4))
# Keep the original name bound to the stemmed matrix, as it was at the end of
# this cell before the two matrices were given separate names.
levMatrix = levMatrixStemmed
print(levMatrixStemmed)
print(".......................................................................")

# Summary of how many word/keyword pairs fall at each small edit distance
print("FilteredWords")
print("Edit Distance Count: 0: {}, 1: {}, 2: {}, 3: {}".format(count0, count1, count2, count3))
print("StemmedWords")
print("Edit Distance Count: 0: {}, 1: {}, 2: {}, 3: {}".format(counts0, counts1, counts2, counts3))

FilteredWords
[[11. 11. 12. 14. 10. 13. 13. 12. 14. 13. 12. 11. 12. 11. 12. 12. 11. 14.
  12. 13. 15.]
 [ 5.  7.  8.  3. 11.  5.  7.  7.  4.  6.  9.  6.  6.  7.  5.  6.  4.  5.
   7.  8.  3.]
 [ 4.  7.  8.  4. 12.  5. 10.  8.  4.  8. 10.  6.  6.  7.  4.  5.  7.  3.
   8.  7.  4.]
 [ 4.  6.  7.  7. 13.  7.  9.  7.  6.  8. 10.  5.  7.  7.  7.  7.  7.  8.
   8.  7.  8.]
 [ 9.  9. 10. 10. 11.  8. 10. 10. 10.  9. 11.  8.  9.  8. 10. 10.  9.  9.
  10.  9. 10.]
 [ 7.  7. 10.  6. 11.  6. 10.  8.  6.  6. 10.  6.  7.  7.  5.  7.  7.  7.
   6.  9.  6.]
 [ 6.  6.  6.  6. 11.  5.  9.  6.  6.  8.  9.  5.  6.  7.  6.  6.  7.  6.
   8.  9.  6.]
 [ 4.  7.  9.  4. 12.  5.  9.  9.  3.  6. 10.  5.  6.  7.  5.  6.  5.  3.
   7.  7.  4.]
 [ 9.  8.  9.  7. 10.  8.  9.  8.  8.  7. 10.  8.  9.  8.  8.  6.  9.  8.
   8.  9.  8.]
 [ 4.  6.  9.  5. 12.  4.  7.  9.  5.  7.  8.  4.  5.  6.  5.  6.  6.  5.
   7.  8.  4.]
 [ 5.  7.  9.  4. 12.  5.  8.  8.  4.  7.  9.  6.  6.  6.  5.  6.  6.  4.
   8.  8.  4.]
 [ 6.  