In [3]:
import nltk
import spacy
import re #regular expression

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [6]:
# Sample sentence used as the running example; the next cell lower-cases it
# and strips punctuation.
text = "The cute cats are playing in the garden!"

### Text Cleaning

In [7]:
# Normalise case first, then drop every character that is not a lowercase
# ASCII letter or whitespace (punctuation, digits, symbols).
text = re.sub(r"[^a-z\s]", "", text.lower())

print(text)

the cute cats are playing in the garden


### Tokenization

In [8]:
# Split the cleaned sentence into word tokens (English tokenizer rules,
# which is word_tokenize's default language).
tokens = word_tokenize(text, language="english")
print(tokens)

['the', 'cute', 'cats', 'are', 'playing', 'in', 'the', 'garden']


### Stop Words Removal

In [12]:
# A set gives constant-time membership checks for the filter below.
stop_words = set(stopwords.words("english"))

# Keep only the tokens that are not English stop words, preserving order.
filtered_tokens = [
    token
    for token in tokens
    if token not in stop_words
]
print(filtered_tokens)

['cute', 'cats', 'playing', 'garden']


### Stemming: finding the root words

In [15]:
# Three rule-based stemmers, compared side by side on the same tokens.
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer(language="english")

# One line of output per stemmer: Porter, Lancaster, Snowball.
print(list(map(porter.stem, filtered_tokens)))
print(list(map(lancaster.stem, filtered_tokens)))
print(list(map(snowball.stem, filtered_tokens)))

['cute', 'cat', 'play', 'garden']
['cut', 'cat', 'play', 'gard']
['cute', 'cat', 'play', 'garden']


### Lemmatization: finding the grammatical root word (more accurate than stemming)

In [17]:
lemmatizer = WordNetLemmatizer()

# No POS is passed here, so every token is lemmatized with the lemmatizer's
# default part of speech.
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_tokens))

print(lemmatized_words)

['cute', 'cat', 'playing', 'garden']


### Lemmatization using POS

In [21]:
# Map the first letter of a Penn Treebank tag (as produced by pos_tag) to the
# POS code WordNetLemmatizer.lemmatize expects. Previously only verbs ("V*")
# were handled, so adjectives ("J*") and adverbs ("R*") were silently
# lemmatized as nouns and left unreduced.
PENN_TO_WORDNET_POS = {
    "J": "a",  # adjective
    "V": "v",  # verb
    "N": "n",  # noun
    "R": "r",  # adverb
}

pos_tags = pos_tag(filtered_tokens)

# Any tag outside the map (determiners, prepositions, ...) falls back to "n",
# which is what lemmatize() uses when no POS is supplied.
lemmatized = [
    lemmatizer.lemmatize(word, pos=PENN_TO_WORDNET_POS.get(tag[:1], "n"))
    for word, tag in pos_tags
]

In [19]:
# Show the POS-aware lemmas built from `pos_tags`.
print(lemmatized)

['cute', 'cat', 'play', 'garden']


### POS tagging (Part of Speech)

In [22]:
# Tag the full token list (stop words included) with part-of-speech labels;
# the result is a list of (word, tag) pairs.
pos = nltk.pos_tag(tokens)
print(pos)

[('the', 'DT'), ('cute', 'NN'), ('cats', 'NNS'), ('are', 'VBP'), ('playing', 'VBG'), ('in', 'IN'), ('the', 'DT'), ('garden', 'NN')]


### Chunking (Noun Phrase)

In [28]:
# Noun-phrase pattern over POS tags:
#   <DT>?    an optional determiner
#   <JJ>*    zero or more adjectives
#   <NN.*>+  one or more tags starting with NN
grammar = "NP: {<DT>?<JJ>*<NN.*>+}"

chunk_parser = nltk.RegexpParser(grammar)
tree = chunk_parser.parse(pos)  # group matching (word, tag) runs into NP subtrees

print(tree)

(S
  (NP the/DT cute/NN cats/NNS)
  are/VBP
  playing/VBG
  in/IN
  (NP the/DT garden/NN))


### Named Entity Recognition (NER)

In [30]:
# Run NLTK's named-entity chunker over the tagged tokens. binary=False is the
# default and keeps typed entity labels rather than a single NE label.
ner_tree = ne_chunk(pos, binary=False)
print(ner_tree)

(S
  the/DT
  cute/NN
  cats/NNS
  are/VBP
  playing/VBG
  in/IN
  the/DT
  garden/NN)


In [32]:
# Example 2: a sentence with capitalised proper nouns, kept in its original
# case (no lower-casing or punctuation stripping is applied here).
text2 = "Virat Kohli plays cricket for India"

# Same pipeline as above: tokenize -> POS-tag -> named-entity chunk.
tokens2 = word_tokenize(text2, language="english")
pos2 = pos_tag(tokens2)
ner = ne_chunk(pos2, binary=False)

print(ner)

(S
  (PERSON Virat/NNP)
  (ORGANIZATION Kohli/NNP)
  plays/VBZ
  cricket/NN
  for/IN
  (GPE India/NNP))


### Bag of Words (BoW)

In [33]:
# Three tiny documents; also reused by the TF-IDF cell.
corpus = ["cute cute cat", "cute dog", "smart dog"]

# Learn the vocabulary, then encode each document as raw term counts.
bow = CountVectorizer()
bow.fit(corpus)
x_bow = bow.transform(corpus)

print(bow.get_feature_names_out())  # vocabulary, in column order
print(x_bow.toarray())              # one row per document, one column per term

['cat' 'cute' 'dog' 'smart']
[[1 2 0 0]
 [0 1 1 0]
 [0 0 1 1]]


### TF-IDF

In [34]:
# Same corpus as the Bag-of-Words cell, but weighted by TF-IDF instead of
# raw counts.
tfidf = TfidfVectorizer()
tfidf.fit(corpus)
X_tfidf = tfidf.transform(corpus)

print(tfidf.get_feature_names_out())  # vocabulary, in column order
print(X_tfidf.toarray())              # one row per document

['cat' 'cute' 'dog' 'smart']
[[0.54935123 0.83559154 0.         0.        ]
 [0.         0.70710678 0.70710678 0.        ]
 [0.         0.         0.60534851 0.79596054]]


### Word2Vec

In [35]:
from gensim.models import Word2Vec

# Toy training corpus: each inner list is one already-tokenized sentence.
sentences = [
    ["cute", "cat"],
    ["cute", "dog"],
    ["smart", "cat"]
]

# Make the run reproducible under Restart & Run All:
#   * seed is pinned explicitly (1 is gensim's documented default), and
#   * workers=1 removes the thread-scheduling jitter that multi-threaded
#     training introduces.
# NOTE(review): the gensim docs also mention PYTHONHASHSEED for
# reproducibility across interpreter launches -- confirm if exact
# bit-for-bit output matters.
model = Word2Vec(
    sentences,
    vector_size=50,   # dimensionality of each word vector
    window=3,         # context window size
    min_count=1,      # keep every word, even those seen once
    seed=1,
    workers=1,
)

# The corpus is only three two-word sentences, so treat the vector and the
# similarity score as an API illustration, not a meaningful measurement.
print(model.wv["cat"])
print(model.wv.similarity("cat", "dog"))

[-1.0724545e-03  4.7286271e-04  1.0206699e-02  1.8018546e-02
 -1.8605899e-02 -1.4233618e-02  1.2917745e-02  1.7945977e-02
 -1.0030856e-02 -7.5267432e-03  1.4761009e-02 -3.0669428e-03
 -9.0732267e-03  1.3108104e-02 -9.7203208e-03 -3.6320353e-03
  5.7531595e-03  1.9837476e-03 -1.6570430e-02 -1.8897636e-02
  1.4623532e-02  1.0140524e-02  1.3515387e-02  1.5257311e-03
  1.2701781e-02 -6.8107317e-03 -1.8928028e-03  1.1537147e-02
 -1.5043275e-02 -7.8722071e-03 -1.5023164e-02 -1.8600845e-03
  1.9076237e-02 -1.4638334e-02 -4.6675373e-03 -3.8754821e-03
  1.6154874e-02 -1.1861792e-02  9.0324880e-05 -9.5074680e-03
 -1.9207101e-02  1.0014586e-02 -1.7519170e-02 -8.7836506e-03
 -7.0199967e-05 -5.9236289e-04 -1.5322480e-02  1.9229487e-02
  9.9641159e-03  1.8466286e-02]
-0.0144752655


### spaCy