# Stemming in NLP

In [None]:
#!pip install nltk
#nltk.download('punkt')

In [1]:
import nltk
import warnings
warnings.filterwarnings('ignore')

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
word = ['change','changing','changes','changed']

In [4]:
word

['change', 'changing', 'changes', 'changed']

In [5]:
from nltk.stem import PorterStemmer

In [6]:
p = PorterStemmer()

In [7]:
# Stem every inflected form in `word`; the printed output shows all four
# collapse to the same Porter stem.
for w in word:
    print(p.stem(w))

chang
chang
chang
chang


In [8]:
# Print each original word next to its stem for a side-by-side comparison.
for w in word:
    print(w , p.stem(w))

change chang
changing chang
changes chang
changed chang


In [9]:
sen = 'The constant flux of life necessitates embracing change, whether its adapting to the changes around us or actively changing ourselves to meet new challenges.'

In [10]:
sen

'The constant flux of life necessitates embracing change, whether its adapting to the changes around us or actively changing ourselves to meet new challenges.'

In [11]:
from nltk.tokenize import word_tokenize

In [12]:
token = word_tokenize(sen)

In [13]:
token

['The',
 'constant',
 'flux',
 'of',
 'life',
 'necessitates',
 'embracing',
 'change',
 ',',
 'whether',
 'its',
 'adapting',
 'to',
 'the',
 'changes',
 'around',
 'us',
 'or',
 'actively',
 'changing',
 'ourselves',
 'to',
 'meet',
 'new',
 'challenges',
 '.']

In [None]:
#sen.split()

In [14]:
# Stem each token of the tokenized sentence. As the output shows, stems come
# back lower-cased and need not be dictionary words (e.g. 'necessit', 'ourselv').
for w in token:
    print(p.stem(w))

the
constant
flux
of
life
necessit
embrac
chang
,
whether
it
adapt
to
the
chang
around
us
or
activ
chang
ourselv
to
meet
new
challeng
.


In [19]:
 nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Lemmatization in NLP

In [20]:
from nltk.stem import WordNetLemmatizer

In [21]:
le = WordNetLemmatizer()

In [22]:
token

['The',
 'constant',
 'flux',
 'of',
 'life',
 'necessitates',
 'embracing',
 'change',
 ',',
 'whether',
 'its',
 'adapting',
 'to',
 'the',
 'changes',
 'around',
 'us',
 'or',
 'actively',
 'changing',
 'ourselves',
 'to',
 'meet',
 'new',
 'challenges',
 '.']

In [23]:
# Lemmatize each token without passing a part-of-speech hint. In the printed
# output only plural nouns change ('changes' -> 'change'), verb forms such as
# 'changing' are left as-is, and 'us' is reduced to 'u'.
# NOTE(review): presumably lemmatize() treats every word as a noun unless `pos`
# is supplied — confirm against the NLTK documentation.
for w in token:
    print(le.lemmatize(w))

The
constant
flux
of
life
necessitates
embracing
change
,
whether
it
adapting
to
the
change
around
u
or
actively
changing
ourselves
to
meet
new
challenge
.


# Tokenization in NLP

Python has several libraries and tools for tokenization and other NLP tasks. Here are a few examples using popular libraries:

# NLTK

NLTK (Natural Language Toolkit) is a widely used library for NLP tasks. To perform tokenization with NLTK, install it first by running `pip install nltk`. Here's an example of splitting a text into word tokens and into sentences using NLTK:

In [25]:
from nltk.tokenize import word_tokenize, sent_tokenize

sentence = "I'm from my home ctg. I am learning NLP. It is fascinating!"
word_tokens = word_tokenize(sentence)
sentence_tokens = sent_tokenize(sentence)

print(word_tokens)
print(sentence_tokens)


['I', "'m", 'from', 'my', 'home', 'ctg', '.', 'I', 'am', 'learning', 'NLP', '.', 'It', 'is', 'fascinating', '!']
["I'm from my home ctg.", 'I am learning NLP.', 'It is fascinating!']


# spaCy

spaCy is another powerful library for NLP. To install spaCy, run `pip install spacy` and then download the appropriate language model (for example, `python -m spacy download en_core_web_sm`). Here's an example of tokenization using spaCy:

In [29]:
!pip install spacy
# python -m spacy download en_core_web_sm    #-> install in conda



In [31]:
import spacy

spc = spacy.load('en_core_web_sm')  # Load the English language model

sentence = "I'm from my home ctg. I am learning NLP. It is fascinating!"
doc = spc(sentence)

word_tokens = [token.text for token in doc]

print(word_tokens)


['I', "'m", 'from', 'my', 'home', 'ctg', '.', 'I', 'am', 'learning', 'NLP', '.', 'It', 'is', 'fascinating', '!']


# Transformers

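Transformers is a library built by Hugging Face that provides state-of-the-art pre-trained models for NLP. It offers various functionalities, including tokenization. To install Transformers, run `pip install transformers`. Here's an example of tokenization using Transformers. Note that the `bert-base-uncased` tokenizer lower-cases the text and splits words missing from its vocabulary into subword pieces marked with `##`: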

In [32]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

sentence = "I'm from my home ctg. I am learning NLP. It is fascinating!"
tokens = tokenizer.tokenize(sentence)

print(tokens)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

['i', "'", 'm', 'from', 'my', 'home', 'ct', '##g', '.', 'i', 'am', 'learning', 'nl', '##p', '.', 'it', 'is', 'fascinating', '!']


# Named Entity Tokenization using NLTK

To extract named entities with NLTK (Natural Language Toolkit), use its named entity recognition (NER) functionality: tokenize the sentence, tag each token with its part of speech, and pass the tagged tokens to `ne_chunk`. Here's an example of how to extract named entity tokens from a sentence using NLTK:

In [33]:
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')  # Download the required resource (NER models)
nltk.download('words')  # Download the required resource (word corpus)
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [34]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

sentence = "I'm from aiQuest Intelligence. I am learning NLP. It is fascinating!, Hasan vai, my name is Joe"

tokens = word_tokenize(sentence) # Tokenize the sentence into words

pos_tags = pos_tag(tokens) # Perform part-of-speech tagging

ner_tags = ne_chunk(pos_tags) # Perform named entity recognition

# Children of the chunked result that expose a `label` attribute are the
# recognised entities; each one is flattened to its words joined by spaces.
named_entity_tokens = [
    ' '.join(leaf[0] for leaf in subtree)
    for subtree in ner_tags
    if hasattr(subtree, 'label')
]

print(named_entity_tokens)

['aiQuest Intelligence', 'NLP', 'Hasan', 'Joe']


In [35]:
sentence2 = "Shakil Lives in Germany"
tokens = word_tokenize(sentence2)
pos_tags = pos_tag(tokens)

In [36]:
pos_tags

[('Shakil', 'NNP'), ('Lives', 'VBZ'), ('in', 'IN'), ('Germany', 'NNP')]

# Text Vectorizer

In [38]:
import pandas as pd
# Read the labelled example sentences into a DataFrame. The absolute /content/
# path is environment-specific.
# NOTE(review): looks like a Colab upload location — adjust when running elsewhere.
df = pd.read_excel('/content/data-NLP.xlsx')

In [39]:
df

Unnamed: 0,text,class
0,"Hey, I love Bangladesh;",1
1,"Good afternoon, I am happy!",1
2,I live in Germany,1
3,Nice to meet you man-,1
4,You won an iPhone,0


# Text Processing

In [41]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [42]:
from nltk.corpus import stopwords

en_stopwords = set(stopwords.words('english'))
en_stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [43]:
stopwords.fileids()

['arabic',
 'azerbaijani',
 'basque',
 'bengali',
 'catalan',
 'chinese',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hebrew',
 'hinglish',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

In [44]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [45]:
len(string.punctuation)

32

In [46]:
li = [1,2,3,4,54]
[l for l in li ]

[1, 2, 3, 4, 54]

In [47]:
[l for l in li if l%2==0]

[2, 4, 54]

In [48]:
df

Unnamed: 0,text,class
0,"Hey, I love Bangladesh;",1
1,"Good afternoon, I am happy!",1
2,I live in Germany,1
3,Nice to meet you man-,1
4,You won an iPhone,0


In [49]:
def preprocess_text(text, stop_words=None):
    """Strip punctuation from ``text`` and drop stopwords, returning the kept tokens.

    Parameters
    ----------
    text : str
        Raw sentence. A missing value (``None`` or a float NaN, e.g. an empty
        spreadsheet cell) yields an empty list instead of an error.
    stop_words : collection of str, optional
        Lower-case words to discard. Defaults to the notebook-level
        ``en_stopwords`` set.

    Returns
    -------
    list of str
        Whitespace-delimited tokens in their original order and casing, with
        every ``string.punctuation`` character removed.

    Raises
    ------
    TypeError
        If ``text`` is neither a string nor a missing value (for example a
        token list produced by an earlier call to this function).
    """
    # NaN is the only float that differs from itself, so `text != text` detects
    # it without needing pandas/numpy inside the function.
    if text is None or (isinstance(text, float) and text != text):
        return []
    if not isinstance(text, str):
        raise TypeError(
            f"preprocess_text expects a string, got {type(text).__name__}"
        )
    if stop_words is None:
        stop_words = en_stopwords

    strip_punct = str.maketrans('', '', string.punctuation)

    tokens = []
    for raw in text.split():
        # Punctuation-free form: this is what ends up in the result.
        bare = raw.translate(strip_punct)
        if not bare:
            continue  # the token consisted only of punctuation
        # Form with only the surrounding punctuation trimmed, so an inner
        # apostrophe survives and contractions such as "don't" can still match
        # their stopword entry (after full stripping they would be "dont").
        core = raw.strip(string.punctuation)
        if core.lower() in stop_words or bare.lower() in stop_words:
            continue
        tokens.append(bare)
    return tokens

In [50]:
# Replace each raw sentence with its list of cleaned tokens. This overwrites the
# 'text' column in place, so re-running the cell would feed token lists (not
# strings) back into preprocess_text — reload df before running it again.
df['text'] = df['text'].apply(preprocess_text)

In [51]:
df['text']

0     [Hey, love, Bangladesh]
1    [Good, afternoon, happy]
2             [live, Germany]
3           [Nice, meet, man]
4                    [iPhone]
Name: text, dtype: object

In [52]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize every token in ``text`` and return them as one space-separated string.

    ``text`` is an iterable of word strings; each is passed through the
    notebook-level ``lemmatizer`` and the results are joined in order.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

In [53]:
# Turn each token list back into a single lemmatized string, again overwriting
# the 'text' column; the vectorizers further down are fitted on these strings.
df['text'] = df['text'].apply(lemmatize_text)
df.head()

Unnamed: 0,text,class
0,Hey love Bangladesh,1
1,Good afternoon happy,1
2,live Germany,1
3,Nice meet man,1
4,iPhone,0


# CountVectorizer

In [54]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [55]:
cv = CountVectorizer()

In [56]:
cv_x = cv.fit_transform(df['text'])
cv_x

<5x12 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [57]:
cv_x.toarray()

array([[0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
       [1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]])

In [58]:
cv_df = pd.DataFrame(cv_x.toarray())
cv_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0,1,0,0,0,1,0,0,1,0,0,0
1,1,0,0,1,1,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,1,1
4,0,0,0,0,0,0,1,0,0,0,0,0


In [59]:
cv.get_feature_names_out()

array(['afternoon', 'bangladesh', 'germany', 'good', 'happy', 'hey',
       'iphone', 'live', 'love', 'man', 'meet', 'nice'], dtype=object)

In [60]:
cv_df = pd.DataFrame(cv_x.toarray(), index=df['text'], columns=cv.get_feature_names_out())

In [None]:
cv_df

Unnamed: 0_level_0,afternoon,bangladesh,germany,good,happy,hey,iphone,live,love,man,meet,nice
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Hey love Bangladesh,0,1,0,0,0,1,0,0,1,0,0,0
Good afternoon happy,1,0,0,1,1,0,0,0,0,0,0,0
live Germany,0,0,1,0,0,0,0,1,0,0,0,0
Nice meet man,0,0,0,0,0,0,0,0,0,1,1,1
iPhone,0,0,0,0,0,0,1,0,0,0,0,0


# TfidfVectorizer

In [61]:
tf = TfidfVectorizer()

In [62]:
tf_z = tf.fit_transform(df['text'])

In [63]:
tf_z

<5x12 sparse matrix of type '<class 'numpy.float64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [64]:
cv_df = pd.DataFrame(tf_z.toarray(), index=df['text'], columns=tf.get_feature_names_out())

In [65]:
cv_df

Unnamed: 0_level_0,afternoon,bangladesh,germany,good,happy,hey,iphone,live,love,man,meet,nice
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Hey love Bangladesh,0.0,0.57735,0.0,0.0,0.0,0.57735,0.0,0.0,0.57735,0.0,0.0,0.0
Good afternoon happy,0.57735,0.0,0.0,0.57735,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0
live Germany,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0
Nice meet man,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735,0.57735
iPhone,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# Word2Vec

In [66]:
!pip install gensim



In [67]:
from gensim.models import Word2Vec, KeyedVectors

In [68]:
# Word2Vec expects a list of token lists, so tokenize every cleaned sentence.
text_vector = list(map(nltk.word_tokenize, df['text']))
text_vector

[['Hey', 'love', 'Bangladesh'],
 ['Good', 'afternoon', 'happy'],
 ['live', 'Germany'],
 ['Nice', 'meet', 'man'],
 ['iPhone']]

In [69]:
# Pin the RNG seed and train on a single worker thread so the embeddings, and
# therefore the most_similar() scores, are reproducible between runs
# (multi-threaded training introduces ordering jitter).
# min_count=1 keeps every word: each one occurs only once in this tiny corpus.
model = Word2Vec(text_vector, min_count=1, seed=1, workers=1)

In [70]:
model

<gensim.models.word2vec.Word2Vec at 0x79ba40c58a00>

In [71]:
model.wv.most_similar('happy')

[('meet', 0.1459505707025528),
 ('love', 0.05048206448554993),
 ('Nice', 0.041577354073524475),
 ('Germany', 0.03476494178175926),
 ('live', 0.01915225386619568),
 ('iPhone', 0.01613469421863556),
 ('Good', 0.008826175704598427),
 ('afternoon', 0.004842504393309355),
 ('Bangladesh', 0.0019510749261826277),
 ('Hey', -0.08382604271173477)]

In [72]:
model.wv.most_similar('afternoon')

[('Hey', 0.11117951571941376),
 ('love', 0.1088901162147522),
 ('iPhone', 0.09291724115610123),
 ('happy', 0.00484249135479331),
 ('meet', -0.0027540253940969706),
 ('Nice', -0.013679751195013523),
 ('Germany', -0.028491031378507614),
 ('live', -0.05774581804871559),
 ('Bangladesh', -0.09326908737421036),
 ('man', -0.11555545777082443)]