# **Tokenization**

1. Word Tokenization

In [2]:
import nltk
from nltk.tokenize import word_tokenize

# Download the 'punkt' tokenizer data package before tokenizing.
nltk.download('punkt')

# Sample sentence to split into word and punctuation tokens.
text = "Natural language processing (NLP) is a machine learning technology that gives computers the ability to interpret, manipulate, and comprehend human language."

# Last expression of the cell, so the token list is displayed as the output.
word_tokenize(text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'is',
 'a',
 'machine',
 'learning',
 'technology',
 'that',
 'gives',
 'computers',
 'the',
 'ability',
 'to',
 'interpret',
 ',',
 'manipulate',
 ',',
 'and',
 'comprehend',
 'human',
 'language',
 '.']

In [3]:
# ! pip install nltk

2. Sentence Tokenization

In [4]:
from nltk.tokenize import sent_tokenize
# Short paragraph whose last sentence has no closing period.
text = "Hello everyone. In this colab we are makeing sentence tokenizer. Using NLTK library"

# Split the paragraph into sentences; the resulting list is the cell output.
sent_tokenize(text)

['Hello everyone.',
 'In this colab we are makeing sentence tokenizer.',
 'Using NLTK library']

# **Stemming**

1. Porter Stemmer

In [5]:
import nltk
from nltk.stem import PorterStemmer

porter_stemmer = PorterStemmer()

text = 'The runner was running more faster than others.'

# Tokenize first: the stemmer is applied to one token at a time.
# `words` is reused by the Snowball stemmer cell further down.
words = nltk.word_tokenize(text)

# Stem every token, keeping the original order.
stemmer = list(map(porter_stemmer.stem, words))

print('Words', words)
print("Stemmed Words:", stemmer)

Words ['The', 'runner', 'was', 'running', 'more', 'faster', 'than', 'others', '.']
Stemmed Words: ['the', 'runner', 'wa', 'run', 'more', 'faster', 'than', 'other', '.']


2. Snowball Stemmer

In [6]:
from nltk.stem import SnowballStemmer

# English Snowball stemmer, applied to the `words` token list that the
# Porter stemmer cell created.
snowball_stemmwer = SnowballStemmer('english')

snowball_stemmeed_words = list(map(snowball_stemmwer.stem, words))

print('Original Words:', words)
print("Afer Appling Snowball Stemmer:", snowball_stemmeed_words)

Original Words: ['The', 'runner', 'was', 'running', 'more', 'faster', 'than', 'others', '.']
Afer Appling Snowball Stemmer: ['the', 'runner', 'was', 'run', 'more', 'faster', 'than', 'other', '.']


In [7]:
from nltk.stem import SnowballStemmer

snowball_stem = SnowballStemmer('english')

# Stand-alone word list, so no tokenization step is needed here.
new_words = ['running', 'jumped', 'happily', 'quickly', 'foxes']

# Stem each word, preserving the input order.
new_stem = list(map(snowball_stem.stem, new_words))

print("Original_words:", new_words)
print("After Snowball Stemming:", new_stem)

Original_words: ['running', 'jumped', 'happily', 'quickly', 'foxes']
After Snowball Stemming: ['run', 'jump', 'happili', 'quick', 'fox']


# **Lemmatization**

In [9]:
import nltk
from nltk.stem import WordNetLemmatizer

# Download the 'wordnet' data package before lemmatizing.
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

print('rocks:', lemmatizer.lemmatize('rocks'))
# Called without `pos`, 'better' comes back unchanged ...
print('better:', lemmatizer.lemmatize('better'))
# ... while pos='a' reduces it to 'good'.
print('better:', lemmatizer.lemmatize('better', pos='a'))

[nltk_data] Downloading package wordnet to /root/nltk_data...


rocks: rock
better: better
better: good


# **Named Entity Recognition (NER)**

In [10]:
# !pip install spacy
# ! python -m spacy download en_core_web_sm

In [11]:
import spacy
import pandas as pd
# Load the small English spaCy pipeline.
nlp = spacy.load('en_core_web_sm')

content = "Trinamool Congress leader Mahua Moitra has moved the Supreme Court against her expulsion from the Lok Sabha over the cash-for-query allegations against her. Moitra was ousted from the Parliament last week after the Ethics Committee of the Lok Sabha found her guilty of jeopardising national security by sharing her parliamentary portal's login credentials with businessman Darshan Hiranandani."
doc = nlp(content)

# One line per detected entity: text | start offset | end offset | label.
for ent in doc.ents:
  print(ent.text, ent.start_char, ent.end_char, ent.label_, sep=" | ")

Congress | 10 | 18 | ORG
Mahua Moitra | 26 | 38 | PERSON
the Supreme Court | 49 | 66 | ORG
the Lok Sabha | 94 | 107 | PERSON
Moitra | 157 | 163 | ORG
Parliament | 184 | 194 | ORG
last week | 195 | 204 | DATE
the Ethics Committee | 211 | 231 | ORG
Darshan Hiranandani | 373 | 392 | PERSON


In [12]:
from spacy import displacy
# Visualize the entities of `doc` (built in the previous cell) using
# displaCy's entity style.
displacy.render(doc, style='ent')

# **TF-IDF**

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# Three toy documents that share some of their words.
d0 = 'hello my name is melbin mathew'
d1 = 'melbin mathew'
d2 = 'hello'

# Corpus handed to the TF-IDF vectorizer in the next cell.
string = [d0, d1, d2]


In [15]:
tfidf = TfidfVectorizer()

# Fit the vectorizer on the corpus and encode it in a single step.
result = tfidf.fit_transform(string)

# Mapping from each vocabulary term to its column index.
print('\nWords')
print(tfidf.vocabulary_)

# Sparse view: (document index, term index) followed by the weight.
print('\ntf-idf value')
print(result)

# Dense view: one row per document, one column per vocabulary term.
print('\n tfidf values in matrix form ')
print(result.toarray())


Words
{'hello': 0, 'my': 4, 'name': 5, 'is': 1, 'melbin': 3, 'mathew': 2}

tf-idf value
  (0, 2)	0.3494981241087058
  (0, 3)	0.3494981241087058
  (0, 1)	0.45954803293870056
  (0, 5)	0.45954803293870056
  (0, 4)	0.45954803293870056
  (0, 0)	0.3494981241087058
  (1, 2)	0.7071067811865476
  (1, 3)	0.7071067811865476
  (2, 0)	1.0

 tfidf values in matrix form 
[[0.34949812 0.45954803 0.34949812 0.34949812 0.45954803 0.45954803]
 [0.         0.         0.70710678 0.70710678 0.         0.        ]
 [1.         0.         0.         0.         0.         0.        ]]


# **Bag of Words**

In [16]:
import re
from sklearn.feature_extraction.text import CountVectorizer

# Three short example documents; they are cleaned with preprocess_text and
# then encoded with CountVectorizer further down in this cell.
corpus=[
    "Natural Language Processing is fun!",
    "I love learning about Natural Language Processing.",
    "Text analysis with Python is interesting."
]

def preprocess_text(text):
  """Normalize a document before it is vectorized.

  The text is lower-cased, every character that is not an ASCII letter
  (a-z) or whitespace is removed, each run of whitespace is collapsed into
  a single space, and leading/trailing whitespace is trimmed.

  Args:
    text: The raw document string.

  Returns:
    The cleaned string.
  """
  text = text.lower()
  # Drop digits, punctuation and any other non-letter character.
  text = re.sub(r'[^a-z\s]', '', text)
  # Collapse whitespace runs (spaces, tabs, newlines) into one space. The
  # result must be assigned back to `text`; storing it under another name
  # would silently discard the normalization.
  text = re.sub(r'\s+', ' ', text)
  return text.strip()

# Clean every document before counting words.
processed_corpus = list(map(preprocess_text, corpus))

# Learn the vocabulary and build the document-term count matrix in one step.
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(processed_corpus)

print("Vocabulary:", vectorizer.vocabulary_)
print("Encoded Documents:\n", x.toarray())
print("Feature Names:", vectorizer.get_feature_names_out())

Vocabulary: {'natural': 8, 'language': 5, 'processing': 9, 'is': 4, 'fun': 2, 'love': 7, 'learning': 6, 'about': 0, 'text': 11, 'analysis': 1, 'with': 12, 'python': 10, 'interesting': 3}
Encoded Documents:
 [[0 0 1 0 1 1 0 0 1 1 0 0 0]
 [1 0 0 0 0 1 1 1 1 1 0 0 0]
 [0 1 0 1 1 0 0 0 0 0 1 1 1]]
Feature Names: ['about' 'analysis' 'fun' 'interesting' 'is' 'language' 'learning' 'love'
 'natural' 'processing' 'python' 'text' 'with']


# **Count Vectorizer**

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw documents; no manual preprocessing is applied in this cell.
document = [
    "Natural Language Processing is fun!",
    "I love learning about Natural Language Processing.",
    "Text analysis with Python is interesting."
]

vectorizer = CountVectorizer()

# Step 1: learn the vocabulary from the documents.
vectorizer.fit(document)
print("Vocabulary:", vectorizer.vocabulary_)

# Step 2: encode the same documents with the fitted vocabulary.
vector = vectorizer.transform(document)
print('Encodered Document is:')
print(vector.toarray())

Vocabulary: {'natural': 8, 'language': 5, 'processing': 9, 'is': 4, 'fun': 2, 'love': 7, 'learning': 6, 'about': 0, 'text': 11, 'analysis': 1, 'with': 12, 'python': 10, 'interesting': 3}
Encodered Document is:
[[0 0 1 0 1 1 0 0 1 1 0 0 0]
 [1 0 0 0 0 1 1 1 1 1 0 0 0]
 [0 1 0 1 1 0 0 0 0 0 1 1 1]]


# **Word2Vec**

In [18]:
from gensim.models import Word2Vec
import gensim
from nltk.tokenize import sent_tokenize,word_tokenize
documents = [
    '''This is the first sentence.
    This is the second sentence.
    And the third one.
    Is this the first sentence?'''
]

# Training corpus: one lower-cased token list per sentence of every document.
data = [
    word_tokenize(sentence.lower())
    for document_text in documents
    for sentence in sent_tokenize(document_text)
]

# min_count=1 keeps every token, which matters for a corpus this small.
model1 = Word2Vec(sentences=data, vector_size=100, window=5, min_count=1)

# Rank the other vocabulary entries by the similarity of their vectors to
# the vector of 'first'.
similar_words = model1.wv.most_similar('first')
print(similar_words)

[('this', 0.13887983560562134), ('is', 0.13149002194404602), ('sentence', 0.06408977508544922), ('third', 0.060591839253902435), ('second', 0.020000359043478966), ('?', 0.01915227249264717), ('.', 0.009391157887876034), ('one', -0.057745810598134995), ('the', -0.05983472615480423), ('and', -0.10513810813426971)]


In [19]:
# Same similarity query as in the training cell, this time for 'third'.
# Overwrites the `similar_words` result of the previous query.
similar_words = model1.wv.most_similar('third')
print(similar_words)

[('sentence', 0.19912061095237732), ('is', 0.07497556507587433), ('first', 0.06059184670448303), ('and', 0.044689226895570755), ('.', 0.03364058583974838), ('the', 0.027060067281126976), ('second', 0.026806799694895744), ('?', 0.008826158009469509), ('this', -0.06900332123041153), ('one', -0.14454564452171326)]


In [20]:
# Look up a word vector
# Index the trained model's `wv` by word to get that word's vector. The raw
# vector can be used for further analysis or as input to other models.
vector = model1.wv['first']
print(vector)

[-8.7276464e-03  2.1302013e-03 -8.7356055e-04 -9.3192635e-03
 -9.4283195e-03 -1.4107444e-03  4.4324915e-03  3.7041404e-03
 -6.4988150e-03 -6.8731965e-03 -4.9995058e-03 -2.2868870e-03
 -7.2504235e-03 -9.6034976e-03 -2.7436807e-03 -8.3629973e-03
 -6.0389889e-03 -5.6710350e-03 -2.3441815e-03 -1.7070292e-03
 -8.9571662e-03 -7.3521322e-04  8.1526591e-03  7.6905736e-03
 -7.2062509e-03 -3.6668999e-03  3.1186105e-03 -9.5709022e-03
  1.4764669e-03  6.5245889e-03  5.7465271e-03 -8.7632257e-03
 -4.5172288e-03 -8.1403134e-03  4.5957237e-05  9.2638070e-03
  5.9734173e-03  5.0674030e-03  5.0611575e-03 -3.2429779e-03
  9.5523624e-03 -7.3565622e-03 -7.2705238e-03 -2.2654315e-03
 -7.7857525e-04 -3.2161637e-03 -5.9259695e-04  7.4889632e-03
 -6.9753168e-04 -1.6249712e-03  2.7444507e-03 -8.3592571e-03
  7.8559509e-03  8.5362643e-03 -9.5842667e-03  2.4463122e-03
  9.9051567e-03 -7.6659475e-03 -6.9670491e-03 -7.7366619e-03
  8.3960807e-03 -6.8134867e-04  9.1445800e-03 -8.1583736e-03
  3.7431547e-03  2.63509

In [21]:
# Compute word-pair similarity
# Score how close the vectors of two words are; this is the same score that
# most_similar('first') reports for 'second' in the earlier cell's output.
similarity = model1.wv.similarity('first', 'second')
print(similarity)

0.020000365
