<a href="https://colab.research.google.com/github/Jatingpt/NLP/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
 #1. Text Cleaning and Preprocessing
 #2. Tokenization- Word Tokenization and Sentence Tokenization
 #3. Stopwords Removal
 #4. Lemmatization and Stemming
 #5. Named Entity Recognition using spaCy and NLTK
 #6. POS Tagging
 #7. TF-IDF Vectorization
 #8. n-grams
 #9. Text Classification using Naive Bayes
 #7. Word Embeddings: Word2Vec (Gensim)
 #8. Named Entity Recognition using spaCy
 #9. Sentiment Analysis using TextBlob
 #10. Text Generation using GPT-2 (Transformers)
 #11. Topic Modeling with LDA (Gensim)
 #12. POS Tagging

In [3]:
#Solution1 -                                     -- Text Cleaning and Preprocessing --
import re
import string
import pandas as pd

# Sample data
texts = ["Hello!!! This is @Jatin. I love NLP... #awesome", "Text-cleaning is crucial :)"]

def clean_text(text):
    text = text.lower()
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)  # Remove hashtags
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()
    return text

cleaned_texts = [clean_text(t) for t in texts]
print(cleaned_texts)

['hello this is i love nlp', 'textcleaning is crucial']


In [9]:
#Solution 2-                                    -- Tokenization --
sentence = "hey how are you, is everything good?"

from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')

wt = word_tokenize(sentence)
wt

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['hey', 'how', 'are', 'you', ',', 'is', 'everything', 'good', '?']

In [12]:
sentence = "Hello guys. I I hope you all are fine. We will meet together."
from nltk.tokenize import sent_tokenize

st = sent_tokenize(sentence)
st

['Hello guys.', 'I I hope you all are fine.', 'We will meet together.']

In [22]:
#Solution 3-                           -- Stopwords Removal --
sentence = "Hey hi, a bag is not enough for these things."
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

stop_words = stopwords.words("english")
stop_words

sentence = sentence.lower()

sentence = sentence.split()

for i in sentence:
  if i not in stop_words:
    print(i)

hey
hi,
bag
enough
things.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
#Solution 4                                      -- Lemmatization --
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

wl = WordNetLemmatizer()
wl.lemmatize("mice")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'mouse'

In [14]:
#Solution 4 Continued-                       -- Stemming --
from nltk.stem import LancasterStemmer, PorterStemmer, RegexpStemmer, SnowballStemmer

#Creating a variable for them
l = LancasterStemmer()
p = PorterStemmer()
r = RegexpStemmer('ing')
s = SnowballStemmer("english")

l.stem("Changing")
r.stem('meaning')
s.stem('changing')
p.stem('learning')

'learn'

In [None]:
#Solution 5-                               -- NER Using SpaCy --
# import spacy
# nlp = spacy.load("en_core_web_sm")

# text = "A tree is looking at the human, please don't cut me! I will give you 100$"
# doc = nlp(text)

# for ent in doc.ents:
#   print(ent.text, ent.label_)



import spacy
nlp = spacy.load('en_core_web_sm')

text = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)

In [28]:
#Solution 5-                                               -- NER Using NLTK --
import nltk
nltk.download('maxent_ne_chunker_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk import word_tokenize, pos_tag, ne_chunk

# Sample sentence
text = "Barack Obama was the 44th President of the United States. He was born in Hawaii."

# Step 1: Tokenize the sentence
words = nltk.word_tokenize(text)

# Step 2: Part-of-Speech (POS) tagging
pos_tags = nltk.pos_tag(words)

# Step 3: Named Entity Recognition (chunking)
named_entities = nltk.ne_chunk(pos_tags)

# Step 4: Print Named Entities
print("Named Entities found:")
for chunk in named_entities:
    if hasattr(chunk, 'label'):
        print(f"{chunk.label()} : {' '.join(c[0] for c in chunk)}")

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


Named Entities found:
PERSON : Barack
PERSON : Obama
GPE : United States
GPE : Hawaii


In [30]:
#Solution 6-                                               -- POS Tagging --
from nltk.tokenize import word_tokenize
sentence = "Hey ravi, let us go Goa."
w = word_tokenize(sentence)
w

from nltk import pos_tag
p = pos_tag(w)
p

[('Hey', 'NNP'),
 ('ravi', 'NN'),
 (',', ','),
 ('let', 'VB'),
 ('us', 'PRP'),
 ('go', 'VB'),
 ('Goa', 'NNP'),
 ('.', '.')]

In [31]:
#Solution 7-                             -- TFIDF --
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = ["I love NLP and Machine Learning", "NLP is a part of artificial intelligence."]
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(corpus)

print(vectorizer.get_feature_names_out())
print(x.toarray())

['and' 'artificial' 'intelligence' 'is' 'learning' 'love' 'machine' 'nlp'
 'of' 'part']
[[0.47107781 0.         0.         0.         0.47107781 0.47107781
  0.47107781 0.33517574 0.         0.        ]
 [0.         0.4261596  0.4261596  0.4261596  0.         0.
  0.         0.30321606 0.4261596  0.4261596 ]]


In [45]:
#Solution 6                                    -- Text Classificationn Using Naive Bayes --
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

data = pd.DataFrame({
    'text': ['I love python', 'NLP is awesome', 'I hate bugs', 'Debugging is painful'],
    'label': [1, 1, 0, 0]  # 1: Positive, 0: Negative
})

x = TfidfVectorizer().fit_transform(data["text"])
y = df['label']

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size= 0.25)
model = MultinomialNB()
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [1]:
#Solution 7-                                    -- Word Embeddings(Word2Vec) --
!pip install gensim

from gensim.models import Word2Vec

# Small training data (list of tokenized sentences)
sentences = [['I', 'love', 'machine', 'learning'], ['NLP', 'is', 'awesome']]

# Create the Word2Vec model
model = Word2Vec(sentences, vector_size=10, window=2, min_count=1, workers=1)

# Print the word vector for 'learning'
print(model.wv['learning'])  # A 10-dimensional vector



#vector_size=10 → Each word will be represented as a 10-dimensional vector
#window=2 → Looks at 2 words before and after the target word
#min_count=1 → Even rare words (like those occurring once) will be included
#workers=1 → Uses 1 CPU core for training

[-0.07511582 -0.00930042  0.09538119 -0.07319167 -0.02333769 -0.01937741
  0.08077437 -0.05930896  0.00045162 -0.04753734]


In [13]:
#Solution 8-                               -- NER Using SpaCy --
# import spacy
# nlp = spacy.load("en_core_web_sm")

# text = "A tree is looking at the human, please don't cut me! I will give you 100$"
# doc = nlp(text)

# for ent in doc.ents:
#   print(ent.text, ent.label_)



import spacy
nlp = spacy.load('en_core_web_sm')

text = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [26]:
#Solution 8-                                   -- NER Using NLTK --
import nltk
nltk.download('maxent_ne_chunker_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk import word_tokenize, pos_tag, ne_chunk

# Sample sentence
text = "Barack Obama was the 44th President of the United States. He was born in Hawaii."

# Step 1: Tokenize the sentence
words = nltk.word_tokenize(text)

# Step 2: Part-of-Speech (POS) tagging
pos_tags = nltk.pos_tag(words)

# Step 3: Named Entity Recognition (chunking)
named_entities = nltk.ne_chunk(pos_tags)

# Step 4: Print Named Entities
print("Named Entities found:")
for chunk in named_entities:
    if hasattr(chunk, 'label'):
        print(f"{chunk.label()} : {' '.join(c[0] for c in chunk)}")

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


Named Entities found:
PERSON : Barack
PERSON : Obama
GPE : United States
GPE : Hawaii


In [16]:
#Solution 9-                                    -- Sentiment Analysis (Using TextBlob) --
from textblob import TextBlob
sentence = "I hate you"

blob = TextBlob(sentence)
print(blob.sentiment)

Sentiment(polarity=-0.8, subjectivity=0.9)


In [15]:
#Solution 10-                                    -- Text Generation Using Transformers (GPT 2) --

from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='gpt2')
set_seed(42)
print(generator("NLP is the future of", truncation=True,  max_length=30, num_return_sequences=1)[0]['generated_text'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=30) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


NLP is the future of science. A growing number of scientists are beginning to realise that it is possible to make progress, albeit with the aid of some very different methods.

This section should not be considered as a comprehensive description of the methods that are currently used to improve the understanding of quantum mechanics. Instead, the sections will be expanded in a way that allows readers to find out about the current state of knowledge.

1. Introduction

In this section we will be discussing the various techniques that scientists use to improve their understanding of the quantum field of view. We will discuss the various methods that scientists use to improve their understanding of the quantum field of view, but first, we will discuss the various methods that scientists use to improve their understanding of the quantum field of view.

2. Quantum theory and quantum optics

According to quantum theory, the field of view of a star is the same as that of a star. When a star pa

In [1]:
#Solution 11-                                    -- Text Similarity --
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

texts = ["I love NLP", "I enjoy natural language processing"]
vec = TfidfVectorizer().fit_transform(texts)
sim = cosine_similarity(vec)
print(sim)


[[1. 0.]
 [0. 1.]]


In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import LancasterStemmer, PorterStemmer, SnowballStemmer, RegexpStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
sentence = "The Importance of Saving Water. My Favorite Book and What It Taught Me"
tok = word_tokenize(sentence)
tok

pt = pos_tag(tok)
pt

[('The', 'DT'),
 ('Importance', 'NNP'),
 ('of', 'IN'),
 ('Saving', 'NNP'),
 ('Water', 'NNP'),
 ('.', '.'),
 ('My', 'PRP$'),
 ('Favorite', 'JJ'),
 ('Book', 'NNP'),
 ('and', 'CC'),
 ('What', 'WP'),
 ('It', 'PRP'),
 ('Taught', 'VBD'),
 ('Me', 'NNP')]

In [None]:
l = LancasterStemmer()
a = "changing"
root = l(a)

TypeError: 'LancasterStemmer' object is not callable

##**What is NLP?**

##**NLP is a technology which is used by machine to understand, analyse and manipulate human language.**

##**It is a combination of Computer Science, Artificial Intelligence and Human Language.**

##**Applications of NLP- Alexa, Siri, Google Assistance, Google Translator, To checking the span messages etc.**

##**Components of NLP->**
                          1. NLU(Natural Language Understanding)
                          2. NLG(Natural Language Generation)

1. NLU- It is working on the probability of texts or searching that how many times that we have searched anything(e.g if we pressed "G" on google then it will automatically suggesting the "Google").

2. NLG- It is basically generating something. E.g Google Translator.

##**What are the challenges faced in NLP?**

There are many challenges faced in NLP.

1. Synonyms(Can't get the difference in Synos)
2. Contextual Words(Difference in between "Good" and "Better".)
3. Ambiguity(Hard to understand the emotion of the sentence.)
4. Lack of research and developement.




##**Libraries that used in NLP- ScikitLearn, NLTK, Spacy, Tensorflow etc.**

###The first major concern is from where we can get the data.

###So we can get the data from the company itself, The data from the APIs, Web Scrapping and we can also do the survey to get the data.

These are the steps to create a Pipeline or the procedure to create a NLP Model.

1.  DATA COLLECTION(From Google, Web, Company's Data.)

2.  Data Cleaning(Removing Stop Words, Punctuation Etc.)

3.  Data Preprocessing(Tokenization, Removing Digits, Creating Phrase)

4.  Feature Engineering
(Converting our data into binary numbers and removing the unimportant features.)

5.  Build Model/Modeling(We can build the models by using different libraries)

6.  Evaluation(Testing the models by Cross Validation, Random Cross Validation)

7.  Deployment(On different website, cloud, AWS, Azure.)

8.  Monetering and Updating(We can update and monitor our models in AWS or on Clouds)

##**Tokenization**

##**What is PUNKT**- It is a module called PUNKT available in the NLTK. NLTK (Natural Language Toolkit) is used in Python to implement programs under the domain of Natural Language Processing. It contains a variety of libraries for various purposes like text classification, parsing, stemming, tokenizing, etc.


##In NLTK, PUNKT is an unsupervised trainable model, which means it can be trained on unlabeled data (Data that has not been tagged with information identifying its characteristics, properties, or categories is referred to as unlabeled data.)

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

x = "The king of Mathura, a kingdom established by the Vrishni tribes. Ugrasena's son was Kamsa, who imprisoned Ugrasena and took over the kingdom"
w = word_tokenize(x)
w


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['The',
 'king',
 'of',
 'Mathura',
 ',',
 'a',
 'kingdom',
 'established',
 'by',
 'the',
 'Vrishni',
 'tribes',
 '.',
 'Ugrasena',
 "'s",
 'son',
 'was',
 'Kamsa',
 ',',
 'who',
 'imprisoned',
 'Ugrasena',
 'and',
 'took',
 'over',
 'the',
 'kingdom']

##**Parts of Speech.**

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
p = pos_tag(w)    #In this section by this library we can giving some scodes to the words. E.g The=DT, King=NN, of=IN like this which is
p

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('The', 'DT'),
 ('king', 'NN'),
 ('of', 'IN'),
 ('Mathura', 'NNP'),
 (',', ','),
 ('a', 'DT'),
 ('kingdom', 'NN'),
 ('established', 'VBN'),
 ('by', 'IN'),
 ('the', 'DT'),
 ('Vrishni', 'NNP'),
 ('tribes', 'NN'),
 ('.', '.'),
 ('Ugrasena', 'NNP'),
 ("'s", 'POS'),
 ('son', 'NN'),
 ('was', 'VBD'),
 ('Kamsa', 'NNP'),
 (',', ','),
 ('who', 'WP'),
 ('imprisoned', 'VBD'),
 ('Ugrasena', 'NNP'),
 ('and', 'CC'),
 ('took', 'VBD'),
 ('over', 'RP'),
 ('the', 'DT'),
 ('kingdom', 'NN')]

In [None]:
from nltk.tokenize import word_tokenize
var = "The king of Mathura, a kingdom established by the Vrishni tribes. Ugrasena's son was Kamsa, who imprisoned Ugrasena and took over the kingdom"
var_new = word_tokenize(var)
var_new

['The',
 'king',
 'of',
 'Mathura',
 ',',
 'a',
 'kingdom',
 'established',
 'by',
 'the',
 'Vrishni',
 'tribes',
 '.',
 'Ugrasena',
 "'s",
 'son',
 'was',
 'Kamsa',
 ',',
 'who',
 'imprisoned',
 'Ugrasena',
 'and',
 'took',
 'over',
 'the',
 'kingdom']

##**How to remove the Stop Words?**


In [None]:
var = "The king of Mathura, a kingdom established by the Vrishni tribes. Ugrasena's son was Kamsa, who imprisoned Ugrasena and took over the kingdom"

from nltk.tokenize import word_tokenize
var_new = word_tokenize(var)

from nltk.corpus import stopwords

import nltk
nltk.download('stopwords')

from string import punctuation

stop = stopwords.words("english")

stop_word_list = list(punctuation) + stop  #By this we got the list of punctuation and all the stop words in english.
stop_word_list

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~',
 'i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from

In [None]:
#Writing a function to get the tokens or words without stopwords and punctuation.
for i in var:
  if i not in stop_word_list:
    print(i)

T
h
e
 
k
n
g
 
f
 
M
h
u
r
 
 
k
n
g
 
e
b
l
h
e
 
b
 
h
e
 
V
r
h
n
 
r
b
e
 
U
g
r
e
n
 
n
 
w
 
K
 
w
h
 
p
r
n
e
 
U
g
r
e
n
 
n
 
k
 
v
e
r
 
h
e
 
k
n
g


##**Stemming and Lemmatization.**

##**Stemming**
###Stemming is a technique used to extract the base from the words by removing the affixes from them.

###There are four types of Stemming algorithms-
1. Porterstemmer
2. Regexpstemmer(We have to provide the default parameters for removing e.g "ing")
3. Snowballstemmer(It works in 15 different types of language.)
4. Lancasterstemmer.

We willl see all this with the help of examples.

##**Lemmatization**
##Lemmatization is same as Stemming but lemmatization gives us the meaningful word. The output of lemmatization is called 'Lemma'.

In [None]:
from nltk.stem import LancasterStemmer, RegexpStemmer, PorterStemmer, SnowballStemmer
l = LancasterStemmer()
r = RegexpStemmer("ing")
p = PorterStemmer
s = SnowballStemmer("english")

In [None]:
print(l.stem("changing"))
print(r.stem("changing"))
print(s.stem("changing"))
#print(p.stem("Changing"))
print(l.stem("studying"))
print(r.stem("studying"))
print(s.stem("studying"))
print(l.stem("mice"))

chang
chang
chang
study
study
studi
mic


In [None]:
##E.g of Lemmatization.
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()

wl.lemmatize("mice")

wl.lemmatize("studying")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'studying'

##**N-grams**
e.g An autosuggestions are coming while typing in Gboard keyboard.

It is working on the probability of the text that we are searching and using the most.

suppose, we types G -suggestion is -google(This is the example of Unigrams.)

same as bigrams and trigrams.


In [None]:
x = "I am bunty, i am a boy, i am a good person"

from nltk.tokenize import word_tokenize
w = word_tokenize(x)
w

['I',
 'am',
 'bunty',
 ',',
 'i',
 'am',
 'a',
 'boy',
 ',',
 'i',
 'am',
 'a',
 'good',
 'person']

In [None]:
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder, ngrams
b = BigramCollocationFinder.from_words(w)
print(b.ngram_fd)
print(b.ngram_fd.keys())

<FreqDist with 10 samples and 13 outcomes>
dict_keys([('I', 'am'), ('am', 'bunty'), ('bunty', ','), (',', 'i'), ('i', 'am'), ('am', 'a'), ('a', 'boy'), ('boy', ','), ('a', 'good'), ('good', 'person')])


In [None]:
t = TrigramCollocationFinder.from_words(w)
t.ngram_fd

FreqDist({(',', 'i', 'am'): 2, ('i', 'am', 'a'): 2, ('I', 'am', 'bunty'): 1, ('am', 'bunty', ','): 1, ('bunty', ',', 'i'): 1, ('am', 'a', 'boy'): 1, ('a', 'boy', ','): 1, ('boy', ',', 'i'): 1, ('am', 'a', 'good'): 1, ('a', 'good', 'person'): 1})

In [None]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from nltk.tokenize import word_tokenize
sentence = "The Importance of Saving Water My Favorite Book and What It Taught Me"

token = word_tokenize(sentence)
token


['The',
 'Importance',
 'of',
 'Saving',
 'Water',
 'My',
 'Favorite',
 'Book',
 'and',
 'What',
 'It',
 'Taught',
 'Me']

In [None]:
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger_eng')
pos_tg = pos_tag(token)
pos_tg

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


[('The', 'DT'),
 ('Importance', 'NNP'),
 ('of', 'IN'),
 ('Saving', 'NNP'),
 ('Water', 'NNP'),
 ('My', 'NNP'),
 ('Favorite', 'NNP'),
 ('Book', 'NNP'),
 ('and', 'CC'),
 ('What', 'WP'),
 ('It', 'PRP'),
 ('Taught', 'VBD'),
 ('Me', 'NNP')]

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
s = "The Importance of Saving Water. My Favorite Book and What It Taught Me"
sen_tok = sent_tokenize(s)
sen_tok



['The Importance of Saving Water.', 'My Favorite Book and What It Taught Me']

In [None]:
nltk.download("averaged_perceptron_tagger_eng")

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [None]:
from nltk import pos_tag
pos_tg = pos_tag(sen_tok)
pos_tg

[('The Importance of Saving Water.', 'NNP'),
 ('My Favorite Book and What It Taught Me', 'NNP')]