In [1]:
# convert text to lowercase

# Text lowercase function
def lowercase_text(text):
    return text.lower()

# Test the function
input_str = "The 5 biggest countries by population in 2017 are China, India, United States, Indonesia, and Brazil."
print(lowercase_text(input_str))

the 5 biggest countries by population in 2017 are china, india, united states, indonesia, and brazil.


In [2]:
# remove numbers

import re

# Function to remove numbers
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

# Test the function
input_str = "There are 3 balls in this bag, and 12 in the other one."
print(remove_numbers(input_str))

There are  balls in this bag, and  in the other one.


In [3]:
# numbers to text

import inflect

# Initialize inflect engine
q = inflect.engine()

# Function to convert numbers to text
def convert_number(text):
    words = text.split()
    return ' '.join([q.number_to_words(word) if word.isdigit() else word for word in words])

# Test the function
input_str = "There are 3 balls in this bag, and 12 in the other one."
print(convert_number(input_str))

There are three balls in this bag, and twelve in the other one.


In [4]:
# remove punctuation

import string

# Function to remove punctuation
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# Test the function
input_str = "Hey, are you excited? :)"
print(remove_punctuation(input_str))

Hey are you excited 


In [5]:
# remove stopwords

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Function to remove stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words]

# Test the function
ex_text = "This is an example showing off stop word filtration."
print(remove_stopwords(ex_text))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['This', 'example', 'showing', 'stop', 'word', 'filtration', '.']


In [6]:
# stemming

from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

# Initialize stemmer
stemmer = PorterStemmer()

# Function to apply stemming
def stem_words(text):
    tokens = word_tokenize(text)
    return [stemmer.stem(word) for word in tokens]

# Test the function
text = "Data science uses scientific methods, algorithms, and many types of processes."
print(stem_words(text))

['data', 'scienc', 'use', 'scientif', 'method', ',', 'algorithm', ',', 'and', 'mani', 'type', 'of', 'process', '.']


In [7]:
# lemmtization

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# Download WordNet if not already downloaded
nltk.download('wordnet')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to apply lemmatization
def lemmatize_words(text):
    tokens = word_tokenize(text)
    return [lemmatizer.lemmatize(word, pos='v') for word in tokens]

# Test the function
text = "Data science uses scientific methods, algorithms, and many types of processes."
print(lemmatize_words(text))

[nltk_data] Downloading package wordnet to /root/nltk_data...


['Data', 'science', 'use', 'scientific', 'methods', ',', 'algorithms', ',', 'and', 'many', 'type', 'of', 'process', '.']


In [8]:
#POS tagging

from nltk import pos_tag
from nltk.tokenize import word_tokenize
import nltk

# Download POS tagger if not already downloaded
nltk.download('averaged_perceptron_tagger')

# Function to perform POS tagging
def pos_tagging(text):
    tokens = word_tokenize(text)
    return pos_tag(tokens)

# Test the function
text = "You just gave me a pen."
print(pos_tagging(text))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('You', 'PRP'), ('just', 'RB'), ('gave', 'VBD'), ('me', 'PRP'), ('a', 'DT'), ('pen', 'NN'), ('.', '.')]


In [9]:
# chunking

import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Function to perform chunking
def chunking(text, grammar):
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    parser = nltk.RegexpParser(grammar)
    tree = parser.parse(tagged_tokens)
    for subtree in tree.subtrees():
        print(subtree)

# Define a simple grammar for noun phrases (NP)
grammar = "NP: {<DT>?<JJ>*<NN>}"

# Test the function
text = "The little red parrot is flying in the sky."
chunking(text, grammar)

(S
  (NP The/DT little/JJ red/JJ parrot/NN)
  is/VBZ
  flying/VBG
  in/IN
  (NP the/DT sky/NN)
  ./.)
(NP The/DT little/JJ red/JJ parrot/NN)
(NP the/DT sky/NN)


In [10]:
# named entity recognition

from nltk import ne_chunk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import nltk

# Download NER packages if not already downloaded
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Function for Named Entity Recognition
def ner(text):
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    return ne_chunk(tagged_tokens)

# Test the function
text = "Barack Obama was the 44th President of the United States."
print(ner(text))

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...


(S
  (PERSON Barack/NNP)
  (PERSON Obama/NNP)
  was/VBD
  the/DT
  44th/JJ
  President/NNP
  of/IN
  the/DT
  (GPE United/NNP States/NNPS)
  ./.)


[nltk_data]   Unzipping corpora/words.zip.


In [11]:
# frequency distribution

from nltk import FreqDist
from nltk.tokenize import word_tokenize

# Function to calculate frequency distribution
def frequency_distribution(text):
    tokens = word_tokenize(text)
    fd = FreqDist(tokens)
    return fd

# Test the function
text = "Hello everyone. Welcome to GeeksforGeeks. You are studying NLP article."
fd = frequency_distribution(text)

# Display the frequency distribution
print(fd)
print("Frequency of 'everyone':", fd['everyone'])
print("Total unique words:", len(fd.keys()))

<FreqDist with 11 samples and 13 outcomes>
Frequency of 'everyone': 1
Total unique words: 11
