In [1]:
#Dividing a block/body of text into words or sentences is known as tokenization

import nltk
nltk.download('punkt')

block = "CSI-DYPIEMR is the Student Chapter of Computer Society of India in Dr. D. Y. Patil Pratishthan's Dr. D. Y. Patil Institute of Engineering, Management, and Research. Computer Society of India is a body of computer professionals in India. It was started on 6 March 1965 by a few computer professionals and has now grown to be the national body representing computer professionals. It has 72 chapters across India, 511 student branches, and 100,000 members."

print("This is word wise tokenization-:",'\n', nltk.word_tokenize(block), '\n')

print("x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x", '\n')

print("This is sentence wise tokenization-:",'\n', nltk.sent_tokenize(block))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
This is word wise tokenization-: 
 ['CSI-DYPIEMR', 'is', 'the', 'Student', 'Chapter', 'of', 'Computer', 'Society', 'of', 'India', 'in', 'Dr.', 'D.', 'Y.', 'Patil', 'Pratishthan', "'s", 'Dr.', 'D.', 'Y.', 'Patil', 'Institute', 'of', 'Engineering', ',', 'Management', ',', 'and', 'Research', '.', 'Computer', 'Society', 'of', 'India', 'is', 'a', 'body', 'of', 'computer', 'professionals', 'in', 'India', '.', 'It', 'was', 'started', 'on', '6', 'March', '1965', 'by', 'a', 'few', 'computer', 'professionals', 'and', 'has', 'now', 'grown', 'to', 'be', 'the', 'national', 'body', 'representing', 'computer', 'professionals', '.', 'It', 'has', '72', 'chapters', 'across', 'India', ',', '511', 'student', 'branches', ',', 'and', '100,000', 'members', '.'] 

x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x 

This is sentence wise token

In [2]:
# In the above example we have used two methods (word_tokenize & sent_tokenize) from the nltk library to demonstrate
# word wise tokenization and sentence wise tokenization. The punkt module is downloaded to aid in the recognition of
# punctuations.

In [3]:
# Stopwords are those types of words that don't have much meaningful contribution in the sentence like "a", "it's",
# "is", "the" etc. Presence of large amount of these types of words act as a form of noise in the dataset, that's why
# their removal is important

from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = stopwords.words('english') 
# This function contains the entire list of stop words present inside different languages, for our use case, we'll
#focus on english stopwords

token = nltk.word_tokenize(block)
cleaned_token = []
for word in token:
    if word not in stop_words:
        cleaned_token.append(word)
        
print("This is the unclean version-:",'\n',  token, '\n')

print("x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x", '\n')

print("This is the cleaned version-:",'\n', cleaned_token)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
This is the unclean version-: 
 ['CSI-DYPIEMR', 'is', 'the', 'Student', 'Chapter', 'of', 'Computer', 'Society', 'of', 'India', 'in', 'Dr.', 'D.', 'Y.', 'Patil', 'Pratishthan', "'s", 'Dr.', 'D.', 'Y.', 'Patil', 'Institute', 'of', 'Engineering', ',', 'Management', ',', 'and', 'Research', '.', 'Computer', 'Society', 'of', 'India', 'is', 'a', 'body', 'of', 'computer', 'professionals', 'in', 'India', '.', 'It', 'was', 'started', 'on', '6', 'March', '1965', 'by', 'a', 'few', 'computer', 'professionals', 'and', 'has', 'now', 'grown', 'to', 'be', 'the', 'national', 'body', 'representing', 'computer', 'professionals', '.', 'It', 'has', '72', 'chapters', 'across', 'India', ',', '511', 'student', 'branches', ',', 'and', '100,000', 'members', '.'] 

x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x-o-x 

This is the cleaned versi

In [4]:
# A sentence may contain words that convey the same meaning but are written in different forms, taking for example 
# verbs in different forms of tenses like 'running', 'ran', 'run', 'runs' ultimately convey the same meaning but 
# are written in different forms to suit the different types of tenses, for computer analysis these types of words 
# are kept under same section as their base form (which in this case will be "run") and this process is known as stemming

In [5]:
from nltk.stem import PorterStemmer

stemmer = nltk.PorterStemmer()

words = ['rain', 'rained', 'raining', 'rains']

stemmed = [stemmer.stem(word) for word in words]

print(stemmed)

['rain', 'rain', 'rain', 'rain']


In [6]:
# POS Tagging also known as parts of speech tagging refers to the process of tagging the tokens that we have created

In [7]:
from nltk import pos_tag 
nltk.download('averaged_perceptron_tagger')

tagged = nltk.pos_tag(cleaned_token)     

print(tagged)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[('CSI-DYPIEMR', 'JJ'), ('Student', 'NNP'), ('Chapter', 'NNP'), ('Computer', 'NNP'), ('Society', 'NNP'), ('India', 'NNP'), ('Dr.', 'NNP'), ('D.', 'NNP'), ('Y.', 'NNP'), ('Patil', 'NNP'), ('Pratishthan', 'NNP'), ("'s", 'POS'), ('Dr.', 'NNP'), ('D.', 'NNP'), ('Y.', 'NNP'), ('Patil', 'NNP'), ('Institute', 'NNP'), ('Engineering', 'NNP'), (',', ','), ('Management', 'NNP'), (',', ','), ('Research', 'NNP'), ('.', '.'), ('Computer', 'NNP'), ('Society', 'NNP'), ('India', 'NNP'), ('body', 'NN'), ('computer', 'NN'), ('professionals', 'NNS'), ('India', 'NNP'), ('.', '.'), ('It', 'PRP'), ('started', 'VBD'), ('6', 'CD'), ('March', 'NNP'), ('1965', 'CD'), ('computer', 'NN'), ('professionals', 'NNS'), ('grown', 'VBP'), ('national', 'JJ'), ('body', 'NN'), ('representing', 'VBG'), ('computer', 'NN'), ('professionals', 'NNS'), ('.', '.'), ('It', 

In [8]:
# Lemmatization is the process of finding the form of the related word in the dictionary. Lemmatization does not 
# simply chop off inflections. Instead, it uses lexical knowledge bases to get the correct base forms of words.

In [9]:
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = nltk.WordNetLemmatizer()

lemmatized = [lemmatizer.lemmatize(word) for word in cleaned_token]

print(lemmatized)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.
['CSI-DYPIEMR', 'Student', 'Chapter', 'Computer', 'Society', 'India', 'Dr.', 'D.', 'Y.', 'Patil', 'Pratishthan', "'s", 'Dr.', 'D.', 'Y.', 'Patil', 'Institute', 'Engineering', ',', 'Management', ',', 'Research', '.', 'Computer', 'Society', 'India', 'body', 'computer', 'professional', 'India', '.', 'It', 'started', '6', 'March', '1965', 'computer', 'professional', 'grown', 'national', 'body', 'representing', 'computer', 'professional', '.', 'It', '72', 'chapter', 'across', 'India', ',', '511', 'student', 'branch', ',', '100,000', 'member', '.']
