In [1]:
# corpus is a large and structured set of texts that are used for linguistic research  and analaysis.
# Documents are the single piece of text within the corpus.

text_data = '''Natural Language Processing (NLP) is a fascinating field that empowers computers to understand and generate human language. From analyzing social media sentiments to translating languages in real-time, NLP plays a crucial role in today's digital age. One of the fundamental tasks in NLP is tokenization, where sentences are broken down into individual tokens like words or phrases. This process facilitates further analysis such as identifying important keywords or phrases.

Stopwords are common words like "the", "is", "and" that occur frequently in a language but often carry little semantic meaning. Removing stopwords from text is essential to focus on meaningful content and improve the accuracy of NLP tasks. For instance, in sentiment analysis, stopwords are often filtered out to extract the true sentiment-bearing words.

Part-of-Speech (POS) tagging assigns grammatical categories to words in a sentence, such as nouns, verbs, adjectives, etc. This tagging helps in understanding the syntactic structure of sentences and is vital for tasks like parsing and information extraction.

Named Entity Recognition (NER) identifies and categorizes named entities in text into predefined categories such as names of persons, organizations, locations, dates, etc. For example, in the sentence "Apple is headquartered in Cupertino", NER would identify "Apple" as an organization and "Cupertino" as a location.

Practice with NLTK involves applying these techniques on text data to gain insights or perform specific tasks. By mastering tokenization, stopwords removal, POS tagging, and NER, you can enhance your ability to process and extract valuable information from text data efficiently.

'''
print(text_data)

Natural Language Processing (NLP) is a fascinating field that empowers computers to understand and generate human language. From analyzing social media sentiments to translating languages in real-time, NLP plays a crucial role in today's digital age. One of the fundamental tasks in NLP is tokenization, where sentences are broken down into individual tokens like words or phrases. This process facilitates further analysis such as identifying important keywords or phrases.

Stopwords are common words like "the", "is", "and" that occur frequently in a language but often carry little semantic meaning. Removing stopwords from text is essential to focus on meaningful content and improve the accuracy of NLP tasks. For instance, in sentiment analysis, stopwords are often filtered out to extract the true sentiment-bearing words.

Part-of-Speech (POS) tagging assigns grammatical categories to words in a sentence, such as nouns, verbs, adjectives, etc. This tagging helps in understanding the synta

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

# 1st step tokenization

#sent tokenizer
documents = sent_tokenize(text_data)


#word tokenizer
corpus = []
for i in range(len(documents)):
    words = word_tokenize(documents[i])
    corpus.append(words)

print(corpus)

[['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'that', 'empowers', 'computers', 'to', 'understand', 'and', 'generate', 'human', 'language', '.'], ['From', 'analyzing', 'social', 'media', 'sentiments', 'to', 'translating', 'languages', 'in', 'real-time', ',', 'NLP', 'plays', 'a', 'crucial', 'role', 'in', 'today', "'s", 'digital', 'age', '.'], ['One', 'of', 'the', 'fundamental', 'tasks', 'in', 'NLP', 'is', 'tokenization', ',', 'where', 'sentences', 'are', 'broken', 'down', 'into', 'individual', 'tokens', 'like', 'words', 'or', 'phrases', '.'], ['This', 'process', 'facilitates', 'further', 'analysis', 'such', 'as', 'identifying', 'important', 'keywords', 'or', 'phrases', '.'], ['Stopwords', 'are', 'common', 'words', 'like', '``', 'the', "''", ',', '``', 'is', "''", ',', '``', 'and', "''", 'that', 'occur', 'frequently', 'in', 'a', 'language', 'but', 'often', 'carry', 'little', 'semantic', 'meaning', '.'], ['Removing', 'stopwords', 'from', 'text',

In [3]:
#wordpunct_tokenizer and TreebankWordTokenizer

from nltk.tokenize import wordpunct_tokenize, TreebankWordTokenizer

wordpunct_corpus = wordpunct_tokenize(text_data)
print(wordpunct_corpus)


Treebankword_corpus = TreebankWordTokenizer().tokenize(text_data)
print(Treebankword_corpus)

['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'that', 'empowers', 'computers', 'to', 'understand', 'and', 'generate', 'human', 'language', '.', 'From', 'analyzing', 'social', 'media', 'sentiments', 'to', 'translating', 'languages', 'in', 'real', '-', 'time', ',', 'NLP', 'plays', 'a', 'crucial', 'role', 'in', 'today', "'", 's', 'digital', 'age', '.', 'One', 'of', 'the', 'fundamental', 'tasks', 'in', 'NLP', 'is', 'tokenization', ',', 'where', 'sentences', 'are', 'broken', 'down', 'into', 'individual', 'tokens', 'like', 'words', 'or', 'phrases', '.', 'This', 'process', 'facilitates', 'further', 'analysis', 'such', 'as', 'identifying', 'important', 'keywords', 'or', 'phrases', '.', 'Stopwords', 'are', 'common', 'words', 'like', '"', 'the', '",', '"', 'is', '",', '"', 'and', '"', 'that', 'occur', 'frequently', 'in', 'a', 'language', 'but', 'often', 'carry', 'little', 'semantic', 'meaning', '.', 'Removing', 'stopwords', 'from', 'text', 'is', 'essen

# Stemming and Lemmatization

In [4]:
# Stemming is all about converting the word into the word stem typically by removing the affixes while the keeping the stem meaningful in which final words are not meaningful
# Porterstemmer
# Snowballstemmer
# Regexpstemmer
# Stemming is helpful in sentimental analysis
 
from nltk.stem import PorterStemmer, SnowballStemmer, RegexpStemmer

porter_stem = PorterStemmer()
snowball_stem = SnowballStemmer('english')
regexp_stem = RegexpStemmer('$ing|le|s$', min=4)

#Porterstemmer
for word in wordpunct_corpus:
    print(f"{word} ===> {porter_stem.stem(word)}", end=', ')

print()
#Snowball stemmer
for word in wordpunct_corpus:
    print(f"{word}===>{snowball_stem.stem(word)}", end=', ')

print()
for word in wordpunct_corpus:
    print(f"{word} ===> {regexp_stem.stem(word)}", end=", ")


Natural ===> natur, Language ===> languag, Processing ===> process, ( ===> (, NLP ===> nlp, ) ===> ), is ===> is, a ===> a, fascinating ===> fascin, field ===> field, that ===> that, empowers ===> empow, computers ===> comput, to ===> to, understand ===> understand, and ===> and, generate ===> gener, human ===> human, language ===> languag, . ===> ., From ===> from, analyzing ===> analyz, social ===> social, media ===> media, sentiments ===> sentiment, to ===> to, translating ===> translat, languages ===> languag, in ===> in, real ===> real, - ===> -, time ===> time, , ===> ,, NLP ===> nlp, plays ===> play, a ===> a, crucial ===> crucial, role ===> role, in ===> in, today ===> today, ' ===> ', s ===> s, digital ===> digit, age ===> age, . ===> ., One ===> one, of ===> of, the ===> the, fundamental ===> fundament, tasks ===> task, in ===> in, NLP ===> nlp, is ===> is, tokenization ===> token, , ===> ,, where ===> where, sentences ===> sentenc, are ===> are, broken ===> broken, down ===>

In [5]:
# lemmatizer
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

for word in wordpunct_corpus:
    print(f"{word} ===> {lemmatizer.lemmatize(word)}", end=', ')

Natural ===> Natural, Language ===> Language, Processing ===> Processing, ( ===> (, NLP ===> NLP, ) ===> ), is ===> is, a ===> a, fascinating ===> fascinating, field ===> field, that ===> that, empowers ===> empowers, computers ===> computer, to ===> to, understand ===> understand, and ===> and, generate ===> generate, human ===> human, language ===> language, . ===> ., From ===> From, analyzing ===> analyzing, social ===> social, media ===> medium, sentiments ===> sentiment, to ===> to, translating ===> translating, languages ===> language, in ===> in, real ===> real, - ===> -, time ===> time, , ===> ,, NLP ===> NLP, plays ===> play, a ===> a, crucial ===> crucial, role ===> role, in ===> in, today ===> today, ' ===> ', s ===> s, digital ===> digital, age ===> age, . ===> ., One ===> One, of ===> of, the ===> the, fundamental ===> fundamental, tasks ===> task, in ===> in, NLP ===> NLP, is ===> is, tokenization ===> tokenization, , ===> ,, where ===> where, sentences ===> sentence, are

# Stop words

In [6]:
# tokenization ==> sent_tokenize, word_tokenize
#stemming ==> snowball_stemming
from nltk.corpus import stopwords

snow_corpus = []

documents = sent_tokenize(text_data)

for i in range(len(documents)):
    words = word_tokenize(documents[i])
    words = [snowball_stem.stem(word) for word in words if word not in set(stopwords.words('english'))]
    snow_corpus.append(words)

print(snow_corpus)

###########################################
from nltk.stem import WordNetLemmatizer

wordnet = WordNetLemmatizer()
word_net_corpus = []
for i in range(len(documents)):
    words = word_tokenize(documents[i])
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    word_net_corpus.append(words)

print(word_net_corpus)

[['natur', 'languag', 'process', '(', 'nlp', ')', 'fascin', 'field', 'empow', 'comput', 'understand', 'generat', 'human', 'languag', '.'], ['from', 'analyz', 'social', 'media', 'sentiment', 'translat', 'languag', 'real-tim', ',', 'nlp', 'play', 'crucial', 'role', 'today', "'s", 'digit', 'age', '.'], ['one', 'fundament', 'task', 'nlp', 'token', ',', 'sentenc', 'broken', 'individu', 'token', 'like', 'word', 'phrase', '.'], ['this', 'process', 'facilit', 'analysi', 'identifi', 'import', 'keyword', 'phrase', '.'], ['stopword', 'common', 'word', 'like', '``', "''", ',', '``', "''", ',', '``', "''", 'occur', 'frequent', 'languag', 'often', 'carri', 'littl', 'semant', 'mean', '.'], ['remov', 'stopword', 'text', 'essenti', 'focus', 'meaning', 'content', 'improv', 'accuraci', 'nlp', 'task', '.'], ['for', 'instanc', ',', 'sentiment', 'analysi', ',', 'stopword', 'often', 'filter', 'extract', 'true', 'sentiment-bear', 'word', '.'], ['part-of-speech', '(', 'pos', ')', 'tag', 'assign', 'grammat', 'c

# Parts of Speech Tagging and Named Entity Recognition

In [7]:
from nltk import pos_tag, ne_chunk

for word in word_net_corpus:
    data = pos_tag(word)
    print(f"{word} ===> {data}")
    ner = ne_chunk(data)
    print(f"{word} ===> {ner}")
    ner.draw()
    print("=="*30)

['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'fascinating', 'field', 'empowers', 'computer', 'understand', 'generate', 'human', 'language', '.'] ===> [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('fascinating', 'VBG'), ('field', 'NN'), ('empowers', 'NNS'), ('computer', 'NN'), ('understand', 'VBP'), ('generate', 'NN'), ('human', 'JJ'), ('language', 'NN'), ('.', '.')]
['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'fascinating', 'field', 'empowers', 'computer', 'understand', 'generate', 'human', 'language', '.'] ===> (S
  Natural/JJ
  Language/NNP
  Processing/NNP
  (/(
  (ORGANIZATION NLP/NNP)
  )/)
  fascinating/VBG
  field/NN
  empowers/NNS
  computer/NN
  understand/VBP
  generate/NN
  human/JJ
  language/NN
  ./.)
['From', 'analyzing', 'social', 'medium', 'sentiment', 'translating', 'language', 'real-time', ',', 'NLP', 'play', 'crucial', 'role', 'today', "'s", 'digital', 'age', '.'] ===> [('From', 'IN'), ('

# Solving a problem using Machine learning and NLP methods

# Topics covered are:
Data cleaning
1. tokenization ==> wordpunct_tokenizer
2. stopwords
3. lemmatization ==> wordnet_lemmatizer
4. Bag of words ==> normat bag of words.
5. TF-IDF Term Frequency and inverse document frequency.

In [8]:
# load the data
import pandas as pd

df = pd.read_csv(r"C:\Users\venkatesh\OneDrive\Documents\Learning_AI_ML\Krish_naik_NLP\smsspamcollection\SMSSpamCollection", sep='\t', names=['label', 'text'])
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\venkatesh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# data cleaning RLTSL

import re
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

wordnet = WordNetLemmatizer()
corpus = []
for i in range(len(df)):
    data = re.sub('[^a-zA-Z]', ' ', df['text'][i]) # Removing the unwanted characters
    data = data.lower() #lowering the sentences
    data = wordpunct_tokenize(data) # tokenizing ==> which is splitting into smaller units
    data = [ wordnet.lemmatize(word) for word in data if word not in set(stopwords.words('english'))]
    data = " ".join(data)
    corpus.append(data)

corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash pound txt csh send cost p day day tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw'

# First Train Test Split

In [11]:
from sklearn.model_selection import train_test_split

y = pd.get_dummies(df['label'])
y = y.iloc[:,0].values

X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2)

# Bag of words

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2500, ngram_range=(1, 2))

X_bow = cv.fit_transform(X_train).toarray() # type: ignore
X_test_bow = cv.transform(X_test).toarray() # type: ignore



# TF-IDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(max_features=2500, ngram_range=(1, 2))

X_tfidf = tv.fit_transform(X_train).toarray()
X__test_tfidf = tv.transform(X_test).toarray()


In [14]:
# function to train the Data with ml algorithm

def Apply_ml_techn(X_train, y_train, X_test, y_test):

    

    from sklearn.naive_bayes import MultinomialNB

    model = MultinomialNB().fit(X_train, y_train)

    y_pred = model.predict(X_test)

    from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

    cr = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    print(cr, cm, accuracy, sep="\n")



In [15]:
#passing the Bag of words vector data
Apply_ml_techn(X_bow, y_train, X_test_bow, y_test)


              precision    recall  f1-score   support

       False       0.96      0.92      0.94       161
        True       0.99      0.99      0.99       954

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

[[148  13]
 [  6 948]]
0.9829596412556054


In [16]:
#Passing the tfIdf vector data
Apply_ml_techn(X_tfidf, y_train, X__test_tfidf, y_test)


              precision    recall  f1-score   support

       False       1.00      0.83      0.91       161
        True       0.97      1.00      0.99       954

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.97      1115

[[134  27]
 [  0 954]]
0.9757847533632287
