# NLTK

### NLTK is the Natural Language Toolkit: a toolkit containing many packages and tools for natural language processing


In [3]:
import nltk
import os
# Directory that holds all the dataset files used in this notebook
base_path = "datasets"

def getDataset(dataset):
    """Return the path of the file named `dataset` inside the `base_path` directory."""
    dataset_path = os.path.join(base_path, dataset)
    return dataset_path


In [4]:
# Opens the NLTK downloader in a new window, where the packages to install can be selected (all of them were chosen here)
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [5]:
# List the names of the attributes and methods exposed by the nltk package
dir(nltk)

['ARLSTem',
 'ARLSTem2',
 'AbstractLazySequence',
 'AffixTagger',
 'AlignedSent',
 'Alignment',
 'AnnotationTask',
 'ApplicationExpression',
 'Assignment',
 'BigramAssocMeasures',
 'BigramCollocationFinder',
 'BigramTagger',
 'BinaryMaxentFeatureEncoding',
 'BlanklineTokenizer',
 'BllipParser',
 'BottomUpChartParser',
 'BottomUpLeftCornerChartParser',
 'BottomUpProbabilisticChartParser',
 'Boxer',
 'BrillTagger',
 'BrillTaggerTrainer',
 'CFG',
 'CRFTagger',
 'CfgReadingCommand',
 'ChartParser',
 'ChunkParserI',
 'ChunkScore',
 'Cistem',
 'ClassifierBasedPOSTagger',
 'ClassifierBasedTagger',
 'ClassifierI',
 'ConcordanceIndex',
 'ConditionalExponentialClassifier',
 'ConditionalFreqDist',
 'ConditionalProbDist',
 'ConditionalProbDistI',
 'ConfusionMatrix',
 'ContextIndex',
 'ContextTagger',
 'ContingencyMeasures',
 'CoreNLPDependencyParser',
 'CoreNLPParser',
 'Counter',
 'CrossValidationProbDist',
 'DRS',
 'DecisionTreeClassifier',
 'DefaultTagger',
 'DependencyEvaluator',
 'DependencyG

In [6]:
from nltk.corpus import stopwords
# Show every 25th English stopword, with the slice capped at index 500 (there are only 179 English stopwords overall, so we get just 8)
stopwords.words('english')[0:500:25]

['i', 'herself', 'been', 'with', 'here', 'very', 'doesn', 'won']

### Reading in text data and cleaning the text

In [7]:
# Read in the raw text file from a dataset
# The dataset is a collection of texts, each of which is labelled as either spam or ham
# A `with` block is used so the file handle is closed as soon as the content has been read
with open(getDataset("SMSSpamCollection.tsv"), encoding="utf8") as raw_file:
    raw_data = raw_file.read()
raw_data[0:500]

"ham\tI've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tNah I don't think he goes to usf, he lives around here though\nham\tEven my brother is not like to speak with me. They treat me like aid"

In [8]:
# Replace each tab with a newline, then split the data on newlines
# We do the replace first because \t separates the label (ham/spam) from the text, and judging by the text it doesn't contain \n inside it.
# So this creates a flat list of label, text, label, text, ...
parsed_data = raw_data.replace('\t','\n').split('\n')
parsed_data[0:5]

['ham',
 "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.",
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham']

In [9]:
# Even positions of parsed_data hold the labels, odd positions hold the message texts
label_list = parsed_data[0::2]
text_list = parsed_data[1::2]
print(label_list[0:5])
print(text_list[0:5])
# label_list ends with an extra empty entry, so we remove it to give both lists the same length for the next step
label_list.pop()

['ham', 'spam', 'ham', 'ham', 'ham']
["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.", "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", "Nah I don't think he goes to usf, he lives around here though", 'Even my brother is not like to speak with me. They treat me like aids patent.', 'I HAVE A DATE ON SUNDAY WITH WILL!!']


''

In [10]:
import pandas as pd
# Build a two-column DataFrame from the parallel label and text lists
full_corpus = pd.DataFrame({
    'label':label_list,
    "body_text":text_list
})
full_corpus.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


### Exploring the dataset

In [11]:
# Summarize the dataset: its dimensions, the spam/ham split and the number of missing values in each column
print(f"Corpus has {full_corpus.shape[0]} rows, and {full_corpus.shape[1]} columns")
print(f"Out of {full_corpus.shape[0]} rows: {len(full_corpus[full_corpus['label']=='spam'])} are spam and {len(full_corpus[full_corpus['label']=='ham'])} are ham")
print(f"Number of nulls in the label: {full_corpus['label'].isnull().sum()}")
print(f"Number of nulls in the text body: {full_corpus['body_text'].isnull().sum()}")

Corpus has 5570 rows, and 2 columns
Out of 5570 rows: 746 are spam and 4824 are ham
Number of nulls in the label: 0
Number of nulls in the text body: 0


# Implementing a ML pipeline to clean text

### Pre-processing Text Data - Preprocessing usually consists of 4 basic steps that reduce the corpus and simplify the task

In [12]:
# Limit how much of each column's content is shown when a dataframe is displayed
pd.set_option('display.max_colwidth',100)

# An alternative way to load the same file: let pandas split the tab-separated columns and name them
dataset = pd.read_csv(getDataset("SMSSpamCollection.tsv"),sep='\t',names=['label','body_text'])
dataset.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


### 1. Remove Punctuation

We need to remove punctuation because "I like NLP" means the same to us as "I like NLP." (with a period)

In [13]:
import string
# The characters that are treated as punctuation
string.punctuation
def remove_punct(text):
    """Return `text` with every character found in string.punctuation removed."""
    kept_chars = []
    for symbol in text:
        # Skip punctuation; everything else is kept in its original order
        if symbol in string.punctuation:
            continue
        kept_chars.append(symbol)
    # Rebuild the sentence by joining the kept characters with no separator
    return "".join(kept_chars)

# Strip the punctuation from every message and keep the result in a new column
dataset['body_text_clean'] = dataset['body_text'].apply(remove_punct)
dataset.head()


Unnamed: 0,label,body_text,body_text_clean
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL


### 2. Tokenization

In tokenization, we split sentences into a list of words so the model analyzes each word individually instead of analyzing a complete sentence as a single string

In [14]:
import re

def tokenize(text):
    """Split `text` into a list of word tokens.

    The text is split on every run of non-word characters (spaces, tabs,
    newlines, etc.). Empty strings are never returned, so text that starts
    or ends with a separator, or is empty, produces no blank tokens.
    """
    # Raw string so that \W reaches the regex engine instead of being an invalid string escape
    pieces = re.split(r'\W+', text)
    # re.split yields '' when the text starts or ends with a separator; drop those
    tokens = [piece for piece in pieces if piece]
    return tokens
# Lowercase each cleaned message before tokenizing it, and store the token lists in a new column
dataset['body_text_tokenized'] = dataset['body_text_clean'].apply(lambda x: tokenize(x.lower()))
dataset.head()

Unnamed: 0,label,body_text,body_text_clean,body_text_tokenized
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]"


### 3. Remove stopwords

Stopwords are words that don't carry a special meaning in the sentence, so we'd like to remove them in order to reduce the corpus
* stopwords could be - the, if, am etc.

In [15]:
# NOTE: this rebinds the name `stopwords` (imported earlier from nltk.corpus) to the list of English stopwords
stopwords = nltk.corpus.stopwords.words('english')
# Frozen copy of the list used for fast membership tests; later changes to `stopwords` are not reflected here
_stopword_set = frozenset(stopwords)

def remove_stopwords(tokenized_list):
    """Return the tokens of `tokenized_list` that are not English stopwords, in their original order."""
    text = [word for word in tokenized_list if word not in _stopword_set]
    return text
# Drop the stopwords from every token list and keep the result in a new column
dataset['body_text_nonstop'] = dataset['body_text_tokenized'].apply(remove_stopwords)
dataset.head()

Unnamed: 0,label,body_text,body_text_clean,body_text_tokenized,body_text_nonstop
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ...","[ive, searching, right, words, thank, breather, promise, wont, take, help, granted, fulfil, prom..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[even, brother, like, speak, treat, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]","[date, sunday]"


### The 4th step of the preprocess can be done in 2 ways: Stemming and Lemmatizing

Each way tries to achieve the same goal, but in a different way and with different accuracy

### 4.a. Stemming

In stemming, we will try to extract the base of words in order to find correlations between words and simplify the corpus

* For example, for "walker" and "walking", we would like to extract the stem "walk". This will help the model to understand the correlation between those words
* There are many types of stemmers, each with a different set of rules. We will use the most common stemmer - the Porter Stemmer. Even though it is the most common stemmer, its set of rules does have some problems, and sometimes it might be better to use a different stemmer

In [18]:
# Porter stemmer instance shared by every call to stemming()
ps = nltk.PorterStemmer()

def stemming(tokenized_text):
    """Return a list holding the Porter stem of each token, in the original order."""
    stems = []
    for token in tokenized_text:
        stems.append(ps.stem(token))
    return stems

# Stem the stopword-free tokens of every message and keep the result in a new column
dataset['body_text_stemmed'] = dataset['body_text_nonstop'].apply(stemming)

dataset.head()

Unnamed: 0,label,body_text,body_text_clean,body_text_tokenized,body_text_nonstop,body_text_stemmed
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ...","[ive, searching, right, words, thank, breather, promise, wont, take, help, granted, fulfil, prom...","[ive, search, right, word, thank, breather, promis, wont, take, help, grant, fulfil, promis, won..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[even, brother, like, speak, treat, like, aids, patent]","[even, brother, like, speak, treat, like, aid, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]","[date, sunday]","[date, sunday]"


### 4.b. Lemmatizing

In lemmatizing, we try to group together the inflected forms of a word so they can be analyzed as a single term, identified by the word's lemma

* Lemmatizers are more computationally expensive than stemmers, but more accurate
* Like stemmers, there are also many types of lemmatizers. We will use the WordNet Lemmatizer

In [22]:
# WordNet lemmatizer instance used in the comparison below
wn = nltk.WordNetLemmatizer()

# Difference between PorterStemmer and WordNet Lemmatizer
# In this example, the lemmatizer works better than the stemmer:
# the stemmer maps both words to the same stem, while the lemmatizer keeps them apart
print(ps.stem("meanness"))
print(ps.stem("meaning"))
print(wn.lemmatize("meanness"))
print(wn.lemmatize("meaning"))

mean
mean
meanness
meaning


In [23]:
def lemmatizing(tokenized_text):
    """Return a list holding the WordNet lemma of each token, in the original order."""
    lemmas = []
    for token in tokenized_text:
        lemmas.append(wn.lemmatize(token))
    return lemmas
# Lemmatize the stopword-free tokens of every message and keep the result in a new column
dataset['body_text_lemmatized'] = dataset['body_text_nonstop'].apply(lemmatizing)

dataset.head()

Unnamed: 0,label,body_text,body_text_clean,body_text_tokenized,body_text_nonstop,body_text_stemmed,body_text_lemmatized
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ...","[ive, searching, right, words, thank, breather, promise, wont, take, help, granted, fulfil, prom...","[ive, search, right, word, thank, breather, promis, wont, take, help, grant, fulfil, promis, won...","[ive, searching, right, word, thank, breather, promise, wont, take, help, granted, fulfil, promi..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]","[nah, dont, think, go, usf, life, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[even, brother, like, speak, treat, like, aids, patent]","[even, brother, like, speak, treat, like, aid, patent]","[even, brother, like, speak, treat, like, aid, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]","[date, sunday]","[date, sunday]","[date, sunday]"
