In [14]:
# Loading data from nltk to explore the package
import nltk
# NOTE(review): called without arguments, download() appears to open NLTK's interactive
# downloader (see the "showing info ..." line in the output) - confirm. Passing a resource id,
# e.g. nltk.download('stopwords'), would fetch just that resource without manual interaction.
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [15]:
# List every name exposed at the top level of the nltk package (classes, functions, submodules).
# This gives an overview of the capabilities of nltk.
dir(nltk)

['AbstractLazySequence',
 'AffixTagger',
 'AlignedSent',
 'Alignment',
 'AnnotationTask',
 'ApplicationExpression',
 'Assignment',
 'BigramAssocMeasures',
 'BigramCollocationFinder',
 'BigramTagger',
 'BinaryMaxentFeatureEncoding',
 'BlanklineTokenizer',
 'BllipParser',
 'BottomUpChartParser',
 'BottomUpLeftCornerChartParser',
 'BottomUpProbabilisticChartParser',
 'Boxer',
 'BrillTagger',
 'BrillTaggerTrainer',
 'CFG',
 'CRFTagger',
 'CfgReadingCommand',
 'ChartParser',
 'ChunkParserI',
 'ChunkScore',
 'ClassifierBasedPOSTagger',
 'ClassifierBasedTagger',
 'ClassifierI',
 'ConcordanceIndex',
 'ConditionalExponentialClassifier',
 'ConditionalFreqDist',
 'ConditionalProbDist',
 'ConditionalProbDistI',
 'ConfusionMatrix',
 'ContextIndex',
 'ContextTagger',
 'ContingencyMeasures',
 'CoreNLPDependencyParser',
 'CoreNLPParser',
 'Counter',
 'CrossValidationProbDist',
 'DRS',
 'DecisionTreeClassifier',
 'DefaultTagger',
 'DependencyEvaluator',
 'DependencyGrammar',
 'DependencyGraph',
 'Depen

In [16]:
# Stopwords are words that are used very frequently but contribute little to the meaning
# of a sentence - they are sentiment neutral. Show the first ten English stopwords.
from nltk.corpus import stopwords
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [33]:
# Reading unstructured data - e.g. an email file or a social media post.
# Python reads such data as one long string of characters.

# Dataset: the SMS Spam Collection from the UCI Machine Learning Repository,
# a collection of text messages, each labelled either spam or ham.
# https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

# Read in raw data. The with-block closes the file handle as soon as the content has been read,
# and the explicit encoding keeps the result independent of the platform's default encoding.
with open('SMSSpamCollection.tsv', encoding='utf-8') as rawFile:
    rawdata = rawFile.read()
# Preview the first 500 characters: fields are separated by \t, messages by \n
rawdata[0:500]

"ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\nham\tOk lar... Joking wif u oni...\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tU dun say so early hor... U c already then say...\nham\tNah I don't think he goes to usf, he lives around here though\nspam\tFreeMsg Hey there darling it's been 3 week's now and no word bac"

In [34]:
# Every line has the form "label<TAB>message": break the text into lines and each line on its tab,
# so that labels and messages alternate in one flat list
parsedData = [field for line in rawdata.split('\n') for field in line.split('\t')]

In [35]:
# Peek at the first five entries of the flat list
parsedData[:5]

['ham',
 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'ham',
 'Ok lar... Joking wif u oni...',
 'spam']

In [36]:
# Separate the flat list again: even positions hold the labels, odd positions hold the messages
labelList, textList = parsedData[0::2], parsedData[1::2]

In [37]:
# First three labels, then the first three messages, one list per line
print(labelList[:3], textList[:3], sep='\n')

['ham', 'ham', 'spam']
['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]


In [38]:
# Now let's see how we can use these lists together
import pandas as pd

# Try to create a data frame that contains columns for labels and text.
# labelList and textList do not have the same length at this point, so pandas raises a
# ValueError. Catch it and show the message, so the problem is still demonstrated but the
# notebook keeps running from top to bottom.
try:
    fullCorpus = pd.DataFrame({
        'label': labelList,
        'body_list':textList
    })
except ValueError as err:
    print("ValueError: {}".format(err))

ValueError: arrays must all be same length

In [39]:
# Compare the number of labels with the number of messages, one count per line
print(len(labelList), len(textList), sep='\n')

5575
5574


In [40]:
# The split might have picked up an extra entry at the very end of the list.
# Look at the last five items in labelList to check.
print(labelList[-5:])

['ham', 'ham', 'ham', 'ham', '']


In [41]:
# labelList ends with an empty string (presumably produced by the file's trailing newline),
# so leave that last entry out when pairing the labels with the messages in a data frame
fullCorpus = pd.DataFrame(dict(label=labelList[:-1], body_list=textList))

In [42]:
# Display the first five rows of the data frame
fullCorpus.head(5)

Unnamed: 0,body_list,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [46]:
# Another way to read the dataset using pandas
import csv

# header=None states that there is no header row, so the first line is treated as normal data.
# quoting=csv.QUOTE_NONE makes pandas treat double quotes inside the messages as ordinary
# characters; with the default quote handling, a message containing an unbalanced double quote
# can swallow the lines that follow it, so the row count no longer matches the file.
data = pd.read_csv('SMSSpamCollection.tsv', sep='\t', header = None, quoting=csv.QUOTE_NONE)
data.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [48]:
# EXPLORE THE DATASET

# Shape of the dataset: the number of rows is the amount of data we have for building a model,
# the number of columns is how many fields we are dealing with
print("Input data has {} rows and {} columns".format(*fullCorpus.shape))

Input data has 5574 rows and 2 columns


In [49]:
# How many ham/spam are there?
# Comparing the label column with a value gives a boolean mask; summing it counts the matches
print("Out of {} rows, {} are spam and {} are ham".format(
    len(fullCorpus),
    (fullCorpus['label'] == 'spam').sum(),
    (fullCorpus['label'] == 'ham').sum(),
))

Out of 5574 rows, 747 are spam and 4827 are ham


In [52]:
# How much missing data is there?

# isnull() marks every missing cell with True; sum() then counts the True values per column
nullCounts = fullCorpus[['label', 'body_list']].isnull().sum()
print("No of null in label:{}".format(nullCounts['label']))
print("No of null in text:{}".format(nullCounts['body_list']))

No of null in label:0
No of null in text:0
