In [75]:
import pandas as pd
import re 
import string
import nltk

pd.set_option('display.max_colwidth', 200)

data = pd.read_csv("../Data/SMSSpamCollection.txt", sep='\t', header=None)
data.columns = ['label', 'Content']

en_stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()  

data.head()

Unnamed: 0,label,Content
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [77]:
def clean_email(email):
    result = "".join([word for word in email if word not in string.punctuation])
    tokens = re.split(r'\W+', result)
    text = " ".join([ps.stem(word) for word in tokens if word not in en_stopwords])
    return text

data['Stemmed_Content'] = data['Content'].apply(lambda x : clean_email(x))
data.head()

Unnamed: 0,label,Content,Stemmed_Content
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",go jurong point crazi avail bugi n great world la e buffet cine got amor wat
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri questionstd txt ratetc appli 08452810075over18
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives around here though",nah i dont think goe usf live around though


In [67]:
from sklearn.feature_extraction.text import CountVectorizer

ngram_vectorisation_full = CountVectorizer(ngram_range=(2, 2))
vect_final = ngram_vectorisation_full.fit_transform(data['Stemmed_Content'])
print(vect_final.shape)

(5572, 34162)


In [69]:
print(ngram_vectorisation_full.get_feature_names_out())

['008704050406 sp' '0089mi last' '0121 2025050' ... 'üll submit'
 'üll take' '〨ud even']


In [71]:
data0 = data[0:5]

ngram_vectorisation_0 = CountVectorizer(ngram_range=(2, 2))
vect_final_0 = ngram_vectorisation_0.fit_transform(data0['Stemmed_Content'])
print(vect_final_0.shape)

(5, 50)


In [72]:
print(ngram_vectorisation_0.get_feature_names_out())

['2005 text' '21st may' '87121 receiv' 'alreadi say' 'amor wat'
 'appli 08452810075over18' 'around though' 'avail bugi' 'buffet cine'
 'bugi great' 'cine got' 'comp win' 'crazi avail' 'cup final' 'dont think'
 'dun say' 'earli hor' 'entri questionstd' 'entri wkli' 'fa 87121'
 'fa cup' 'final tkt' 'free entri' 'go jurong' 'goe usf' 'got amor'
 'great world' 'hor alreadi' 'joke wif' 'jurong point' 'la buffet'
 'lar joke' 'live around' 'may 2005' 'nah dont' 'ok lar' 'point crazi'
 'questionstd txt' 'ratetc appli' 'receiv entri' 'say earli' 'text fa'
 'think goe' 'tkt 21st' 'txt ratetc' 'usf live' 'wif oni' 'win fa'
 'wkli comp' 'world la']


In [74]:
de_vect_final_0 = pd.DataFrame(vect_final_0.toarray())
de_vect_final_0.columns = ngram_vectorisation_0.get_feature_names_out()
de_vect_final_0

Unnamed: 0,2005 text,21st may,87121 receiv,alreadi say,amor wat,appli 08452810075over18,around though,avail bugi,buffet cine,bugi great,...,say earli,text fa,think goe,tkt 21st,txt ratetc,usf live,wif oni,win fa,wkli comp,world la
0,0,0,0,0,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1,1,1,0,0,1,0,0,0,0,...,0,1,0,1,1,0,0,1,1,0
3,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0
