# Read in data

In [1]:
import pandas as pd

dataset = pd.read_csv('../Data/SMSSpamCollection.tsv', sep='\t', names=['label', 'message'])
dataset.head()

Unnamed: 0,label,message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


# Function for cleaning data

In [6]:
import string
import re
import nltk

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text


# Apply CountVectorizer

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(dataset['message'])
print(X_counts.shape)
print(count_vect.get_feature_names_out())

(5568, 8107)
['' '0' '008704050406' ... 'ü' 'üll' '〨ud']


In [8]:
print(count_vect.get_feature_names_out())

['' '0' '008704050406' ... 'ü' 'üll' '〨ud']


# Apply CountVectorizer on a sample of data

In [9]:
data_sample = dataset[0:20]

count_vect_sample = CountVectorizer(analyzer=clean_text)
X_counts_sample = count_vect_sample.fit_transform(data_sample['message'])
print(X_counts_sample.shape)
print(count_vect_sample.get_feature_names_out())

(20, 201)
['08002986030' '08452810075over18' '09061701461' '1' '100' '100000' '11'
 '12' '150pday' '16' '2' '20000' '2005' '21st' '3' '4' '4403ldnw1a7rw18'
 '4txtú120' '6day' '81010' '87077' '87121' '87575' '9' '900' 'aft' 'aid'
 'alreadi' 'anymor' 'appli' 'ard' 'around' 'b' 'bless' 'breather'
 'brother' 'call' 'caller' 'callertun' 'camera' 'cash' 'chanc' 'claim'
 'click' 'co' 'code' 'colour' 'comin' 'comp' 'copi' 'cost' 'credit' 'cri'
 'csh11' 'cup' 'custom' 'da' 'date' 'dont' 'eg' 'eh' 'england' 'enough'
 'entitl' 'entri' 'even' 'fa' 'feel' 'final' 'fine' 'finish' 'first'
 'free' 'friend' 'fulfil' 'go' 'goalsteam' 'goe' 'gonna' 'gota' 'grant'
 'ha' 'help' 'hl' 'home' 'hour' 'httpwap' 'im' 'info' 'ive' 'jackpot'
 'joke' 'k' 'kim' 'kl341' 'lar' 'latest' 'lccltd' 'like' 'link' 'live'
 'lor' 'lunch' 'macedonia' 'make' 'may' 'mell' 'membership' 'messag'
 'minnaminungint' 'miss' 'mobil' 'month' 'nah' 'name' 'nation' 'naughti'
 'network' 'news' 'next' 'nurungu' 'oh' 'oru' 'patent' 'pay' 'pe

# Output vectorizer as sparse matrix

In [10]:
X_counts_sample

<20x201 sparse matrix of type '<class 'numpy.int64'>'
	with 228 stored elements in Compressed Sparse Row format>

In [11]:
X_counts_df = pd.DataFrame(X_counts_sample.toarray())
X_counts_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,200
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [12]:
X_counts_df.columns = count_vect_sample.get_feature_names_out()
X_counts_df

Unnamed: 0,08002986030,08452810075over18,09061701461,1,100,100000,11,12,150pday,16,...,winner,wkli,wonder,wont,word,wwwdbuknet,xxxmobilemovieclub,xxxmobilemovieclubcomnqjkgighjjgcbl,ye,ü
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


# Apply CountVectorizer with N-Grams

In [13]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = " ".join([ps.stem(word) for word in tokens if word not in stopwords])
    return text

# Clean data
dataset['message_cleaned'] = dataset['message'].apply(lambda x: clean_text(x))
dataset.head()

Unnamed: 0,label,message,message_cleaned
0,ham,I've been searching for the right words to tha...,ive search right word thank breather promis wo...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
2,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though
3,ham,Even my brother is not like to speak with me. ...,even brother like speak treat like aid patent
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,date sunday


In [15]:
data_sample = dataset[0:20]

ngram_vect =  CountVectorizer(ngram_range=(2,2))
X_counts = ngram_vect.fit_transform(data_sample['message_cleaned'])
print(X_counts.shape)
print(ngram_vect.get_feature_names_out())

(20, 209)
['09061701461 claim' '100 20000' '100000 prize' '11 month' '12 hour'
 '150pday 6day' '16 tsandc' '20000 pound' '2005 text' '21st may'
 '4txtú120 poboxox36504w45wq' '6day 16' '81010 tc' '87077 eg'
 '87077 trywal' '87121 receiv' '87575 cost' '900 prize' 'aft finish'
 'aid patent' 'anymor tonight' 'appli 08452810075over18' 'appli repli'
 'ard smth' 'around though' 'bless time' 'breather promis' 'brother like'
 'call 09061701461' 'call mobil' 'caller press' 'callertun caller'
 'camera free' 'cash 100' 'chanc win' 'claim 81010' 'claim call'
 'claim code' 'click httpwap' 'click wap' 'co free' 'code kl341'
 'colour mobil' 'comp win' 'copi friend' 'cost 150pday' 'credit click'
 'cri enough' 'csh11 send' 'cup final' 'custom select' 'da stock'
 'date sunday' 'dont miss' 'dont think' 'dont want' 'eg england'
 'eh rememb' 'england 87077' 'england macedonia' 'enough today'
 'entitl updat' 'entri questionstd' 'entri wkli' 'even brother' 'fa 87121'
 'fa cup' 'feel way' 'final tkt' 'fine way