# Read in data

In [1]:
import pandas as pd

dataset = pd.read_csv('../Data/SMSSpamCollection.tsv', sep='\t', names=['label', 'message'])
dataset.head()

Unnamed: 0,label,message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


# Function for cleaning data

In [2]:
import string
import re
import nltk

stopwords = nltk.corpus.stopwords.words('english')

def clean_text(text):
    text = "".join([word for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text


# Apply CountVectorizer

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(dataset['message'])
print(X_counts.shape)
print(count_vect.get_feature_names_out())

(5568, 11519)
['' '0' '008704050406' ... 'ü' 'üll' '〨ud']


In [10]:
print(count_vect.get_feature_names_out())

['' '0' '008704050406' ... 'ü' 'üll' '〨ud']


# Apply CountVectorizer on a sample of data

In [11]:
data_sample = dataset[0:20]

count_vect_sample = CountVectorizer(analyzer=clean_text)
X_counts_sample = count_vect_sample.fit_transform(data_sample['message'])
print(X_counts_sample.shape)
print(count_vect_sample.get_feature_names_out())

(20, 233)
['08002986030' '08452810075over18s' '09061701461' '1' '100' '100000' '11'
 '12' '150pday' '16' '2' '20000' '2005' '21st' '3' '4' '4403LDNW1A7RW18'
 '4txtú120' '6days' '81010' '87077' '87121' '87575' '9' '900' 'A' 'Aft'
 'Ard' 'As' 'CASH' 'CLAIM' 'CSH11' 'Call' 'Callers' 'Callertune' 'Claim'
 'Co' 'Cost' 'Cup' 'DATE' 'ENGLAND' 'Eh' 'England' 'Even' 'FA' 'FREE'
 'Fine' 'Free' 'From' 'HAVE' 'HL' 'Had' 'He' 'I' 'Im' 'Is' 'Ive' 'Jackpot'
 'KL341' 'LCCLTD' 'Macedonia' 'May' 'Melle' 'Minnaminunginte' 'Mobile'
 'Nah' 'No' 'Nurungu' 'ON' 'Oh' 'Oru' 'POBOX' 'POBOXox36504W45WQ' 'Press'
 'Prize' 'R' 'Reply' 'SCOTLAND' 'SIX' 'SUNDAY' 'So' 'TC' 'Text' 'That'
 'The' 'Then' 'They' 'To' 'TryWALES' 'TsandCs' 'Txt' 'U' 'URGENT' 'Update'
 'Valid' 'Vettam' 'WAP' 'WILL' 'WINNER' 'WITH' 'XXXMobileMovieClub' 'Yes'
 'You' 'aids' 'already' 'anymore' 'apply' 'around' 'b' 'blessing'
 'breather' 'brother' 'call' 'callertune' 'camera' 'chances' 'claim'
 'click' 'code' 'colour' 'comin' 'comp' 'copy' 'credi

# Output vectorizer as sparse matrix

In [12]:
X_counts_sample

<20x233 sparse matrix of type '<class 'numpy.int64'>'
	with 259 stored elements in Compressed Sparse Row format>

In [13]:
X_counts_df = pd.DataFrame(X_counts_sample.toarray())
X_counts_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,223,224,225,226,227,228,229,230,231,232
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,1,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,0,0,0,0


In [14]:
X_counts_df.columns = count_vect_sample.get_feature_names_out()
X_counts_df

Unnamed: 0,08002986030,08452810075over18s,09061701461,1,100,100000,11,12,150pday,16,...,wet,win,wkly,wonderful,wont,word,words,wwwdbuknet,xxxmobilemovieclubcomnQJKGIGHJJGCBL,ü
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,1,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,0,0,0,0
