In [10]:
import pandas as pd
import re 
import string
import nltk

pd.set_option('display.max_colwidth', 200)

data = pd.read_csv("../Data/SMSSpamCollection.txt", sep='\t', header=None)
data.columns = ['label', 'Content']

en_stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()  

data.head()

Unnamed: 0,label,Content
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [35]:
def clean_email(email):
    result = ''.join([word for word in email if word not in string.punctuation])
    tokens = re.split(r'\W+', result)
    text = [word for word in tokens if word not in en_stopwords]
    return text

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

vectorisation_full = CountVectorizer(analyzer=clean_email)
vect_final = vectorisation_full.fit_transform(data['Content'])
print(vect_final.shape)

(5572, 11525)


In [24]:
print(vectorisation_full.get_feature_names_out())

['' '0' '008704050406' ... 'ü' 'üll' '〨ud']


In [25]:
data0 = data[0:5]

vectorisation_0 = CountVectorizer(analyzer=clean_email)
vect_final_0 = vectorisation_0.fit_transform(data0['Content'])
print(vect_final_0.shape)

(5, 59)


In [26]:
print(vectorisation_0.get_feature_names_out())

['08452810075over18s' '2' '2005' '21st' '87121' 'Available' 'Cine' 'Cup'
 'FA' 'Free' 'Go' 'I' 'Joking' 'May' 'Nah' 'Ok' 'Text' 'U' 'already'
 'amore' 'apply' 'around' 'buffet' 'bugis' 'c' 'comp' 'crazy' 'dont' 'dun'
 'e' 'early' 'entry' 'final' 'goes' 'got' 'great' 'hor' 'jurong' 'la'
 'lar' 'lives' 'n' 'oni' 'point' 'questionstd' 'rateTCs' 'receive' 'say'
 'think' 'though' 'tkts' 'txt' 'u' 'usf' 'wat' 'wif' 'win' 'wkly' 'world']


In [27]:
vect_final_0

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59 stored elements and shape (5, 59)>

In [29]:
vect_final_0.toarray()

array([[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
       [1, 1, 1, 1, 1, 0, 0, 1, 2, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 

In [31]:
df_vect_final_0 = pd.DataFrame(vect_final_0.toarray())
df_vect_final_0

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,58
0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
2,1,1,1,1,1,0,0,1,2,1,...,0,1,1,0,0,0,0,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [32]:
vect_final.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [33]:
x = pd.DataFrame(vect_final.toarray())
x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11515,11516,11517,11518,11519,11520,11521,11522,11523,11524
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
