In [65]:
import pandas as pd
import re
import string
import nltk

pd.set_option('display.max_colwidth', 100)

data = pd.read_csv('../Data/SMSSpamCollection.txt', sep='\t', header=None)
data.columns = ['label', 'Content']

en_stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data.head()

Unnamed: 0,label,Content
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [66]:
def clean_email(email):
    result = ''.join([word for word in email if word not in string.punctuation])
    tokens = re.split(r'\W+', result)
    text = [ps.stem(word) for word in tokens if word not in en_stopwords]
    return text

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorisation = TfidfVectorizer(analyzer=clean_email)
tfidf_final = tfidf_vectorisation.fit_transform(data['Content'])
print(tfidf_final.shape)

(5572, 8193)


In [69]:
print(tfidf_vectorisation.get_feature_names_out())

['' '0' '008704050406' ... 'ü' 'üll' '〨ud']


In [70]:
data0 = data[0:5]

tfidf_vectorisation0 = TfidfVectorizer(analyzer=clean_email)
tfidf_final0 = tfidf_vectorisation0.fit_transform(data0['Content'])
print(tfidf_final0.shape)

(5, 58)


In [71]:
print(tfidf_vectorisation0.get_feature_names_out())

['08452810075over18' '2' '2005' '21st' '87121' 'alreadi' 'amor' 'appli'
 'around' 'avail' 'buffet' 'bugi' 'c' 'cine' 'comp' 'crazi' 'cup' 'dont'
 'dun' 'e' 'earli' 'entri' 'fa' 'final' 'free' 'go' 'goe' 'got' 'great'
 'hor' 'i' 'joke' 'jurong' 'la' 'lar' 'live' 'may' 'n' 'nah' 'ok' 'oni'
 'point' 'questionstd' 'ratetc' 'receiv' 'say' 'text' 'think' 'though'
 'tkt' 'txt' 'u' 'usf' 'wat' 'wif' 'win' 'wkli' 'world']


In [72]:
df_tfidf_final0 = pd.DataFrame(tfidf_final0.toarray())
df_tfidf_final0.columns = tfidf_vectorisation0.get_feature_names_out()
df_tfidf_final0

Unnamed: 0,08452810075over18,2,2005,21st,87121,alreadi,amor,appli,around,avail,...,though,tkt,txt,u,usf,wat,wif,win,wkli,world
0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.339393,0.0,0.0,0.420669,0.0,0.0,0.0
2,0.19245,0.19245,0.19245,0.19245,0.19245,0.0,0.0,0.19245,0.0,0.0,...,0.0,0.19245,0.19245,0.0,0.0,0.0,0.0,0.19245,0.19245,0.0
3,0.0,0.0,0.0,0.0,0.0,0.293564,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.473691,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,...,0.333333,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0
