In [80]:
import pandas as pd 

In [81]:
spam_df = pd.read_csv("Data\SMSSpamCollection.csv",sep='\t',header=None,names=['Label', 'SMS'])

In [82]:
spam_df.shape

(5572, 2)

In [83]:
spam_df.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [84]:
spam_df["Label"].unique()

array(['ham', 'spam'], dtype=object)

In [85]:
spam_df["Label"].replace("ham","non-spam",inplace=True)

In [86]:
spam_df.describe()

Unnamed: 0,Label,SMS
count,5572,5572
unique,2,5169
top,non-spam,"Sorry, I'll call later"
freq,4825,30


In [87]:
randomized_df = spam_df.sample(frac=1,random_state=1)
training_index = round(randomized_df.shape[0] * 0.8)
training_df = randomized_df[:training_index].reset_index(drop=True)
test_df = randomized_df[training_index:].reset_index(drop=True)

In [88]:
#clean SMS column
training_df["SMS"].replace("\W"," ",regex=True,inplace=True)
training_df["SMS"] = training_df["SMS"].str.lower()

In [89]:
training_df.head()

Unnamed: 0,Label,SMS
0,non-spam,yep by the pretty sculpture
1,non-spam,yes princess are you going to make me moan
2,non-spam,welp apparently he retired
3,non-spam,havent
4,non-spam,i forgot 2 ask ü all smth there s a card on ...


In [90]:
training_df["SMS list"] = training_df["SMS"].str.split()

In [91]:
vocabulary =[]
for row in training_df["SMS list"]:
    for element in row:
        vocabulary.append(element)


In [92]:
vocabulary = list(set(vocabulary))

In [94]:
word_counts_per_sms = {unique_word: [0] * len(training_df['SMS list']) for unique_word in vocabulary}
for index, sms in enumerate(training_df['SMS list']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [97]:
word_df = pd.DataFrame(word_counts_per_sms)
word_df.head()

Unnamed: 0,choice,english,konw,09064012103,non,urgh,woken,spoiled,wiskey,port,...,swtheart,joy,buy,teachers,hearin,aiyo,083,embarassed,heaven,vasai
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
merged_df = pd.concat([training_df, word_df], axis=1)

In [99]:
merged_df

Unnamed: 0,Label,SMS,SMS list,choice,english,konw,09064012103,non,urgh,woken,...,swtheart,joy,buy,teachers,hearin,aiyo,083,embarassed,heaven,vasai
0,non-spam,yep by the pretty sculpture,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,non-spam,yes princess are you going to make me moan,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,non-spam,welp apparently he retired,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,non-spam,havent,[havent],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,non-spam,i forgot 2 ask ü all smth there s a card on ...,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,non-spam,sorry i ll call later in meeting any thing re...,"[sorry, i, ll, call, later, in, meeting, any, ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,non-spam,babe i fucking love you too you know fuck...,"[babe, i, fucking, love, you, too, you, know, ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,spam,u ve been selected to stay in 1 of 250 top bri...,"[u, ve, been, selected, to, stay, in, 1, of, 2...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4456,non-spam,hello my boytoy geeee i miss you already a...,"[hello, my, boytoy, geeee, i, miss, you, alrea...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
