In [3]:
import os
import pandas as pd

In [7]:
file_path = os.path.join(os.getcwd(), 'Dataset/spam.csv')
df = pd.read_csv(file_path, encoding='ISO-8859-1')

In [26]:
df.shape

(5572, 5)

In [27]:
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [28]:
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'], inplace=True)

In [30]:
df.shape

(5572, 2)

In [46]:
df.columns.tolist()

['v1', 'v2']

In [51]:
df.groupby('v1').describe()

Unnamed: 0_level_0,v2,v2,v2,v2
Unnamed: 0_level_1,count,unique,top,freq
v1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [52]:
df['label_num'] = df.v1.map({'ham':0, 'spam':1})

In [54]:
df.head()

Unnamed: 0,v1,v2,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


## Data Cleaning and Preprocessing

In [32]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [34]:
corpus = []
for i in range(0, len(df)):
    review = re.sub('[^A-Za-z]', ' ', df['v2'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(i) for i in review if i not in stopwords.words('english')]
    corpus.append(' '.join(review))
    

In [37]:
corpus[:5]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though']

## Bag of Words (BoW)

In [55]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500, binary=True, ngram_range=(2,2)) # Max number of frequency 2500 and Makring Binary as True to work as Binary BoW
X = cv.fit_transform(corpus).toarray()

In [60]:
print(X.shape)
(cv.vocabulary_)

(5572, 2500)


{'ok lar': np.int64(1485),
 'free entri': np.int64(622),
 'entri wkli': np.int64(538),
 'wkli comp': np.int64(2426),
 'cup final': np.int64(374),
 'std txt': np.int64(1967),
 'txt rate': np.int64(2198),
 'rate appli': np.int64(1676),
 'freemsg hey': np.int64(652),
 'like fun': np.int64(1155),
 'per request': np.int64(1541),
 'request mell': np.int64(1732),
 'mell mell': np.int64(1286),
 'mell oru': np.int64(1287),
 'oru minnaminungint': np.int64(1518),
 'minnaminungint nurungu': np.int64(1315),
 'nurungu vettam': np.int64(1464),
 'vettam set': np.int64(2288),
 'set callertun': np.int64(1850),
 'callertun caller': np.int64(204),
 'caller press': np.int64(202),
 'press copi': np.int64(1643),
 'copi friend': np.int64(339),
 'friend callertun': np.int64(656),
 'winner valu': np.int64(2414),
 'valu network': np.int64(2286),
 'network custom': np.int64(1417),
 'custom select': np.int64(388),
 'select receivea': np.int64(1808),
 'reward claim': np.int64(1745),
 'claim call': np.int64(258),
 '

## N-grams