In [19]:
import os
import pandas as pd

In [20]:
# Resolve the CSV against the current working directory, so the notebook
# expects a `Dataset/` folder next to wherever the kernel was started.
working_dir = os.getcwd()
file_path = os.path.join(working_dir, 'Dataset/spam.csv')

# Decode as ISO-8859-1 (Latin-1) instead of pandas' UTF-8 default.
df = pd.read_csv(filepath_or_buffer=file_path, encoding='ISO-8859-1')

In [21]:
# (rows, columns) of the frame exactly as loaded, before any column is dropped.
df.shape

(5572, 5)

In [22]:
# Number of missing values per column; used to spot columns that are mostly empty.
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [23]:
# Remove the three "Unnamed" columns, which are almost entirely missing.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time no longer raises KeyError because
# the columns are already gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [24]:
# Re-check the shape to confirm only the columns were removed, not any rows.
df.shape

(5572, 2)

In [25]:
# Names of the columns that remain after the drop.
df.columns.tolist()

['v1', 'v2']

In [26]:
# Per-label summary of the message text: count, number of unique messages,
# the most frequent message and how often it occurs.
df.groupby('v1').describe()

Unnamed: 0_level_0,v2,v2,v2,v2
Unnamed: 0_level_1,count,unique,top,freq
v1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [27]:
# Encode the text label as an integer target: ham -> 0, spam -> 1.
label_to_num = {'ham': 0, 'spam': 1}
df['label_num'] = df['v1'].map(label_to_num)

In [28]:
# Preview the first rows to check the new label_num column against v1.
df.head()

Unnamed: 0,v1,v2,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


## Data Cleaning and Preprocessing

In [29]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Porter stemmer instance; no active (uncommented) code in the visible cells calls it.
ps = PorterStemmer()
# WordNet lemmatizer applied to each token when the cleaned corpus is built.
wl = WordNetLemmatizer()

In [30]:
# Build the cleaned corpus: one normalized string per message in df['v2'].
#
# The stop-word list is loaded once and kept as a set. Previously
# stopwords.words('english') was re-evaluated for every single token and
# searched as a list, which dominated the run time of this cell.
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^A-Za-z]')

corpus = []
# Iterate over the values directly so the loop does not depend on the frame
# having a default 0..n-1 index.
for message in df['v2']:
    # Replace everything except ASCII letters with spaces, lowercase, tokenize.
    tokens = non_letters.sub(' ', message).lower().split()
    # Drop stop words and lemmatize the rest. (Stemming with `ps.stem` is the
    # alternative normalization; lemmatization is the one in use.)
    lemmas = [wl.lemmatize(token) for token in tokens if token not in stop_words]
    corpus.append(' '.join(lemmas))
    

In [31]:
# Inspect the first five cleaned messages.
corpus[:5]

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though']

## Bag of Words (BoW)

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
# Binary bag-of-words over bigrams:
#   - ngram_range=(2, 2): features are two-word phrases only (no single words);
#   - binary=True: each feature records presence (1) rather than a count;
#   - max_features=2500: keep the 2500 most frequent bigrams across the corpus.
cv = CountVectorizer(
    ngram_range=(2, 2),
    binary=True,
    max_features=2500,
)
bow_sparse = cv.fit_transform(corpus)
X = bow_sparse.toarray()  # dense array, one row per message

In [33]:
# Matrix dimensions first, then the learned bigram -> column-index mapping
# (shown as the cell's rich output).
print(X.shape)
cv.vocabulary_

(5572, 2500)


{'ok lar': np.int64(1459),
 'free entry': np.int64(646),
 'entry wkly': np.int64(542),
 'wkly comp': np.int64(2433),
 'cup final': np.int64(412),
 'final tkts': np.int64(585),
 'std txt': np.int64(1941),
 'txt rate': np.int64(2198),
 'rate apply': np.int64(1665),
 'think go': np.int64(2041),
 'freemsg hey': np.int64(665),
 'like fun': np.int64(1142),
 'treat like': np.int64(2133),
 'per request': np.int64(1549),
 'request melle': np.int64(1733),
 'melle melle': np.int64(1263),
 'melle oru': np.int64(1264),
 'oru minnaminunginte': np.int64(1498),
 'minnaminunginte nurungu': np.int64(1292),
 'nurungu vettam': np.int64(1439),
 'vettam set': np.int64(2302),
 'set callertune': np.int64(1822),
 'callertune caller': np.int64(193),
 'caller press': np.int64(191),
 'press copy': np.int64(1633),
 'copy friend': np.int64(392),
 'friend callertune': np.int64(668),
 'winner valued': np.int64(2420),
 'valued network': np.int64(2300),
 'network customer': np.int64(1393),
 'reward claim': np.int64(174

## TF-IDF

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features limited to the 500 most frequent terms in the corpus.
tfv = TfidfVectorizer(max_features=500)
tfidf_sparse = tfv.fit_transform(corpus)
# NOTE: this rebinds X, replacing the bag-of-words matrix built earlier.
X = tfidf_sparse.toarray()

In [17]:
import numpy as np
# Compact array display: show 30 items at each edge before eliding, avoid line
# wrapping with a very large line width, and print floats with at most three
# significant digits (so exact zeros render as "0").
np.set_printoptions(
    edgeitems=30,
    linewidth=100000,
    formatter={'float': lambda value: "%.3g" % value},
)

In [18]:
# Display the feature matrix; rendering follows the NumPy print options set in the previous cell.
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0.66, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.285, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0.244, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.37, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.476, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0