In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on; packages that are already
# present are reported as up-to-date rather than downloaded again.
# NOTE(review): nothing visible in this notebook appears to use Punkt (the
# tokenizer below is TreebankWordTokenizer) - confirm before removing.
nltk.download('punkt')
# WordNet data backs the WordNetLemmatizer used during preprocessing.
nltk.download('wordnet')
# English stop-word list used to filter tokens during preprocessing.
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the SMS spam collection. An explicit ISO-8859-1 encoding is passed
# (presumably because the file is not valid UTF-8 - verify if the source changes).
data = pd.read_csv(
    "/kaggle/input/sms-spam-collection-dataset/spam.csv",
    encoding="ISO-8859-1",
)
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Pick out the unnamed filler columns so they can be dropped.
# They are selected by their 'Unnamed' name prefix rather than by position:
# a positional `columns[-3:]` slice is only correct the first time the cell
# runs - once the filler columns are gone it would select the real label and
# message columns instead. With the prefix test a re-run yields an empty Index.
cols = data.columns[data.columns.str.startswith('Unnamed')]
cols

Index(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [4]:
# Remove the filler columns held in `cols`, modifying `data` in place.
data.drop(columns=cols, inplace=True)


In [5]:
# Sanity check after the drop: only the v1 (label) and v2 (text) columns should remain.
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Give the two remaining columns readable names: the ham/spam label first,
# the message text second.
# NOTE(review): 'Messgae' is a misspelling of 'Message', but the preprocessing
# cell looks the column up by this exact spelling - rename both places together.
data.columns = ['Conclusion','Messgae']
data.head()

Unnamed: 0,Conclusion,Messgae
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
tokenizer = nltk.tokenize.TreebankWordTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
# Keep the stop words under their own name. Rebinding `stopwords` would shadow
# the imported nltk corpus module and make this cell raise AttributeError when
# it is run a second time. A frozenset also gives constant-time membership tests.
STOP_WORDS = frozenset(stopwords.words('english'))


def TransformText(text):
    """Normalise a raw message into a space-separated string of cleaned tokens.

    Steps, in order: lower-case the text, split it with the Treebank
    tokenizer, lemmatize every token, drop tokens of two characters or fewer,
    and drop English stop words.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (an empty string when
        no token survives the filters).
    """
    # Lower-casing first so the stop-word comparison is case-insensitive.
    tokens = tokenizer.tokenize(text.lower())
    # Lemmatize before filtering, so length and stop-word checks see the lemma.
    lemmas = (lemmatizer.lemmatize(token) for token in tokens)
    return ' '.join(
        lemma for lemma in lemmas
        if len(lemma) > 2 and lemma not in STOP_WORDS
    )

In [8]:
# Run the text clean-up on every message and keep the result in a new column.
data['Message_processed'] = data['Messgae'].apply(TransformText)

In [9]:
# Compare the raw messages with their processed counterparts.
data.head()

Unnamed: 0,Conclusion,Messgae,Message_processed
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy.. available bugis great wor...
1,ham,Ok lar... Joking wif u oni...,lar ... joking wif oni ...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win cup final tkts 21st m...
3,ham,U dun say so early hor... U c already then say...,dun say early hor ... already say ...
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah n't think usf life around though


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over single words and word pairs (ngram_range=(1, 2)).
#   min_df=25  -> ignore terms found in fewer than 25 messages (rare words, typos)
#   max_df=0.7 -> ignore terms found in more than 70 percent of the messages
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    min_df=25,
    max_df=0.7,
)
# Dense count matrix: one row per message, one column per retained term.
features = vectorizer.fit_transform(data['Message_processed']).todense()
features.shape


(5572, 362)

In [11]:
# Normalise each message's counts so that every row sums to 1.
# np.asarray plus keepdims=True makes the division broadcast row-wise whatever
# container `features` currently is (numpy.matrix, ndarray or DataFrame), so
# the cell gives the same result when it is run again.
counts = np.asarray(features, dtype=float)
row_totals = counts.sum(axis=1, keepdims=True)

# Rows without any retained term have a total of 0. Their 0/0 result is left
# as NaN on purpose, so such rows remain identifiable as missing; only the
# floating-point warning for that division is silenced.
with np.errstate(divide='ignore', invalid='ignore'):
    features = pd.DataFrame(counts / row_totals)

  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
# Inspect the normalised matrix. Rows shown entirely as NaN (e.g. 5569) are
# messages whose counts summed to zero, so the division produced 0/0.
features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,352,353,354,355,356,357,358,359,360,361
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,,,,,,,,,,,...,,,,,,,,,,
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Target labels: the ham/spam column.
y = data['Conclusion']

In [14]:
# Find the rows that contain no missing value.
# Every column is checked (not just column 0), so a row is retained only when
# it is complete. The result is an array of positional row indexes.
idxs = np.flatnonzero(features.notna().all(axis=1).to_numpy())

In [15]:
# Positional indexes of the rows that will be kept.
idxs

array([   0,    1,    2, ..., 5568, 5570, 5571])

In [16]:
# Restrict the features and the labels to the same retained row positions so
# they stay aligned, then confirm that their lengths agree.
features, y = features.iloc[idxs, :], y.iloc[idxs]

(features.shape, y.shape)

((5203, 362), (5203,))

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    features,
    y,
    test_size=0.3,
    random_state=0,
)

(Xtrain.shape, Xtest.shape)

((3642, 362), (1561, 362))

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the fitted model and its score are reproducible across
# runs; 0 matches the seed already used for the train/test split.
model = RandomForestClassifier(random_state=0)
model.fit(Xtrain,Ytrain)

RandomForestClassifier()

In [19]:
# Mean accuracy of the random forest on the held-out test rows.
model.score(Xtest,Ytest )

0.9698910954516335

In [20]:
from sklearn.svm import SVC
# Second classifier for comparison: a support-vector classifier with default settings.
# NOTE(review): this rebinds `model`, so the random forest fitted earlier is no
# longer reachable under that name.
model = SVC()
model.fit(Xtrain,Ytrain)

SVC()

In [21]:
# Mean accuracy of the support-vector classifier on the held-out test rows.
model.score(Xtest,Ytest )

0.9762972453555413