In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hemanthanne/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hemanthanne/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
df = pd.read_csv('spam.csv')
df.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


#### Few primary pre-processing steps on the data.

In [3]:
df = df.iloc[:, :-3]
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df = df.rename(columns=
               {'v1': 'target_label',
                'v2': 'text'
                })
df.head()

Unnamed: 0,target_label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


- Encoding target using LabelEncoder

In [5]:
from sklearn.preprocessing import LabelEncoder

lbl_enc = LabelEncoder()

df['target_label'] = lbl_enc.fit_transform(df['target_label'])

df.head()

Unnamed: 0,target_label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.duplicated().sum()

403

In [7]:
df = df.drop_duplicates(keep='first')

In [8]:
df.duplicated().sum()

0

- Removing digits and whitespaces

In [9]:
def clean(text):
    text = re.sub(r'\d+','',text)
    text=text.lower()
    text = re.sub(r'\s+',' ',text)
    return text


df['text'] = df['text'].apply(clean)

#### Tokenizing and Stemming (using PortStemmer) input text.

In [10]:
from nltk.stem.porter import PorterStemmer
import string

po_st = PorterStemmer()

def transform_text(text):
    """ Removes stopwords, punctuation and special characters. """
    stop_words = set(stopwords.words('english'))
    text = nltk.word_tokenize(text)

    x=[]
    for i in text:
        if i.isalnum() and i not in stop_words and i not in string.punctuation:
            x.append(i)

    text = x[:]
    x.clear()

    text = [po_st.stem(word) for word in text]

    return " ".join(text)


    

In [11]:
df['transformed_text'] = df['text'].apply(transform_text)
df.head()

Unnamed: 0,target_label,text,transformed_text
0,0,"go until jurong point, crazy.. available only ...",go jurong point crazi avail bugi n great world...
1,0,ok lar... joking wif u oni...,ok lar joke wif u oni
2,1,free entry in a wkly comp to win fa cup final ...,free entri wkli comp win fa cup final tkt st m...
3,0,u dun say so early hor... u c already then say...,u dun say earli hor u c alreadi say
4,0,"nah i don't think he goes to usf, he lives aro...",nah think goe usf live around though


#### Converting the input text to vectors with tfidf transformation.

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=500)

X = tfidf.fit_transform(df['transformed_text'])
# X = tfidf.fit_transform(df['transformed_text']).toarray()
y = df['target_label']



In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=2, stratify=y)

#### Training 4 Models to find the best performing based on accuracy.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

In [15]:
lr = LogisticRegression(solver = 'liblinear', penalty='l1')
dtc = DecisionTreeClassifier(max_depth=5)
adab = AdaBoostClassifier(n_estimators=50, random_state= 2)
rfc = RandomForestClassifier(n_estimators= 50, random_state=2)

In [None]:
classifiers = {
    'LR': lr,
    'DTC': dtc,
    'ADAB': adab,
    'RFC': rfc
}

from sklearn.metrics import accuracy_score, precision_score
def training(classifiers, X_train , X_test, y_train, y_test):
    for name, classifiers in classifiers.items():
        classifiers.fit(X_train, y_train)
        y_pred = classifiers.predict(X_test)
        print(f"Accuracy for {name}:{accuracy_score(y_test, y_pred)} ")
        print(f"Precision for {name}:{precision_score(y_test, y_pred)} \n")

    return None

In [32]:
training(classifiers,X_train , X_test, y_train, y_test)

Accuracy for LR:0.9748549323017408 
Precision for LR:0.9411764705882353 

Accuracy for DTC:0.9352030947775629 
Precision for DTC:0.82 

Accuracy for ADAB:0.960348162475822 
Precision for ADAB:0.8813559322033898 

Accuracy for RFC:0.9845261121856866 
Precision for RFC:0.9914529914529915 



In [None]:
# Random Forest Classifier (rfc) is the best performer with high accuracy.


sentence = "Congratulations! You’ve won a $1000 gift card — click here to claim your prize now!"
sentence = clean(sentence)
sentence = transform_text(sentence)
sent = tfidf.transform([sentence]) # Because it expects a list of strings

pred = rfc.predict(sent)
print(f"The probability of the sentence being a spam is :{pred}")

The probability of the sentence being a spam is :[1]
