In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing Wordcloud for text visualization
from wordcloud import wordcloud

#Improting NLTK for text processing
import nltk
from nltk.corpus import stopwords

# Downloading the NLTK data
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\naman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\naman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [13]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True, axis=1)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
df.rename(columns={'v1': 'target', 'v2': 'text'}, inplace=True)
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Data Preprocessing

In [15]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
# Check the duplicates

df.duplicated().sum()

403

In [17]:
len(df)

5572

In [18]:
df = df.drop_duplicates(keep='first')
len(df)

5169

# Feature Engineering

In [19]:
# Importing porter stemmer for text stemming

from nltk.stem import PorterStemmer

# Importing string module for handling special characters
import string

ps = PorterStemmer()

In [20]:
# Lower case transformation and text processing function

def transform_text(text):

    # Lowercase the text
    text = text.lower()

    # Tokenize the text
    text = nltk.word_tokenize(text)

    # Removing special characters
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)

    # Removing stop words and punctuations
    text = y[:]
    y.clear()

    # Loop through the text and stem the words
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)

    # Stemming using porter stemmer
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))

    # Join the processed words
    return " ".join(y)
    

In [21]:
transform_text("Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...")

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [22]:
df['transformed_text'] = df['text'].apply(transform_text)
df.head()

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [23]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

tfid = TfidfVectorizer(max_features = 500)

In [24]:
x = tfid.fit_transform(df['transformed_text']).toarray()
y = df['target'].values

# Train Test Split

In [31]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, random_state=2)

# Model Training

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [27]:
svc = SVC(kernel = 'sigmoid', gamma = 1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth = 5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators = 50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)
xgb = XGBClassifier(n_estimators=50, random_state=2)

In [35]:
clfs = {
    'SVC': svc,
    'KNN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'Adaboost': abc,
    'Bgc': bc,
    'ETC': etc,
    'GBDT': gbdt,
    'xgb': xgb
}

# Model Evaluation

In [69]:
from sklearn.metrics import accuracy_score, precision_score
def train_classifier(clfs, x_train, y_train, x_test, y_test):
    clfs.fit(x_train, y_train)
    y_pred = clfs.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    return acc, prec

In [72]:
print(type(clfs))

clfs = {
    'SVC': SVC(),
    'RandomForestClassifier': RandomForestClassifier(),
    'LogisticRegression': LogisticRegression(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'MultinomialNB': MultinomialNB(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'BaggingClassifier': BaggingClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),

    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'XGBClassifier': XGBClassifier()

}

<class 'dict'>


In [73]:
accuracy_scores = []
precision_scores = []

for name, clf in clfs.items():
    accuracy, current_precision = train_classifier(clf, x_train, y_train, x_test, y_test)
    
    print()
    print(f"For: {name}")
    print("Accuracy:", accuracy)
    print("Precision:", current_precision)

    accuracy_scores.append(accuracy)
    precision_scores.append(current_precision)


For: SVC
Accuracy: 0.9767981438515081
Precision: 0.9854014598540146

For: RandomForestClassifier
Accuracy: 0.9752513534416086
Precision: 0.9645390070921985

For: LogisticRegression
Accuracy: 0.9667440061871616
Precision: 0.96875

For: KNeighborsClassifier
Accuracy: 0.9265274555297757
Precision: 0.9857142857142858

For: DecisionTreeClassifier
Accuracy: 0.9566898685228151
Precision: 0.8407643312101911

For: MultinomialNB
Accuracy: 0.97138437741686
Precision: 0.9632352941176471





For: AdaBoostClassifier
Accuracy: 0.9590100541376644
Precision: 0.9435483870967742

For: BaggingClassifier
Accuracy: 0.9590100541376644
Precision: 0.8819444444444444

For: GradientBoostingClassifier
Accuracy: 0.9613302397525135
Precision: 0.9448818897637795

For: ExtraTreesClassifier
Accuracy: 0.9783449342614076
Precision: 0.9655172413793104

For: XGBClassifier
Accuracy: 0.9682907965970611
Precision: 0.9552238805970149
