### Basic Imports

In [2]:
## Importing necessary libraries
import numpy as np  ## For numerical operations
import pandas as pd  ## For data manipulation and analysis
import matplotlib.pyplot as plt   ## For data visualization
%matplotlib inline

## Importing wordCloud for text visualization
from wordcloud import WordCloud

## Importing spacy for NLP
import spacy
nlp = spacy.load('en_core_web_sm')

In [3]:
## Read the csv file
df = pd.read_csv('spam.csv')

## Display the first few rows of the dataframe
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [5]:
df.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace = True)

In [6]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Rename the column names
df.rename(columns = {'v1': 'target', 'v2': 'text'}, inplace = True)
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Data Processing

In [8]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
## Check duplicate values
df.duplicated().sum()

np.int64(403)

In [10]:
len(df)

5572

In [11]:
df = df.drop_duplicates(keep = 'first')
len(df)

5169

### Feature Engineering

In [12]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [13]:
## lowercase transformation and text preprocessing function

In [14]:
import re

In [20]:
def initial_text_transformation(series):
    series = series.str.lower()
    ## Keeping only the characters, removing the special characters, punctuations etc
    series = series.replace(r'[^A-Za-z\s]', '', regex = True)
    ## Normalize space
    series = series.replace(r'\s+', ' ', regex = True)
    ## Remove any trailling spaces
    series = series.str.strip()

    return series

In [21]:
def removing_stop_words(text):
    doc = nlp(text)
    words = [token.lemma_ for token in doc if not token.is_stop]
    return " ".join(words)

In [22]:
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [25]:
def transform_text(series):
    series = initial_text_transformation(series)
    series = series.apply(removing_stop_words)
    return series

In [27]:
df['transform_text'] = transform_text(df['text'])

In [28]:
df.head()

Unnamed: 0,target,text,transform_text
0,0,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,0,U dun say so early hor... U c already then say...,u dun early hor u c
4,0,"Nah I don't think he goes to usf, he lives aro...",nah not think go usf live


In [29]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
tfidf = TfidfVectorizer(max_features = 500)

In [30]:
X = tfidf.fit_transform(df['transform_text']).toarray()
y = df['target'].values

### Train Test Split

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

### Model Training

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [34]:
svc = SVC(kernel= "sigmoid", gamma  = 1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth = 5)
lrc = LogisticRegression(solver = 'liblinear', penalty = 'l1')
rfc = RandomForestClassifier(n_estimators = 50, random_state = 2 )
abc = AdaBoostClassifier(n_estimators = 50, random_state = 2)
bc = BaggingClassifier(n_estimators = 50, random_state = 2)
etc = ExtraTreesClassifier(n_estimators = 50, random_state = 2)
gbdt = GradientBoostingClassifier(n_estimators = 50, random_state = 2)    
xgb  = XGBClassifier(n_estimators = 50, random_state = 2)

In [35]:
clfs = {
    'SVC': svc,
    'KNN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'Adaboost': abc,
    'Bgc': bc,
    'ETC': etc,
    'GBDT': gbdt,
    'xgb': xgb
    
}

### Model Evaluation

In [36]:
from sklearn.metrics import accuracy_score, precision_score
def train_classifier(clfs, X_train, y_train, X_test, y_test):
    clfs.fit(X_train,y_train)
    y_pred = clfs.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    return accuracy , precision

In [None]:
accuracy_scores = []
precision_scores = []
for name , clfs in clfs.items():
    current_accuracy, current_precision = train_classifier(clfs, X_train, y_train, X_test, y_test)
    print()
    print("For: ", name)
    print("Accuracy: ", current_accuracy)
    print("Precision: ", current_precision)
    
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)


For:  SVC
Accuracy:  0.9632495164410058
Precision:  0.9495798319327731

For:  KNN
Accuracy:  0.9313346228239845
Precision:  0.9868421052631579

For:  NB
Accuracy:  0.9661508704061895
Precision:  0.9741379310344828

For:  DT
Accuracy:  0.9158607350096711
Precision:  0.7959183673469388

For:  LR
Accuracy:  0.9642166344294004
Precision:  0.9576271186440678

For:  RF
Accuracy:  0.9671179883945842
Precision:  0.944

For:  Adaboost
Accuracy:  0.8984526112185687
Precision:  0.9545454545454546

For:  Bgc
Accuracy:  0.9584139264990329
Precision:  0.8591549295774648
