# Data Preprocessing

In [1]:
#importing all neccesary libraries
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# import and preview datasets
data = pd.read_csv('spam.csv', encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# check dataset properties
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [4]:
# drop the last three axis, rename the first two columns
data = data.filter(["v1","v2"])
data.rename(columns={'v1':'Label','v2':'Message'},inplace=True)
data.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# download stopwords from nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\agbaj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# replace symbols, convert all texts to lowercase, split the texts(tokenize) and stemmize the words(another form of lammetizing)
ps = PorterStemmer() 
content = []
for i in range(0, data.shape[0]):
    review = re.sub('[^a-zA-Z]', ' ', data['Message'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    content.append(review)
    
    

In [7]:
# create two columns, one containing cleaned message content and the other cleaned content length
data['clean_msg']=np.empty((len(content),1))
for i in range(len(content)):
    data['clean_msg'][i]=content[i]
data['clean_msg_len']=data['clean_msg'].apply(len)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,Label,Message,clean_msg,clean_msg_len
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...,76
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni,21
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...,99
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say,35
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though,36


In [8]:
# Perform one-hot encoding on label column(ham = 1 and spam = 0)
data=pd.concat([data, pd.get_dummies(data['Label'])], axis=1)
data.drop(['Label'],axis=1,inplace=True)
data.drop(['spam'],axis=1,inplace=True)
data.rename(columns={'ham':'label'},inplace=True)
data.head()

Unnamed: 0,Message,clean_msg,clean_msg_len,label
0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...,76,1
1,Ok lar... Joking wif u oni...,ok lar joke wif u oni,21,1
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...,99,0
3,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say,35,1
4,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though,36,1


In [9]:
# Vectorization
# Creating X and Y: Assign clean_msg column to X, label column to Y, perform countvectorization on both X and Y
X=data['clean_msg']
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(content).toarray()

Y = data['label']
Y = np.array(Y)

# preview countvectorization of X and Y
print('X = \n', X, '\nY = \n', Y)

X = 
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] 
Y = 
 [1 1 0 ... 1 1 1]


# Model Building

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [41]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.3, random_state=1)

precisions={}
f1_scores = {}
accuracies ={}

In [20]:
from sklearn.linear_model import LogisticRegression

In [38]:
def train_fit(model,model_name):
    models = model(random_state=1)
    model_fit = models.fit(x_train,y_train)
    pred_model = model_fit.predict(x_test)

    _prec = precision_score(y_test,pred_model)
    _f1 = f1_score(y_test,pred_model)
    _acc = accuracy_score(y_test,pred_model)
    precisions["{} Precision".format(model_name)] = _prec
    f1_scores["{} f1 Score".format(model_name)]=_f1
    accuracies["{} accuracy".format(model_name)]=_acc
    print("F1 Score of {} is : {} \n".format(model_name,_f1),"Accuracy of {} is: {} \n".format(model_name,_acc), "Precision of {} is: {}".format(model_name,_prec))
    
    return _prec,_f1,_acc, pred_model

## Ada boost

In [43]:
from sklearn.ensemble import AdaBoostClassifier
ada_prec,ada_f1,ada_acc,ada_pred=train_fit(AdaBoostClassifier, "Ada Boost")

F1 Score of Ada Boost is : 0.9846678023850085 
 Accuracy of Ada Boost is: 0.9730861244019139 
 Precision of Ada Boost is: 0.9756920999324781


## XgBoost

In [45]:
from xgboost import XGBClassifier
xgb_prec,xgb_f1,xgb_acc,xgb_pred = train_fit(XGBClassifier,"XG Boost")

F1 Score of XG Boost is : 0.9894234049812352 
 Accuracy of XG Boost is: 0.9814593301435407 
 Precision of XG Boost is: 0.9817197020988491


## CatBoost

In [46]:
from catboost import CatBoostClassifier
cat_prec,cat_f1,cat_acc,cat_pred = train_fit(CatBoostClassifier,"CatBoost")

Learning rate set to 0.018421
0:	learn: 0.6707747	total: 285ms	remaining: 4m 44s
1:	learn: 0.6489146	total: 298ms	remaining: 2m 28s
2:	learn: 0.6301760	total: 310ms	remaining: 1m 42s
3:	learn: 0.6120115	total: 321ms	remaining: 1m 19s
4:	learn: 0.5921755	total: 334ms	remaining: 1m 6s
5:	learn: 0.5737187	total: 347ms	remaining: 57.4s
6:	learn: 0.5580802	total: 358ms	remaining: 50.9s
7:	learn: 0.5440548	total: 370ms	remaining: 45.9s
8:	learn: 0.5293128	total: 383ms	remaining: 42.1s
9:	learn: 0.5147388	total: 397ms	remaining: 39.3s
10:	learn: 0.4986351	total: 409ms	remaining: 36.7s
11:	learn: 0.4866808	total: 420ms	remaining: 34.6s
12:	learn: 0.4737143	total: 432ms	remaining: 32.8s
13:	learn: 0.4599686	total: 444ms	remaining: 31.3s
14:	learn: 0.4496341	total: 457ms	remaining: 30s
15:	learn: 0.4394598	total: 469ms	remaining: 28.8s
16:	learn: 0.4290957	total: 481ms	remaining: 27.8s
17:	learn: 0.4181380	total: 498ms	remaining: 27.2s
18:	learn: 0.4077533	total: 512ms	remaining: 26.4s
19:	learn

## Ensemble model building 

In [57]:
#Using the three models above in the voting classifier 
# It gets predictions for each data points then from the three classifiers and chooses the target with the most occurence 

from sklearn.ensemble import VotingClassifier 

ada_m = AdaBoostClassifier(random_state=1)
xgb_m =XGBClassifier(random_state=1)
cat_m =CatBoostClassifier(random_state=1)

ens_m = VotingClassifier(estimators =[("ada",ada_m),("xgb",xgb_m),("cat",cat_m)],voting="hard")
ens_mod = ens_m.fit(x_train,y_train)
ens_pred =ens_mod.predict(x_test)

ens_prec = precision_score(y_test,ens_pred)
ens_f1 = f1_score(y_test,ens_pred)
ens_acc = accuracy_score(y_test,ens_pred)
model_name = "ensemble model"
 

Learning rate set to 0.018421
0:	learn: 0.6707747	total: 43.7ms	remaining: 43.6s
1:	learn: 0.6489146	total: 90.6ms	remaining: 45.2s
2:	learn: 0.6301760	total: 123ms	remaining: 41s
3:	learn: 0.6120115	total: 149ms	remaining: 37s
4:	learn: 0.5921755	total: 167ms	remaining: 33.3s
5:	learn: 0.5737187	total: 188ms	remaining: 31.2s
6:	learn: 0.5580802	total: 205ms	remaining: 29.1s
7:	learn: 0.5440548	total: 225ms	remaining: 27.9s
8:	learn: 0.5293128	total: 242ms	remaining: 26.6s
9:	learn: 0.5147388	total: 261ms	remaining: 25.8s
10:	learn: 0.4986351	total: 275ms	remaining: 24.8s
11:	learn: 0.4866808	total: 292ms	remaining: 24.1s
12:	learn: 0.4737143	total: 307ms	remaining: 23.3s
13:	learn: 0.4599686	total: 324ms	remaining: 22.8s
14:	learn: 0.4496341	total: 341ms	remaining: 22.4s
15:	learn: 0.4394598	total: 364ms	remaining: 22.4s
16:	learn: 0.4290957	total: 382ms	remaining: 22.1s
17:	learn: 0.4181380	total: 396ms	remaining: 21.6s
18:	learn: 0.4077533	total: 411ms	remaining: 21.2s
19:	learn: 0.

In [60]:
precisions["{} Precision".format(model_name)] = ens_prec
f1_scores["{} f1 Score".format(model_name)]=ens_f1
accuracies["{} accuracy".format(model_name)]=ens_acc
print("F1 Score of {} is : {} \n".format(model_name,ens_f1),"Accuracy of {} is: {} \n".format(model_name,ens_acc), "Precision of {} is: {}".format(model_name,ens_prec))
    

F1 Score of ensemble model is : 0.9884275017018379 
 Accuracy of ensemble model is: 0.9796650717703349 
 Precision of ensemble model is: 0.9784366576819407


In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import  RandomForestClassifier

In [62]:
# Doing the same but adding Naive bayes and random forest classifier
nb_m=MultinomialNB()
rnd_m = RandomForestClassifier(random_state=1)

ens_m2 = VotingClassifier(estimators =[("ada",ada_m),("xgb",xgb_m),("cat",cat_m),("nb",nb_m),("rnd",rnd_m)],voting="hard")
ens_mod2 = ens_m2.fit(x_train,y_train)
ens_pred2 =ens_mod2.predict(x_test)
model_name2 = "ensemble model2"

ens_prec2 = precision_score(y_test,ens_pred2)
ens_f12 = f1_score(y_test,ens_pred2)
ens_acc2 = accuracy_score(y_test,ens_pred2)

Learning rate set to 0.018421
0:	learn: 0.6707747	total: 19.5ms	remaining: 19.4s
1:	learn: 0.6489146	total: 34.5ms	remaining: 17.2s
2:	learn: 0.6301760	total: 47.2ms	remaining: 15.7s
3:	learn: 0.6120115	total: 61.6ms	remaining: 15.3s
4:	learn: 0.5921755	total: 75.5ms	remaining: 15s
5:	learn: 0.5737187	total: 87.5ms	remaining: 14.5s
6:	learn: 0.5580802	total: 101ms	remaining: 14.3s
7:	learn: 0.5440548	total: 114ms	remaining: 14.1s
8:	learn: 0.5293128	total: 128ms	remaining: 14.1s
9:	learn: 0.5147388	total: 142ms	remaining: 14.1s
10:	learn: 0.4986351	total: 157ms	remaining: 14.1s
11:	learn: 0.4866808	total: 172ms	remaining: 14.1s
12:	learn: 0.4737143	total: 191ms	remaining: 14.5s
13:	learn: 0.4599686	total: 208ms	remaining: 14.6s
14:	learn: 0.4496341	total: 226ms	remaining: 14.9s
15:	learn: 0.4394598	total: 240ms	remaining: 14.8s
16:	learn: 0.4290957	total: 255ms	remaining: 14.7s
17:	learn: 0.4181380	total: 270ms	remaining: 14.7s
18:	learn: 0.4077533	total: 284ms	remaining: 14.7s
19:	lea

In [63]:
precisions["{} Precision".format(model_name2)] = ens_prec2
f1_scores["{} f1 Score".format(model_name2)]=ens_f12
accuracies["{} accuracy".format(model_name2)]=ens_acc2
print("F1 Score of {} is : {} \n".format(model_name2,ens_f12),"Accuracy of {} is: {} \n".format(model_name2,ens_acc2), "Precision of {} is: {}".format(model_name2,ens_prec2))
    

F1 Score of ensemble model2 is : 0.9914763041254688 
 Accuracy of ensemble model2 is: 0.9850478468899522 
 Precision of ensemble model2 is: 0.9830966869506423


### Metrics

In [67]:
precisions

{'Ada Boost Precision': 0.9756920999324781,
 'XG Boost Precision': 0.9817197020988491,
 'CatBoost Precision': 0.9784656796769852,
 'ensemble model Precision': 0.9784366576819407,
 'ensemble model2 Precision': 0.9830966869506423}

In [65]:
f1_scores

{'Ada Boost f1 Score': 0.9846678023850085,
 'XG Boost f1 Score': 0.9894234049812352,
 'CatBoost f1 Score': 0.9891156462585033,
 'ensemble model f1 Score': 0.9884275017018379,
 'ensemble model2 f1 Score': 0.9914763041254688}

In [66]:
accuracies

{'Ada Boost accuracy': 0.9730861244019139,
 'XG Boost accuracy': 0.9814593301435407,
 'CatBoost accuracy': 0.9808612440191388,
 'ensemble model accuracy': 0.9796650717703349,
 'ensemble model2 accuracy': 0.9850478468899522}