In [13]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import string
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the pre-vectorized spam dataset from disk and preview its first five rows.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists.
data = pd.read_csv(filepath_or_buffer='E:/clients/terry/vetorized_spam.csv')
data.head(n=5)

Unnamed: 0,feat1,feat2,spam
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0


In [3]:
# Discard every row that contains at least one missing value.
data = data.dropna(axis="index", how="any")

In [4]:
# Separate the two feature columns from the spam label, then report both shapes.
x = data.loc[:, ['feat1', 'feat2']]
y = data['spam']
print(x.shape, y.shape, sep='\n')

(5664, 2)
(5664,)


## Split data into train and test sets, with shuffling

In [5]:
# Hold out 20% of the rows for testing; the shuffle is seeded so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    shuffle=True,
    random_state=104,
)
print(x_train.shape, y_train.shape, sep='\n')

(4531, 2)
(4531,)


## Define 3 classifiers 

### TF-IDF model

In [6]:
# x_train / x_test hold the numeric 'feat1' and 'feat2' columns, so TfidfVectorizer is
# the wrong tool here: it expects an iterable of raw text documents, and iterating a
# DataFrame yields its column labels. The original call therefore fitted on the two
# strings 'feat1' and 'feat2' instead of on the training rows.
# TfidfTransformer re-weights an existing numeric matrix row by row instead.
# NOTE(review): this assumes feat1/feat2 are non-negative term counts; if the CSV
# already stores TF-IDF weights this step is redundant — confirm with the data owner.
vectorization = TfidfTransformer()
xv_train = vectorization.fit_transform(x_train)  # learn IDF weights from the training rows only
xv_test = vectorization.transform(x_test)  # apply the training IDF weights to the test rows

### Multinomial Bayes Classifier 

In [16]:
# Create the multinomial naive Bayes classifier and fit it on the training split.
# The fit call must run here: the next cell calls bayes_classifier.predict(), which
# fails on an unfitted estimator when the notebook is run top to bottom on a fresh kernel.
bayes_classifier = MultinomialNB()
bayes_classifier.fit(x_train, y_train)


In [8]:
# Predict on the held-out test split with bayes_classifier and print the
# accuracy, the per-class report and the confusion matrix.
y_pred = bayes_classifier.predict(X=x_test)
print('Accuracy:', accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

Accuracy: 0.7537511032656664
              precision    recall  f1-score   support

         0.0       0.75      1.00      0.86       854
         1.0       0.00      0.00      0.00       279

    accuracy                           0.75      1133
   macro avg       0.38      0.50      0.43      1133
weighted avg       0.57      0.75      0.65      1133

[[854   0]
 [279   0]]


#### Decision tree

In [9]:
# Depth-limited (max_depth=4) decision tree using the Gini criterion.
clf = DecisionTreeClassifier(
    criterion="gini",
    max_depth=4,
    random_state=123,
)
clf.fit(X=x_train, y=y_train)
# The score on the held-out test split is the cell's displayed output.
clf.score(X=x_test, y=y_test)

0.7537511032656664

## 5-Fold cross validation 

#### Decision tree with 5-fold cross validation

In [26]:
# Entropy-criterion decision tree, depth-limited to 4.
tree_clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,
    random_state=0,
)
tree_clf.fit(X=x_train, y=y_train)

# 5-fold cross-validation on the training split: one score per fold.
accuracies = cross_val_score(tree_clf, x_train, y_train, cv=5)
mean_dt_e = accuracies.mean()
std_dt_e = accuracies.std()

# Report the mean and spread of the fold scores as percentages.
print('After 5 fold cross validation:')
print('Mean of Accuracies: ', mean_dt_e * 100)
print('Standard deviation of Accuracies', std_dt_e * 100)

After 5 fold cross validation:
Mean of Accuracies:  75.96557070223987
Standard deviation of Accuracies 0.04275028236622681


### Multinomial bayes with 5-fold cross validation 

In [27]:
# Multinomial naive Bayes model.
bayes_clf = MultinomialNB()
bayes_clf.fit(x_train, y_train)

# 5-fold cross-validation on the training split: one score per fold.
accuracies = cross_val_score(estimator=bayes_clf, X=x_train, y=y_train, cv=5)
# Store the results under naive-Bayes-specific names so they do not overwrite
# mean_dt_e / std_dt_e, which hold the decision tree's cross-validation results.
mean_nb = accuracies.mean()
std_nb = accuracies.std()

# Report the mean and spread of the fold scores as percentages.
print('After 5 fold cross validation:')
print('Mean of Accuracies: ', mean_nb * 100)
print('Standard deviation of Accuracies', std_nb * 100)


After 5 fold cross validation:
Mean of Accuracies:  75.96557070223987
Standard deviation of Accuracies 0.04275028236622681


Decision tree models show equal performance; therefore, we could use either of the models as our final model. The standard deviation of the accuracies is low for the models, and we conclude that the performance on unseen data is good.

## Use entire train data 

### decision tree on full data

In [23]:
# Decision tree using the entropy criterion (max_depth=4), fitted on the training split.
tree = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=4,
    random_state=123,
)
tree.fit(X=x_train, y=y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=123)

### naive bayes on full data

In [21]:
# Multinomial naive Bayes classifier fitted on the training split.
naive = MultinomialNB()
naive.fit(X=x_train, y=y_train)

MultinomialNB()

## Estimations 

### decision tree

In [24]:
# Predict labels for the held-out test split with the decision tree.
y_pred = tree.predict(X=x_test)

# Print the confusion matrix and the per-class report, then the overall accuracy.
print(
    'Test Output:',
    'Confusion Matrix:',
    confusion_matrix(y_test, y_pred),
    'Classification Report:',
    classification_report(y_test, y_pred),
    sep='\n',
)
print('Accuracy: ', accuracy_score(y_test, y_pred))

Test Output:
Confusion Matrix:
[[854   0]
 [279   0]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.75      1.00      0.86       854
         1.0       0.00      0.00      0.00       279

    accuracy                           0.75      1133
   macro avg       0.38      0.50      0.43      1133
weighted avg       0.57      0.75      0.65      1133

Accuracy:  0.7537511032656664


### Naive bayes report

In [25]:
# Predict labels for the held-out test split with the naive Bayes classifier.
y_pred = naive.predict(X=x_test)

# Print each metric under its own heading.
print('Test Output:')
print('Confusion Matrix:', confusion_matrix(y_true=y_test, y_pred=y_pred), sep='\n')
print('Classification Report:', classification_report(y_true=y_test, y_pred=y_pred), sep='\n')
print('Accuracy: ', accuracy_score(y_true=y_test, y_pred=y_pred))

Test Output:
Confusion Matrix:
[[854   0]
 [279   0]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.75      1.00      0.86       854
         1.0       0.00      0.00      0.00       279

    accuracy                           0.75      1133
   macro avg       0.38      0.50      0.43      1133
weighted avg       0.57      0.75      0.65      1133

Accuracy:  0.7537511032656664


## comment on the model performance

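The decision tree and multinomial naive Bayes models show equal performance on the data set. Since their accuracies on unseen data do not show large variance, either model could be used for spam detection. Note, however, that both confusion matrices above show every test sample predicted as class 0.0, so the 0.75 accuracy equals the share of class 0.0 in the test set (854 of 1133) and no class 1.0 sample is detected.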

## Reflection 
This spam classifier could be more effective if an automated data cleaning pipeline were put in place and hosted on a cloud platform, for instance Heroku. A new email would trigger the data pipeline; the cleaned email is then passed to the classifier in real time or near real time, and the output is sent to the appropriate category. Apart from multinomial naive Bayes and tree classifiers, boosted classifiers would improve the effectiveness of this solution.