# Naive Bayes with Structure Data

In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import roc_auc_score

In [2]:
StructuralInfo=pd.read_csv("email_05_structure.csv",index_col=[0])
StructuralInfo.describe()

Unnamed: 0,multipart,html,links,attachments,spam
count,85203.0,85203.0,85203.0,85203.0,85203.0
mean,0.29006,0.315411,0.09103,0.020833,0.552704
std,0.453793,0.464682,0.287653,0.142825,0.497217
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,1.0
75%,1.0,1.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0


In [3]:
# Shuffle=True
x_train,x_test,y_train,y_test=train_test_split(StructuralInfo[["multipart","html","links","attachments"]],StructuralInfo["spam"],shuffle=True,random_state=42,test_size=0.15)

In [7]:
# Select the best hyper param alpha
alphalst=[0.1,1,10,100,1000,2000,5000,10000]
auclst=[]
accuracylst=[]
for a in alphalst:
    model=MultinomialNB(alpha=a)
    cross_result=cross_validate(model,x_train,y_train,scoring=["roc_auc","accuracy"],cv=5,return_train_score=True)
    auclst.append(cross_result['test_roc_auc'].mean())
    accuracylst.append(cross_result['test_accuracy'].mean())
    
print(auclst)
print("The best alpha is",alphalst[np.argmax(auclst)])
print("Accuracy with best auc", accuracylst[np.argmax(auclst)])

[0.718608227103376, 0.7185839725427419, 0.7187484177551858, 0.760741873081608, 0.7550991100532989, 0.7550991100532989, 0.7543732793498139, 0.753321292943202]
The best alpha is 100
Accuracy with best auc 0.5722432323165305


In [11]:
print("The 5-fold auc of NB is",np.max(auclst))
print("The 5-fold accuracy of NB is",accuracylst[np.argmax(auclst)])

model=MultinomialNB(alpha=np.argmax(auclst))
model.fit(x_train,y_train)
Y_pred=model.predict_proba(x_test)
print("auc on test set is",roc_auc_score(y_test,Y_pred[:,1]))
print("Accuracy on test set is",np.mean(model.predict(x_test)==y_test))

The 5-fold auc of NB is 0.760741873081608
The 5-fold accuracy of NB is 0.5722432323165305
Accuracy on test set is 0.5760112667240435
auc on test set is 0.7220113276110285


# Subject model

In [13]:
import pickle
import sys 
sys.path.append("../../")
from E4525_ML import text

In [14]:
def subject_model(feature="set"):
    data_dir = "subject_feature"

    set_features_filename=     data_dir+"/{}_features.p".format(feature)
    labels_filename=     data_dir+"/labels.p"

    X_set=pickle.load(open( set_features_filename, "rb" ) )
    labels=pickle.load(open( labels_filename, "rb" ) )
    
    x_train,x_test,y_train,y_test=train_test_split(X_set,labels,shuffle=True,random_state=42,test_size=0.15)
    
    # Select the best hyper param alpha
    alphalst=[1e-4,1e-3,0.1,1,10,100,1000]
    auclst=[]
    accuracylst=[]
    for a in alphalst:
        model=MultinomialNB(alpha=a)
        cross_result=cross_validate(model,x_train,y_train,scoring=["roc_auc","accuracy"],cv=5,return_train_score=True)
        auclst.append(cross_result['test_roc_auc'].mean())
        accuracylst.append(cross_result['test_accuracy'].mean())

    print("{} feature: ".format(feature))
    print(auclst)
    print("The best alpha is",alphalst[np.argmax(auclst)])
    
    print("The 5-fold auc of NB is",np.max(auclst))
    print("The 5-fold accuracy of NB is",accuracylst[np.argmax(auclst)])
    
    model=MultinomialNB(alpha=alphalst[np.argmax(auclst)])
    model.fit(x_train,y_train)
    Y_pred=model.predict_proba(x_test)
    print("auc on test set is",roc_auc_score(y_test,Y_pred[:,1]))
    print("Accuracy on test set is",np.mean(model.predict(x_test)==y_test))
    
    return x_train,x_test,y_train,y_test
    
print("----------------------------------------------------------------")
subject_model(feature="set")
print("----------------------------------------------------------------")
subject_model(feature="count")
print("----------------------------------------------------------------")
subject_model(feature="tfidf")
print("----------------------------------------------------------------")

----------------------------------------------------------------
set feature: 
[0.9810856485929806, 0.9816122907199849, 0.9817147393338596, 0.9783004250875891, 0.9693572686848277, 0.9513206042821425, 0.9328336217898358]
The best alpha is 0.1
The 5-fold auc of NB is 0.9817147393338596
The 5-fold accuracy of NB is 0.9135622667489163
auc on test set is 0.9847586092566298
Accuracy on test set is 0.9220718253657774
----------------------------------------------------------------
count feature: 
[0.9808092290035428, 0.9812768337227296, 0.9811498872180365, 0.977496247835927, 0.9683614896493522, 0.9498353382561708, 0.9311512917738287]
The best alpha is 0.001
The 5-fold auc of NB is 0.9812768337227296
The 5-fold accuracy of NB is 0.9136727277511033
auc on test set is 0.9846000581093861
Accuracy on test set is 0.9214458962522495
----------------------------------------------------------------
tfidf feature: 
[0.9816029535441164, 0.9823281792538315, 0.9833458708190094, 0.9798705210930401, 0.96959

The best feature is tf-idf feature, which achieves 5-fold auc $0.983$ and accuracy $0.919$ on validation set and $0.927$ on test set

# Message body model

In [15]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB,ComplementNB
from sklearn.model_selection import train_test_split, cross_validate

import pickle
import sys 
sys.path.append("../../")
from E4525_ML import text

In [16]:
def message_model(feature="set",n=-1,model_type="MB"):
    data_dir = "message_feature"

    set_features_filename=     data_dir+"/{}_features_{}.p".format(feature,-1)
    labels_filename=     data_dir+"/labels_{}.p".format(-1)

    X_set=pickle.load(open( set_features_filename, "rb" ) )
    labels=pickle.load(open( labels_filename, "rb" ) )
    
    if n!=-1:
        labels=labels[:n]
        X_set=X_set[:n]
    
    x_train,x_test,y_train,y_test=train_test_split(X_set,labels,shuffle=True,random_state=42,test_size=0.15)
    
    # Select the best hyper param alpha
    alphalst=[1e-4,1e-3,0.1,1,10,100,1000]
    auclst=[]
    accuracylst=[]
    for a in alphalst:
        if model_type=="MB":
            model=MultinomialNB(alpha=a)
        else: 
            model=ComplementNB(alpha=a)
        cross_result=cross_validate(model,x_train,y_train,scoring=["roc_auc","accuracy"],cv=5,return_train_score=True)
        auclst.append(cross_result['test_roc_auc'].mean())
        accuracylst.append(cross_result['test_accuracy'].mean())

    print("{} feature: ".format(feature))
    print(auclst)
    print("The best alpha is",alphalst[np.argmax(auclst)])
    
    print("The 5-fold auc of NB is",np.max(auclst))
    print("The 5-fold accuracy of NB is",accuracylst[np.argmax(auclst)])
    
    if model_type=="MB":
        model=MultinomialNB(alpha=alphalst[np.argmax(auclst)])
    else: 
        model=ComplementNB(alpha=alphalst[np.argmax(auclst)])
    
    model.fit(x_train,y_train)
    Y_pred=model.predict_proba(x_test)
    print("auc on test set is",roc_auc_score(y_test,Y_pred[:,1]))
    print("Accuracy on test set is",np.mean(model.predict(x_test)==y_test))
    
    return x_train,x_test,y_train,y_test
    
print("----------------------------------------------------------------")
message_model(feature="set",n=-1)
print("----------------------------------------------------------------")
message_model(feature="count",n=-1)
print("----------------------------------------------------------------")
message_model(feature="tfidf",n=-1)
print("----------------------------------------------------------------")

# print("----------------------------------------------------------------")
# message_model(feature="set",n=-1,model_type="C")
# print("----------------------------------------------------------------")
# message_model(feature="count",n=-1,model_type="C")
# print("----------------------------------------------------------------")
# message_model(feature="tfidf",n=-1,model_type="C")
# print("----------------------------------------------------------------")

----------------------------------------------------------------
set feature: 
[0.955784011939319, 0.9568049456963947, 0.9591795743011635, 0.9591151180480928, 0.9561650113596499, 0.937109939701348, 0.9104275272010037]
The best alpha is 0.1
The 5-fold auc of NB is 0.9591795743011635
The 5-fold accuracy of NB is 0.9320510566359299
auc on test set is 0.9616612936890916
Accuracy on test set is 0.9350598544714811
----------------------------------------------------------------
count feature: 
[0.9556406441423839, 0.9565569397574025, 0.9571296276895558, 0.9546999303157468, 0.9499863377864347, 0.9388687927155273, 0.910942509015902]
The best alpha is 0.1
The 5-fold auc of NB is 0.9571296276895558
The 5-fold accuracy of NB is 0.9170418035703782
auc on test set is 0.9595320915010088
Accuracy on test set is 0.9213676551130584
----------------------------------------------------------------
tfidf feature: 
[0.9625875419291923, 0.9633831935081518, 0.9640269845318444, 0.9608452690758984, 0.951290165

# Combined model (Combined features)

In [4]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB,ComplementNB
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import roc_auc_score

import pickle
import sys 
sys.path.append("../../")
from E4525_ML import text

In [13]:
from scipy.sparse import csr_matrix,hstack

def combined_model(feature="set",n=-1,model_type="MB"):
    
    # load message body features
    data_dir = "message_feature"

    set_features_filename=     data_dir+"/{}_features_{}.p".format(feature,-1)
    labels_filename=     data_dir+"/labels_{}.p".format(-1)

    X_set_body=pickle.load(open( set_features_filename, "rb" ) )
    labels=pickle.load(open( labels_filename, "rb" ) )
    
    if n!=-1:
        labels=labels[:n]
        X_set_body=X_set_body[:n]
    
    # load subject features
    data_dir = "subject_feature"

    set_vectorizer_filename=   data_dir+"/{}_vectorizer.p".format(feature)
    set_features_filename=     data_dir+"/{}_features.p".format(feature)
    labels_filename=     data_dir+"/labels.p"

    # setVectorizer_subject=pickle.load(open( set_vectorizer_filename, "rb" ) )
    X_set_subject=pickle.load(open( set_features_filename, "rb" ) )
    
    if n!=-1:
        X_set_subject=X_set_subject[:n]
    
    # load structural info features
    StructuralInfo=pd.read_csv("email_05_structure.csv",index_col=[0])
    StructuralInfo=StructuralInfo.iloc[:,:-1]
    if n!=-1:
        StructuralInfo=StructuralInfo.values[:n]
    
    StructuralInfo=csr_matrix(StructuralInfo)
    
    # print(StructuralInfo.shape,X_set_subject.shape,X_set_body.shape)
    
    # Combine features together
    X_set=hstack([X_set_body,X_set_subject,StructuralInfo])
    
    # split train and test data set
    x_train,x_test,y_train,y_test=train_test_split(X_set,labels,shuffle=True,random_state=42,test_size=0.15)
    
    # Select the best hyper param alpha
    alphalst=[1e-4,1e-3,0.1,1,10,100,1000]
    auclst=[]
    accuracylst=[]
    for a in alphalst:
        if model_type=="MB":
            model=MultinomialNB(alpha=a)
        else: 
            model=ComplementNB(alpha=a)
        cross_result=cross_validate(model,x_train,y_train,scoring=["roc_auc","accuracy"],cv=5,return_train_score=True)
        auclst.append(cross_result['test_roc_auc'].mean())
        accuracylst.append(cross_result['test_accuracy'].mean())

    print("{} feature: ".format(feature))
    print(auclst)
    print("The best alpha is",alphalst[np.argmax(auclst)])
    
    print("The 5-fold auc of NB is",np.max(auclst))
    print("The 5-fold accuracy of NB is",accuracylst[np.argmax(auclst)])
    
    if model_type=="MB":
        model=MultinomialNB(alpha=alphalst[np.argmax(auclst)])
    else: 
        model=ComplementNB(alpha=alphalst[np.argmax(auclst)])
    
    model.fit(x_train,y_train)
    with open("./models/CombinedFeatNB_{}.p".format(feature),"wb") as f:
        pickle.dump(model,f)
    
    Y_pred=model.predict_proba(x_test)
    print("auc on test set is",roc_auc_score(y_test,Y_pred[:,1]))
    print("Accuracy on test set is",np.mean(model.predict(x_test)==y_test))
    
    return x_train,x_test,y_train,y_test

print("----------------------------------------------------------------")
combined_model(feature="set",n=-1)
print("----------------------------------------------------------------")
combined_model(feature="count",n=-1)
print("----------------------------------------------------------------")
combined_model(feature="tfidf",n=-1)
print("----------------------------------------------------------------")

# print("----------------------------------------------------------------")
# combined_model(feature="set",n=-1,model_type="C")
# print("----------------------------------------------------------------")
# combined_model(feature="count",n=-1,model_type="C")
# print("----------------------------------------------------------------")
# combined_model(feature="tfidf",n=-1,model_type="C")
# print("----------------------------------------------------------------")

----------------------------------------------------------------
set feature: 
[0.9723815903506662, 0.9729562049451216, 0.9732859178771148, 0.9722975916478127, 0.9700469652884589, 0.955992611069787, 0.9366076400534583]
The best alpha is 0.1
The 5-fold auc of NB is 0.9732859178771148
The 5-fold accuracy of NB is 0.9590179920242417
auc on test set is 0.9749770113853832
Accuracy on test set is 0.9602535012909788
----------------------------------------------------------------
count feature: 
[0.9697414003905, 0.9699105986705948, 0.9689580569853729, 0.9662672532362905, 0.9624481506880231, 0.9537826237478744, 0.93299594675701]
The best alpha is 0.001
The 5-fold auc of NB is 0.9699105986705948
The 5-fold accuracy of NB is 0.9575405224976803
auc on test set is 0.9719918635949854
Accuracy on test set is 0.9596275721774509
----------------------------------------------------------------
tfidf feature: 
[0.9944147086455117, 0.995114061345826, 0.9957546133582774, 0.9948775522344503, 0.98881342002

# Combined model (Combined classifier)

In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.base import BaseEstimator
from sklearn.metrics import roc_auc_score

import pickle
import sys 
sys.path.append("../../")
from E4525_ML import text

In [2]:
from scipy.sparse import csr_matrix,hstack

class CombinedNB(BaseEstimator):
    def __init__(self,n1,n2,n3,a):
        self.n1=n1
        self.n2=n2
        self.n3=n3
        self.a=a
    
    def fit(self,X,y):
        self.model1=MultinomialNB(alpha=self.a)
        self.model1.fit(X[:,:self.n1],y)
        self.class_prior_=self.model1.class_count_/self.model1.class_count_.sum()
        
        self.model2=MultinomialNB(alpha=self.a)
        self.model2.fit(X[:,self.n1:(self.n1+self.n2)],y)
        self.model3=MultinomialNB(alpha=self.a)
        self.model3.fit(X[:,(self.n1+self.n2):(self.n1+self.n2+self.n3)],y)
        
    def predict_proba(self,X):
        p1=self.model1.predict_proba(X[:,:self.n1])
        p2=self.model2.predict_proba(X[:,self.n1:(self.n1+self.n2)])
        p3=self.model3.predict_proba(X[:,(self.n1+self.n2):(self.n1+self.n2+self.n3)])
        p=p1*p2*p3/(self.class_prior_)
        return p/(p.sum(axis=1)[:,np.newaxis])
    
    def predict(self,X):
        return np.argmax(self.predict_proba(X),axis=1)
    

def combined_model2(feature="set",n=-1):
    
    # load message body features
    data_dir = "message_feature"

    set_features_filename=     data_dir+"/{}_features_{}.p".format(feature,-1)
    labels_filename=     data_dir+"/labels_{}.p".format(-1)

    X_set_body=pickle.load(open( set_features_filename, "rb" ) )
    labels=pickle.load(open( labels_filename, "rb" ) )
    
    n_body=X_set_body.shape[1]
    
    if n!=-1:
        labels=labels[:n]
        X_set_body=X_set_body[:n]
    
    # load subject features
    data_dir = "subject_feature"

    set_features_filename=     data_dir+"/{}_features.p".format(feature)
    labels_filename=     data_dir+"/labels.p"

    X_set_subject=pickle.load(open( set_features_filename, "rb" ) )
    
    if n!=-1:
        X_set_subject=X_set_subject[:n]
        
    n_sub=X_set_subject.shape[1]
    
    # load structural info features
    StructuralInfo=pd.read_csv("email_05_structure.csv",index_col=[0])
    StructuralInfo=StructuralInfo.iloc[:,:-1]
    if n!=-1:
        StructuralInfo=StructuralInfo.values[:n]
    
    StructuralInfo=csr_matrix(StructuralInfo)
    
    n_structural=StructuralInfo.shape[1]
    
    print(n_body,n_sub,n_structural)
    
    # Combine features together
    X_set=hstack([X_set_body,X_set_subject,StructuralInfo])
    
    # split train and test data set
    x_train,x_test,y_train,y_test=train_test_split(X_set,labels,shuffle=True,random_state=42,test_size=0.15)
    
    # Select the best hyper param alpha
    alphalst=[1e-4,1e-3,0.1,1,10,100,1000]
    auclst=[]
    accuracylst=[]
    for a in alphalst:
        model=CombinedNB(n_body,n_sub,n_structural,a)
        cross_result=cross_validate(model,x_train,y_train,scoring=["roc_auc","accuracy"],cv=5,return_train_score=True)
        auclst.append(cross_result['test_roc_auc'].mean())
        accuracylst.append(cross_result['test_accuracy'].mean())

    print("{} feature: ".format(feature))
    print(auclst)
    print("The best alpha is",alphalst[np.argmax(auclst)])
    
    print("The 5-fold auc of NB is",np.max(auclst))
    print("The 5-fold accuracy of NB is",accuracylst[np.argmax(auclst)])
    
    model=CombinedNB(n_body,n_sub,n_structural,a=alphalst[np.argmax(auclst)])
    model.fit(x_train,y_train)
    
    with open("./models/CombinedProbNB_{}.p".format(feature),"wb") as f:
        pickle.dump(model,f)
        
    Y_pred=model.predict_proba(x_test)
    print("auc on test set is",roc_auc_score(y_test,Y_pred[:,1]))
    print("Accuracy on test set is",np.mean(model.predict(x_test)==y_test))
    
    return x_train,x_test,y_train,y_test

print("----------------------------------------------------------------")
combined_model2(feature="set",n=-1)
print("----------------------------------------------------------------")
combined_model2(feature="count",n=-1)
print("----------------------------------------------------------------")
combined_model2(feature="tfidf",n=-1)

----------------------------------------------------------------
575000 36750 4
set feature: 
[0.9720586658075551, 0.9726803519530713, 0.9725788432233811, 0.9711517052243034, 0.9679046551659253, 0.9519596815435761, 0.9280510762445692]
The best alpha is 0.001
The 5-fold auc of NB is 0.9726803519530713
The 5-fold accuracy of NB is 0.959252711882713
auc on test set is 0.9745628741523172
Accuracy on test set is 0.9618183240747985
----------------------------------------------------------------
575000 36750 4
count feature: 
[0.9694473479130193, 0.9695855284107399, 0.9682981739748577, 0.9652758984364425, 0.960652040819204, 0.9503543436372202, 0.9263706546402604]
The best alpha is 0.001
The 5-fold auc of NB is 0.9695855284107399
The 5-fold accuracy of NB is 0.9561320956255921
auc on test set is 0.971758171427846
Accuracy on test set is 0.9581409905328222
----------------------------------------------------------------
575116 36750 4
tfidf feature: 
[0.9931440950893163, 0.9937148828526888, 0.

(<72422x611870 sparse matrix of type '<class 'numpy.float64'>'
 	with 7510807 stored elements in Compressed Sparse Row format>,
 <12781x611870 sparse matrix of type '<class 'numpy.float64'>'
 	with 1341874 stored elements in Compressed Sparse Row format>,
 array([1, 0, 1, ..., 1, 1, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 1], dtype=int64))