In [11]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn import tree



In [12]:
dataset =  pd.read_csv("spam_ham_dataset.csv", usecols=[1,2,3])
dataset['label_num'] = [1 if label == 'ham' else 0 for label in dataset['label'] ]
dataset.head()

train_df = dataset[:1723]
test_df  = dataset[1723:3446]
valid_df = dataset[3446:5169]
print(train_df.shape, test_df.shape, valid_df.shape)
train_df.head()

(1723, 3) (1723, 3) (1723, 3)


Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,1
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",1
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",1
3,spam,"Subject: photoshop , windows , office . cheap ...",0
4,ham,Subject: re : indian springs\r\nthis deal is t...,1


In [13]:
test_df.head()

Unnamed: 0,label,text,label_num
1723,ham,Subject: nomination change on tejas effective ...,1
1724,ham,Subject: re : ferc training\r\nplease see the ...,1
1725,ham,Subject: southern union for may\r\n2100 83 st\...,1
1726,ham,"Subject: hpl nom for august 24 , 2000\r\n( see...",1
1727,spam,Subject: \r\ndiscount meds right from home\r\n...,0


In [14]:
valid_df.head()

Unnamed: 0,label,text,label_num
3446,ham,"Subject: buyback deals expire\r\ndaren ,\r\nju...",1
3447,spam,Subject: re : no more injections\r\n,0
3448,ham,"Subject: tri - c resources , governor bill dan...",1
3449,ham,"Subject: enron / hpl actuals for february 21 ,...",1
3450,ham,Subject: bridge errors\r\nkeep an eye on the b...,1


In [15]:
def spam_detector(train_df, valid_df, test_df):
    
    
    #Compute TFIFD
    corpus = train_df["text"]
    vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1,2),max_features=36044) #initializing the vectorizer
    vectors = vectorizer.fit_transform(corpus)
    
    #EXTRACT FEATURE AND TARGET VARIABLES 
    X_train = vectors
    y_train = train_df['label_num']
    
    
    models = {"LR":LogisticRegression(random_state = 0), 
              "MNB":MultinomialNB(),
              "DTC":tree.DecisionTreeClassifier(random_state=0),
              "LSVM": svm.SVC(kernel='linear')
             }
    

    datasets = {
                "Valid":valid_df,
                "Test" :test_df
               }
    m= []
    pscores = []
    cms = []
    for key, model  in models.items():
        for key2, dataset in datasets.items(): 
            documents = dataset['text']
            X = vectorizer.fit_transform(documents) 
            y_true = dataset['label_num']
            model.fit(X_train,y_train)
            y_pred = model.predict(X)
            cm = confusion_matrix(y_true, y_pred)
            pscore = precision_score(y_true, y_pred)
            #create dictionary
            m.append(key)
            pscores.append(pscore)
            cms.append(cm)
            

    return (m, pscores, cms)


m, pscores, cms = spam_detector(train_df, valid_df, test_df)


In [16]:
def build_dictionary(m,cms,pscores):
    i = 0
    result = {}
    
    best_model = ''
    pscores.sort()
    highest = pscores[0]
    
    for model in set(m):
        name = model
        valid_cm = cms[i]
        test_cm = cms[i+1]
        valid_pscore = pscores[i]
        test_pscore = pscores[i+1]
        i+=2
        result[name] = {"Valid CM":valid_cm, "Test CM": test_cm,
                        "Valid Score": valid_pscore, "Test Score": test_pscore
                       }
        if valid_pscore or test_pscore == highest:
            best_model = name
            
            
    return result,best_model

In [10]:
dic, best_model  = build_dictionary(m,cms,pscores)

df = pd.DataFrame.from_dict(dic, orient='index').reset_index()
df.rename(columns={'index':'Model'}, inplace = True)
df["Best Classifier"] = best_model  
df

Unnamed: 0,Model,Valid CM,Test CM,Valid Score,Test Score,Best Classifier
0,MNB,"[[7, 536], [15, 1165]]","[[14, 456], [3, 1250]]",0.684891,0.719093,DTC
1,LSVM,"[[159, 384], [197, 983]]","[[150, 320], [107, 1146]]",0.728643,0.732708,DTC
2,LR,"[[489, 54], [1035, 145]]","[[424, 46], [917, 336]]",0.781719,0.786517,DTC
3,DTC,"[[486, 57], [970, 210]]","[[405, 65], [881, 372]]",0.851259,0.879581,DTC
