In [1]:
import pandas as pd

data = pd.read_csv('spam_or_not_spam.csv')
data.head(5)

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


As we see data already lowercased, but not normalized (how to "threatens -> threaten"?) and has stop words like "a", "the", etc.

In [2]:
data = data.dropna() # we have only 1 nan row

In [3]:
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import sklearn
import numpy as np

sklearn.random.seed(42)
np.random.seed(42)

vectorizer1 = CountVectorizer(stop_words='english')
vectorizer2 = TfidfVectorizer(stop_words='english')

model1 = DecisionTreeClassifier(class_weight='balanced')
model2 = LogisticRegression(max_iter=1000, class_weight='balanced')
model3 = MultinomialNB()

vectorizers = [vectorizer1, vectorizer2]
models = [model1, model2, model3]


model_grids = {'DecisionTreeClassifier': {'ccp_alpha': [0, 0.5, 1]},
               'LogisticRegression': {'C': [1, 1.5, 2]},
               'MultinomialNB': {'alpha': [1, 1.5, 2]}}



In [4]:
def TestModelAndVectorizer(vectorizer, model):
    X = vectorizer.fit_transform(list(data['email'])).toarray() 
    y = np.array(data['label'].to_list())
    grid = model_grids[model.__class__.__name__]
    
    cvFold = RepeatedKFold(n_splits=3, n_repeats=3, random_state=1)
    gridSearch = GridSearchCV(estimator=model, param_grid=grid, n_jobs=3,
	cv=cvFold, scoring="f1")
    # use f1-score because of having unbalanced classes
    searchResults = gridSearch.fit(X, y)
    bestModel = searchResults.best_estimator_
    scores = cross_val_score(bestModel, X, y, cv=10, n_jobs=10, scoring='f1') 
    
    return bestModel, scores.mean(), scores.std()

In [5]:
def print_info(vectorizer, model, mean_acc, std):
    print(f"{model.__class__.__name__} - {vectorizer.__class__.__name__}| accuracy: {mean_acc} | std: {std}")

In [6]:
bestModel = None
bestScore = 0
for model in models:
    for vectorizer in vectorizers:
        model, score, std = TestModelAndVectorizer(vectorizer, model)
        print_info(model, vectorizer, score, std)
        if score > bestScore:
            bestModel = model
            bestScore = score

CountVectorizer - DecisionTreeClassifier| accuracy: 0.8113843101879391 | std: 0.0734539358983156
TfidfVectorizer - DecisionTreeClassifier| accuracy: 0.7981017653375025 | std: 0.09197458226015737
CountVectorizer - LogisticRegression| accuracy: 0.9577944756456105 | std: 0.05234733230790628
TfidfVectorizer - LogisticRegression| accuracy: 0.9412557724697919 | std: 0.03414918679302613
CountVectorizer - MultinomialNB| accuracy: 0.9467730629572838 | std: 0.030265750652902816
TfidfVectorizer - MultinomialNB| accuracy: 0.5441161507918728 | std: 0.10771507567246275


In [7]:
import joblib

joblib.dump(bestModel, 'bestModel.pkl', compress=3)

# model = joblib.load('bestModel.pkl')

['bestModel.pkl']