In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import sklearn
import numpy as np
import joblib

In [2]:
data = pd.read_csv('spam_or_not_spam.csv')
data.head(5)

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


As we see data already lowercased, but not normalized (how to "threatens -> threaten"?) and has stop words like "a", "the", etc.

In [3]:
data = data.dropna() # we have only 1 nan row

In [4]:
sklearn.random.seed(42)
np.random.seed(42)

vectorizer1 = CountVectorizer(stop_words='english')
vectorizer2 = TfidfVectorizer(stop_words='english')

model1 = DecisionTreeClassifier(class_weight='balanced', random_state=42)
model2 = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
model3 = MultinomialNB()

vectorizers = [vectorizer1, vectorizer2]
models = [model1, model2, model3]


model_grids = {'DecisionTreeClassifier': {'ccp_alpha': [0, 0.5, 1]},
               'LogisticRegression': {'C': [1, 1.5, 2]},
               'MultinomialNB': {'alpha': [1, 1.5, 2]}}


In [5]:
def TestModelAndVectorizer(vectorizer, model):
    X = np.array(data['email'].to_list()) 
    y = np.array(data['label'].to_list())
    grid = model_grids[model.__class__.__name__]
    
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.33, random_state=42)
    
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    
    gridSearch = GridSearchCV(estimator=model, param_grid=grid, n_jobs=3,
	cv=3, scoring="f1")
    # use f1-score because of having unbalanced classes
    searchResults = gridSearch.fit(X_train, y_train)
    
    
    bestModel = searchResults.best_estimator_
    scores = cross_val_score(bestModel, X_test, y_test, cv=10, n_jobs=10, scoring='f1') 
    
    return bestModel, scores.mean(), scores.std()

In [6]:
def print_info(model, vectorizer, mean_acc, std):
    print(f"{model.__class__.__name__} - {vectorizer.__class__.__name__}| accuracy: {mean_acc} | std: {std}")

In [7]:
bestModel = None
bestScore = 0
for model in models:
    for vectorizer in vectorizers:
        model, score, std = TestModelAndVectorizer(vectorizer, model)
        print_info(model, vectorizer, score, std)
        if score > bestScore:
            bestModel = model
            bestScore = score

DecisionTreeClassifier - CountVectorizer| accuracy: 0.7798348390559193 | std: 0.06336291513788574
DecisionTreeClassifier - TfidfVectorizer| accuracy: 0.7061599960868266 | std: 0.07734290364046824
LogisticRegression - CountVectorizer| accuracy: 0.9460672392124005 | std: 0.042171396344804296
LogisticRegression - TfidfVectorizer| accuracy: 0.9124472116685688 | std: 0.04584759688061759
MultinomialNB - CountVectorizer| accuracy: 0.9071537819799778 | std: 0.06453416263085482
MultinomialNB - TfidfVectorizer| accuracy: 0.18871345029239767 | std: 0.0941088310249275


In [8]:
joblib.dump(bestModel, 'bestModel.pkl', compress=3)
# model = joblib.load('bestModel.pkl')

['bestModel.pkl']