In [1]:
from sklearn.naive_bayes import *
from sklearn.dummy import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.calibration import *
from sklearn.linear_model import *
from sklearn.multiclass import *
from sklearn.svm import *
import pandas

In [3]:
def perform(classifiers, vectorizers, train_data, test_data):
    """Fit and score every classifier/vectorizer pairing, printing one line each.

    Each vectorizer is fitted on the training text and applied to the test
    text exactly once; the resulting feature matrices are then reused for
    every classifier, instead of being recomputed inside the classifier loop.

    Args:
        classifiers: iterable of estimators exposing ``fit(X, y)`` and
            ``score(X, y)``.
        vectorizers: iterable of text vectorizers exposing
            ``fit_transform(texts)`` and ``transform(texts)``.
        train_data: object whose ``v2`` attribute holds the training texts
            and whose ``v1`` attribute holds the matching labels.
        test_data: object with the same ``v1``/``v2`` attributes, used for
            scoring.

    Returns:
        None. One line per pairing is printed, in classifier-major order, as
        ``'<Classifier> with <Vectorizer>. Has score: <score>'``.
    """
    # Vectorization does not depend on the classifier, so do it up front:
    # one fit_transform/transform per vectorizer rather than one per pairing.
    vectorized = []
    for vectorizer in vectorizers:
        train_features = vectorizer.fit_transform(train_data.v2)
        test_features = vectorizer.transform(test_data.v2)
        vectorized.append(
            (vectorizer.__class__.__name__, train_features, test_features)
        )

    for classifier in classifiers:
        classifier_name = classifier.__class__.__name__
        for vectorizer_name, train_features, test_features in vectorized:
            # Train on the cached training matrix, score on the cached test matrix.
            classifier.fit(train_features, train_data.v1)
            score = classifier.score(test_features, test_data.v1)
            # !s keeps the original str(score) rendering of the score.
            print(f'{classifier_name} with {vectorizer_name}. Has score: {score!s}')

# Read the whole data-set from spam.csv (decoded as latin-1)
data = pandas.read_csv('spam.csv', encoding='latin-1')

# Sequential 70:30 partition: the leading rows train, the trailing rows test.
# No shuffling is applied, so the split follows the file's row order.
train_size = int(len(data) * 0.7)
train_data = data.iloc[:train_size]
test_data = data.iloc[train_size:]

# Estimators to compare, each with the arguments shown and nothing else.
# NOTE(review): no random_state is passed to any of them, so scores of the
# randomized estimators may differ between runs - confirm whether that is intended.
candidate_classifiers = [
    BernoulliNB(),
    RandomForestClassifier(n_estimators=100, n_jobs=-1),
    AdaBoostClassifier(),
    BaggingClassifier(),
    ExtraTreesClassifier(),
    GradientBoostingClassifier(),
    DecisionTreeClassifier(),
    CalibratedClassifierCV(),
    DummyClassifier(),
    PassiveAggressiveClassifier(),
    RidgeClassifier(),
    RidgeClassifierCV(),
    SGDClassifier(),
    OneVsRestClassifier(SVC(kernel='linear')),
    OneVsRestClassifier(LogisticRegression()),
    KNeighborsClassifier(),
]

# Text featurizers, each tried with every estimator above
candidate_vectorizers = [
    CountVectorizer(),
    TfidfVectorizer(),
    HashingVectorizer(),
]

# Score every estimator/featurizer pairing on the held-out rows
perform(candidate_classifiers, candidate_vectorizers, train_data, test_data)

BernoulliNB with CountVectorizer. Has score: 0.9760765550239234
BernoulliNB with TfidfVectorizer. Has score: 0.9760765550239234
BernoulliNB with HashingVectorizer. Has score: 0.8636363636363636
RandomForestClassifier with CountVectorizer. Has score: 0.972488038277512
RandomForestClassifier with TfidfVectorizer. Has score: 0.9712918660287081
RandomForestClassifier with HashingVectorizer. Has score: 0.9647129186602871
AdaBoostClassifier with CountVectorizer. Has score: 0.9198564593301436
AdaBoostClassifier with TfidfVectorizer. Has score: 0.9419856459330144
AdaBoostClassifier with HashingVectorizer. Has score: 0.9473684210526315
BaggingClassifier with CountVectorizer. Has score: 0.958732057416268
BaggingClassifier with TfidfVectorizer. Has score: 0.9641148325358851
BaggingClassifier with HashingVectorizer. Has score: 0.9688995215311005
ExtraTreesClassifier with CountVectorizer. Has score: 0.9778708133971292
ExtraTreesClassifier with TfidfVectorizer. Has score: 0.9754784688995215
ExtraTre