## Dataset loading and train/test split

In [1]:
import sklearn

from sklearn import datasets
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Load the e-mail corpus from "data/". load_files uses each sub-directory as one
# class; documents end up in .data and their integer labels in .target.
email_set = datasets.load_files("data/")

# Fraction of the corpus used for training; the remainder is held out for testing.
RATIO_TRAINING = 0.7
# Fixed seed so the shuffle, and therefore the reported scores, are reproducible
# after a kernel restart.
RANDOM_SEED = 42

# The training ratio is passed directly rather than test_size=(1-RATIO_TRAINING):
# 1 - 0.7 evaluates to 0.30000000000000004 and the test-set size is rounded up,
# so that float error can push one extra sample into the test set.
X_train, X_test, Y_train, Y_test = train_test_split(
    email_set.data,
    email_set.target,
    train_size=RATIO_TRAINING,
    shuffle=True,
    random_state=RANDOM_SEED,
)

## Classifier training

In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


# Two-stage text classifier: raw documents -> token counts -> multinomial Naive Bayes.
bayes_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('clf', MultinomialNB()),
    ]
)

# Train on the training split only.
bayes_clf.fit(X_train, Y_train)

# Predictions for the held-out split, scored in the next section.
bayes_predicted = bayes_clf.predict(X_test)

## Test scores

In [3]:
from sklearn.metrics import classification_report


# Title, separator rule and the per-class precision/recall/F1 table, one per line.
print(
    "Naive Bayes",
    "------------------------------------------------------------",
    classification_report(Y_test, bayes_predicted),
    sep="\n",
)

Naive Bayes
------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       146
           1       0.97      0.99      0.98       143

   micro avg       0.98      0.98      0.98       289
   macro avg       0.98      0.98      0.98       289
weighted avg       0.98      0.98      0.98       289



### Test inputs

In [4]:
# Hand-written samples: a few short phrases followed by full sentences.
test_data = [
    'language speaker',
    'have sex with ladies',
    'student historian',
    'explode',
    'Hi, the conference will be held in Bern tomorrow at 12 PM.',
    "Please find attached the project's specification. Call me if you need anything.",
    'You need to reset your account at "http://www.bank.russia.po.biz/useraccount"',
    'Earn 3841 dollars a month with this simple trick !',
    'Hot girls waiting for your meat scepter in your area',
    'Enlarge your webgl in 22 weeks with our new feature',
]

# Classify every sample with the fitted pipeline.
predicted = bayes_clf.predict(test_data)

# Print each sample (repr-quoted) next to the name of its predicted class.
for doc, category in zip(test_data, predicted):
    print(f'{doc!r} => {email_set.target_names[category]}')

'language speaker' => non-spam
'have sex with ladies' => spam
'student historian' => non-spam
'explode' => spam
'Hi, the conference will be held in Bern tomorrow at 12 PM.' => spam
"Please find attached the project's specification. Call me if you need anything." => spam
'You need to reset your account at "http://www.bank.russia.po.biz/useraccount"' => spam
'Earn 3841 dollars a month with this simple trick !' => spam
'Hot girls waiting for your meat scepter in your area' => spam
'Enlarge your webgl in 22 weeks with our new feature' => spam
