In [1]:
from river import datasets

# Bare expression kept from the original cell. As written here it is not the
# last expression of the cell, so its value is discarded rather than displayed.
datasets.SMSSpam()

from pprint import pprint

# Fresh handle on the SMS spam stream; only its first (features, label) pair
# is inspected below.
X_y = datasets.SMSSpam()

first_pair = next(iter(X_y), None)
if first_pair is not None:
    x, y = first_pair
    pprint(x)
    print(f'Spam: {y}')
    
from river import feature_extraction
from river import naive_bayes
from river import metrics

# Baseline: TF-IDF over the 'body' field piped into a Bernoulli naive Bayes
# classifier, scored in a predict-then-learn loop over the stream.
X_y = datasets.SMSSpam()

tfidf = feature_extraction.TFIDF(on='body')
bernoulli_nb = naive_bayes.BernoulliNB(alpha=0)
model = tfidf | bernoulli_nb

metric = metrics.ROCAUC()
cm = metrics.ConfusionMatrix()  # accumulated below but not displayed in this cell

# NOTE(review): ROCAUC is fed the output of predict_one here; confirm whether
# scoring on predict_proba_one probabilities was intended instead.
for x, y in X_y:
    # Predict before learning so each sample is scored before the model sees it.
    y_pred = model.predict_one(x)

    # Samples for which the model returns no prediction are not scored.
    if y_pred is not None:
        for tracker in (metric, cm):
            tracker.update(y_pred=y_pred, y_true=y)

    model.learn_one(x, y)

metric

Downloading https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Uncompressing into /home/jbris/river_data/SMSSpam
{'body': 'Go until jurong point, crazy.. Available only in bugis n great world '
         'la e buffet... Cine there got amore wat...\n'}
Spam: False


ROCAUC: 93.00%

In [2]:
from river import imblearn

# Same TF-IDF + Bernoulli naive Bayes setup, but the classifier is wrapped in
# a seeded RandomUnderSampler that requests equal weight for both classes.
X_y = datasets.SMSSpam()

# NOTE(review): the stream's labels print as booleans while these keys are
# 0/1; this presumably relies on 0 == False and 1 == True -- confirm.
balanced_dist = {0: .5, 1: .5}

undersampled_nb = imblearn.RandomUnderSampler(
    classifier=naive_bayes.BernoulliNB(alpha=0),
    desired_dist=balanced_dist,
    seed=42,
)
model = feature_extraction.TFIDF(on='body') | undersampled_nb

metric = metrics.ROCAUC()
cm = metrics.ConfusionMatrix()  # accumulated below but not displayed in this cell

for x, y in X_y:
    # Predict before learning so each sample is scored before the model sees it.
    y_pred = model.predict_one(x)

    # Samples for which the model returns no prediction are not scored.
    if y_pred is not None:
        for tracker in (metric, cm):
            tracker.update(y_pred=y_pred, y_true=y)

    model.learn_one(x, y)

metric



ROCAUC: 94.61%

In [3]:
from river import linear_model
from river import optim
from river import preprocessing

# TF-IDF features are passed through a Normalizer and then into a logistic
# regression (SGD optimizer, log loss) wrapped in a seeded RandomUnderSampler.
X_y = datasets.SMSSpam()

log_reg = linear_model.LogisticRegression(
    optimizer=optim.SGD(.9),
    loss=optim.losses.Log(),
)

# NOTE(review): the stream's labels print as booleans while these keys are
# 0/1; this presumably relies on 0 == False and 1 == True -- confirm.
undersampled_log_reg = imblearn.RandomUnderSampler(
    classifier=log_reg,
    desired_dist={0: .5, 1: .5},
    seed=42,
)

model = (
    feature_extraction.TFIDF(on='body')
    | preprocessing.Normalizer()
    | undersampled_log_reg
)

metric = metrics.ROCAUC()
cm = metrics.ConfusionMatrix()  # accumulated below but not displayed in this cell

for x, y in X_y:
    # Predict before learning so each sample is scored before the model sees it.
    y_pred = model.predict_one(x)

    # Every prediction is scored unconditionally in this loop.
    for tracker in (metric, cm):
        tracker.update(y_pred=y_pred, y_true=y)

    model.learn_one(x, y)

metric

ROCAUC: 93.80%