In [1]:
from sklearn import datasets
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn import pipeline
from sklearn import preprocessing


# Load the data: a feature matrix and the matching target vector
dataset = datasets.load_breast_cancer()
X, y = dataset.data, dataset.target

# Define the steps of the model: standardize the features, then fit a
# logistic regression on the scaled values
model = pipeline.Pipeline([
    ('scale', preprocessing.StandardScaler()),
    ('log_reg', linear_model.LogisticRegression(solver='lbfgs'))
])

# Define a deterministic cross-validation procedure (the fixed random_state
# makes the shuffled folds identical on every run)
cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)

# Compute the ROC AUC on each fold. `scorer` is reused further down to
# evaluate the river model with the same metric.
# NOTE(review): with its default arguments make_scorer is documented to score
# the output of `predict` (hard labels), not probabilities -- confirm this is
# the intended way to measure ROC AUC before comparing against scores
# computed from predicted probabilities.
scorer = metrics.make_scorer(metrics.roc_auc_score)
scores = model_selection.cross_val_score(model, X, y, scoring=scorer, cv=cv)

# Display the average score and its standard deviation
print(f'ROC AUC: {scores.mean():.3f} (± {scores.std():.3f})')

# Walk through the dataset one observation at a time, turning each row into a
# dict that maps feature names to values
for xi, yi in zip(X, y):
    xi = dict(zip(dataset.feature_names, xi))

# Show the last observation as an example of the dict representation
xi

ROC AUC: 0.975 (± 0.011)


{'mean radius': 7.76,
 'mean texture': 24.54,
 'mean perimeter': 47.92,
 'mean area': 181.0,
 'mean smoothness': 0.05263,
 'mean compactness': 0.04362,
 'mean concavity': 0.0,
 'mean concave points': 0.0,
 'mean symmetry': 0.1587,
 'mean fractal dimension': 0.05884,
 'radius error': 0.3857,
 'texture error': 1.428,
 'perimeter error': 2.548,
 'area error': 19.15,
 'smoothness error': 0.007189,
 'compactness error': 0.00466,
 'concavity error': 0.0,
 'concave points error': 0.0,
 'symmetry error': 0.02676,
 'fractal dimension error': 0.002783,
 'worst radius': 9.456,
 'worst texture': 30.37,
 'worst perimeter': 59.16,
 'worst area': 268.6,
 'worst smoothness': 0.08996,
 'worst compactness': 0.06444,
 'worst concavity': 0.0,
 'worst concave points': 0.0,
 'worst symmetry': 0.2871,
 'worst fractal dimension': 0.07039}

In [3]:
from river import stream

# Running statistics of the 'mean area' feature, refreshed after every sample
# with Welford's update: the mean is nudged towards the new value, and the sum
# of squared deviations grows by (x - previous mean) * (x - updated mean).
n = 0
mean = 0
sum_of_squares = 0
variance = 0

for n, (xi, yi) in enumerate(stream.iter_sklearn_dataset(datasets.load_breast_cancer()), start=1):
    area = xi['mean area']
    old_mean = mean
    mean = old_mean + (area - old_mean) / n
    sum_of_squares = sum_of_squares + (area - old_mean) * (area - mean)
    # Population variance (divides by n, not n - 1)
    variance = sum_of_squares / n

print(f'Running mean: {mean:.3f}')
print(f'Running variance: {variance:.3f}')

Running mean: 654.889
Running variance: 123625.903


In [4]:
import numpy as np

# Locate the 'mean area' column in the feature matrix
feature_names = list(dataset.feature_names)
i = feature_names.index('mean area')
mean_area = X[:, i]

# Statistics computed in one go over the whole column
print(f'True mean: {np.mean(mean_area):.3f}')
print(f'True variance: {np.var(mean_area):.3f}')

True mean: 654.889
True variance: 123625.903


In [5]:
from river import preprocessing

# Update river's StandardScaler with the dataset, one sample at a time
scaler = preprocessing.StandardScaler()
samples = stream.iter_sklearn_dataset(datasets.load_breast_cancer())

for xi, yi in samples:
    # Only the features are needed here; the target yi is ignored
    scaler.learn_one(xi)

In [6]:
from river import linear_model
from river import optim

# Online building blocks: a feature scaler and a logistic regression whose
# weights are updated by SGD
scaler = preprocessing.StandardScaler()
optimizer = optim.SGD(lr=0.01)
log_reg = linear_model.LogisticRegression(optimizer)

# Targets and predicted scores, collected in the order the samples are seen
y_true, y_pred = [], []

shuffled_samples = stream.iter_sklearn_dataset(
    datasets.load_breast_cancer(),
    shuffle=True,
    seed=42
)

for xi, yi in shuffled_samples:

    # Update the scaler with the sample, then standardize the sample
    scaler.learn_one(xi)
    xi_scaled = scaler.transform_one(xi)

    # Predict first: the model has not been trained on this sample yet
    yi_pred = log_reg.predict_proba_one(xi_scaled)
    # Only then let the model learn from it
    log_reg.learn_one(xi_scaled, yi)

    # Keep the target and the score assigned to the True class
    y_true.append(yi)
    y_pred.append(yi_pred[True])

print(f'ROC AUC: {metrics.roc_auc_score(y_true, y_pred):.3f}')

ROC AUC: 0.990


In [7]:
from river import compat
from river import compose

# Assemble a river Pipeline with the same two steps as the sklearn one:
# feature scaling followed by a logistic regression
river_steps = (
    ('scale', preprocessing.StandardScaler()),
    ('log_reg', linear_model.LogisticRegression())
)
model = compose.Pipeline(*river_steps)

# Wrap the river Pipeline so that sklearn utilities can drive it
model = compat.convert_river_to_sklearn(model)

# Cross-validate with the folds (cv) and the metric (scorer) defined for the
# sklearn model
scores = model_selection.cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring=scorer,
    cv=cv
)

# Display the average score and its standard deviation
print(f'ROC AUC: {scores.mean():.3f} (± {scores.std():.3f})')

ROC AUC: 0.964 (± 0.016)
