In [2]:
pip install river

Collecting river
  Downloading river-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting pandas<3.0.0,>=2.2.3 (from river)
  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<2.0.0,>=1.14.1 (from river)
  Downloading scipy-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Downloading river-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[2K   [90m━━━━━━━━━━━━━━━

In [3]:
import collections
from river import datasets
import pandas as pd
from river import stream
from river import compose
from river import metrics
from river import preprocessing
from river import linear_model
from river import optim
from river import imblearn



In [5]:
data = pd.read_csv('surge_data.csv')

In [6]:
X = data[['voltage', 'current', 'power_consumption']]
y = data['flag']

In [8]:
metric = metrics.ROCAUC()

In [9]:
confusion_matrix_metric = metrics.ConfusionMatrix()

## Model (Logistic Regression)

In [10]:
model = compose.Pipeline(
    ('scaler', preprocessing.StandardScaler()),
    ('classifier', linear_model.LogisticRegression())
)

In [11]:
for xi, yi in zip(X.to_dict(orient='records'), y):
    y_pred = model.predict_one(xi)
    model.learn_one(xi, yi)
    metric.update(yi, y_pred)
    confusion_matrix_metric.update(yi, y_pred)

In [12]:
print(f'Final accuracy: {metric}')
print("Confusion Matrix:")
print(confusion_matrix_metric)

Final accuracy: ROCAUC: 49.99%
Confusion Matrix:
      0.0     1.0  
0.0   9,930     1  
1.0      69     0  


## Importance weighting

In [13]:
model1 = compose.Pipeline(
    ('scaler', preprocessing.StandardScaler()),
    ('classifier', linear_model.LogisticRegression(loss=optim.losses.Log(weight_pos=5)))
)

In [17]:
metric = metrics.ROCAUC()

In [18]:
confusion_matrix_metric = metrics.ConfusionMatrix()

In [19]:
for xi, yi in zip(X.to_dict(orient='records'), y):
    y_pred = model1.predict_one(xi)
    model1.learn_one(xi, yi)
    metric.update(yi, y_pred)
    confusion_matrix_metric.update(yi, y_pred)

In [20]:
print(f'Final accuracy: {metric}')
print("Confusion Matrix:")
print(confusion_matrix_metric)

Final accuracy: ROCAUC: 85.48%
Confusion Matrix:
      0.0     1.0  
0.0   9,925     6  
1.0      20    49  


## Focal loss

In [21]:
model3 = compose.Pipeline(
    ('scaler', preprocessing.StandardScaler()),
    ('classifier', linear_model.LogisticRegression(loss=optim.losses.BinaryFocalLoss(2, 1)))
)

In [22]:
metric = metrics.ROCAUC()

In [23]:
confusion_matrix_metric = metrics.ConfusionMatrix()

In [24]:
for xi, yi in zip(X.to_dict(orient='records'), y):
    y_pred = model3.predict_one(xi)
    model3.learn_one(xi, yi)
    metric.update(yi, y_pred)
    confusion_matrix_metric.update(yi, y_pred)

In [25]:
print(f'Final accuracy: {metric}')
print("Confusion Matrix:")
print(confusion_matrix_metric)

Final accuracy: ROCAUC: 49.99%
Confusion Matrix:
      0.0     1.0  
0.0   9,930     1  
1.0      69     0  


## Under-sampling the majority class

In [26]:
model4 = (
    preprocessing.StandardScaler() |
    imblearn.RandomUnderSampler(
        classifier=linear_model.LogisticRegression(),
        desired_dist={0: .8, 1: .2},
        seed=42
    )
)


In [27]:
metric = metrics.ROCAUC()

In [28]:
confusion_matrix_metric = metrics.ConfusionMatrix()

In [29]:
for xi, yi in zip(X.to_dict(orient='records'), y):
    y_pred = model3.predict_one(xi)
    model3.learn_one(xi, yi)
    metric.update(yi, y_pred)
    confusion_matrix_metric.update(yi, y_pred)

In [30]:
print(f'Final accuracy: {metric}')
print("Confusion Matrix:")
print(confusion_matrix_metric)

Final accuracy: ROCAUC: 52.90%
Confusion Matrix:
      0.0     1.0  
0.0   9,931     0  
1.0      65     4  


## Over-sampling the minority class

In [31]:
model5 = (
    preprocessing.StandardScaler() |
    imblearn.RandomOverSampler(
        classifier=linear_model.LogisticRegression(),
        desired_dist={0: .8, 1: .2},
        seed=42
    )
)

In [32]:
metric = metrics.ROCAUC()

In [33]:
confusion_matrix_metric = metrics.ConfusionMatrix()

In [34]:
for xi, yi in zip(X.to_dict(orient='records'), y):
    y_pred = model5.predict_one(xi)
    model5.learn_one(xi, yi)
    metric.update(yi, y_pred)
    confusion_matrix_metric.update(yi, y_pred)

In [35]:
print(f'Final accuracy: {metric}')
print("Confusion Matrix:")
print(confusion_matrix_metric)

Final accuracy: ROCAUC: 97.38%
Confusion Matrix:
      0.0     1.0  
0.0   9,699   232  
1.0       2    67  


## Sampling with a desired sample size

In [36]:
model6 = (
    preprocessing.StandardScaler() |
    imblearn.RandomSampler(
        classifier=linear_model.LogisticRegression(),
        desired_dist={0: .8, 1: .2},
        sampling_rate=.01,
        seed=42
    )
)


In [37]:
metric = metrics.ROCAUC()

In [38]:
confusion_matrix_metric = metrics.ConfusionMatrix()

In [39]:
for xi, yi in zip(X.to_dict(orient='records'), y):
    y_pred = model6.predict_one(xi)
    model6.learn_one(xi, yi)
    metric.update(yi, y_pred)
    confusion_matrix_metric.update(yi, y_pred)

In [40]:
print(f'Final accuracy: {metric}')
print("Confusion Matrix:")
print(confusion_matrix_metric)

Final accuracy: ROCAUC: 92.08%
Confusion Matrix:
      0.0     1.0    
0.0   8,357   1,574  
1.0       0      69  


## Hybrid approach

In [41]:
model7 = (
    preprocessing.StandardScaler() |
    imblearn.RandomUnderSampler(
        classifier=linear_model.LogisticRegression(
            loss=optim.losses.Log(weight_pos=5)
        ),
        desired_dist={0: .8, 1: .2},
        seed=42
    )
)

In [42]:
metric = metrics.ROCAUC()

In [43]:
confusion_matrix_metric = metrics.ConfusionMatrix()

In [44]:
for xi, yi in zip(X.to_dict(orient='records'), y):
    y_pred = model6.predict_one(xi)
    model6.learn_one(xi, yi)
    metric.update(yi, y_pred)
    confusion_matrix_metric.update(yi, y_pred)

In [45]:
print(f'Final accuracy: {metric}')
print("Confusion Matrix:")
print(confusion_matrix_metric)

Final accuracy: ROCAUC: 93.76%
Confusion Matrix:
      0.0     1.0    
0.0   8,692   1,239  
1.0       0      69  
