## Logistic regression – DDM & EDDM

### DDM
Link to the tool: https://github.com/scikit-multiflow/scikit-multiflow/blob/a7e316d/src/skmultiflow/drift_detection/ddm.py#L6
<br>
Documentation: https://scikit-multiflow.readthedocs.io/en/stable/api/generated/skmultiflow.drift_detection.DDM.html
<br>
Paper reference: https://link.springer.com/chapter/10.1007/978-3-540-28645-5_29

### EDDM
Link to the tool: https://github.com/scikit-multiflow/scikit-multiflow/blob/a7e316d/src/skmultiflow/drift_detection/eddm.py#L6
<br>
Documentation: https://scikit-multiflow.readthedocs.io/en/stable/api/generated/skmultiflow.drift_detection.EDDM.html#skmultiflow.drift_detection.EDDM
<br>
Paper reference: https://www.researchgate.net/profile/Albert-Bifet/publication/245999704_Early_Drift_Detection_Method/links/53e582cd0cf21cc29fd06017/Early-Drift-Detection-Method.pdf


In [None]:
!pip install scikit-multiflow

Collecting scikit-multiflow
  Downloading scikit_multiflow-0.5.3-cp37-cp37m-manylinux2010_x86_64.whl (1.1 MB)
[?25l[K     |▎                               | 10 kB 18.5 MB/s eta 0:00:01[K     |▋                               | 20 kB 22.6 MB/s eta 0:00:01[K     |▉                               | 30 kB 7.7 MB/s eta 0:00:01[K     |█▏                              | 40 kB 8.8 MB/s eta 0:00:01[K     |█▌                              | 51 kB 7.0 MB/s eta 0:00:01[K     |█▊                              | 61 kB 7.7 MB/s eta 0:00:01[K     |██                              | 71 kB 7.6 MB/s eta 0:00:01[K     |██▍                             | 81 kB 8.4 MB/s eta 0:00:01[K     |██▋                             | 92 kB 6.3 MB/s eta 0:00:01[K     |███                             | 102 kB 6.4 MB/s eta 0:00:01[K     |███▏                            | 112 kB 6.4 MB/s eta 0:00:01[K     |███▌                            | 122 kB 6.4 MB/s eta 0:00:01[K     |███▉                         

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
import math 
import csv
from skmultiflow.drift_detection.ddm import DDM
from google.colab import drive
# Mount Google Drive so the CSV files below are reachable from the runtime.
drive.mount('/content/drive', force_remount=True)

# Dataset selection: exactly one `url` assignment must be active.
# The target column used when splitting the data must match the chosen file.

# Variants stored directly under Dataset/
#url = '/content/drive/MyDrive/Dataset/FinalDataBM_bcp-1.csv'
#url = '/content/drive/MyDrive/Dataset/FinalDataGM_bcp-1.csv'
#url = '/content/drive/MyDrive/Dataset/FinalDataBM_gender-1.csv'
#url = '/content/drive/MyDrive/Dataset/FinalDataGM_gender-1.csv'
#url = '/content/drive/MyDrive/Dataset/FinalDataBM_sm-1.csv'
#url = '/content/drive/MyDrive/Dataset/FinalDataGM_sm-1.csv'

# Variants stored under Dataset/Dataset_without0/
#url = '/content/drive/MyDrive/Dataset/Dataset_without0/FinalDataBM_bcp.csv'
#url = '/content/drive/MyDrive/Dataset/Dataset_without0/FinalDataGM_bcp.csv'
#url = '/content/drive/MyDrive/Dataset/Dataset_without0/FinalDataBM_gender.csv'
#url = '/content/drive/MyDrive/Dataset/Dataset_without0/FinalDataGM_gender.csv'
#url = '/content/drive/MyDrive/Dataset/Dataset_without0/FinalDataBM_sm.csv'
url = '/content/drive/MyDrive/Dataset/Dataset_without0/FinalDataGM_sm.csv'

# The files are semicolon-separated.
data_frame = pd.read_csv(url, sep=';')

def clean_dataset(df):
    """Return a float64 copy of ``df`` without missing or infinite rows.

    Rows containing NaN are dropped first; any remaining row holding
    ``+inf`` or ``-inf`` in at least one column is then removed, and the
    result is cast to ``np.float64``.

    The function works on the frame it is given (not on a module-level
    variable) and leaves that frame unmodified.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. All columns must be convertible to float64.

    Returns
    -------
    pd.DataFrame
        New frame containing only the fully finite rows, as float64.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # dropna() returns a new frame, so the caller's object is not mutated.
    without_nan = df.dropna()
    # axis must be passed by keyword: the positional form is rejected by pandas 2.x.
    has_non_finite = without_nan.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return without_nan[~has_non_finite].astype(np.float64)

# Remove rows with missing or infinite values and cast everything to float64.
data_frame = clean_dataset(data_frame)

# Separate features from the target column.
# The target must correspond to the dataset file selected when loading.
#met_X, met_y = data_frame.drop(columns="bcp"), data_frame.bcp
#met_X, met_y = data_frame.drop(columns="gender"), data_frame.gender
met_X = data_frame.drop(columns="smoker")
met_y = data_frame.smoker

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
met_train_X, met_test_X, met_train_y, met_test_y = train_test_split(
    met_X,
    met_y,
    test_size=0.2,
    random_state=42,
)

# Build and train the classifier: impute -> one-hot encode -> logistic regression.
pipeline = make_pipeline(
    SimpleImputer(),
    OneHotEncoder(handle_unknown='ignore'),
    LogisticRegression(max_iter=5000),
)
pipeline.fit(met_train_X, met_train_y)

# Predictions on the held-out set are reused by the drift detectors further down.
predicted = pipeline.predict(met_test_X)

# Mean accuracy on the held-out set.
print(pipeline.score(met_test_X, met_test_y))

Mounted at /content/drive
0.7


In [None]:
ddm = DDM()

# DDM monitors the classifier's error rate, so each stream element must be a
# 0/1 error indicator (1 = misclassified), not the predicted label itself.
# Feeding raw labels outside {0, 1} makes the internal error probability
# leave [0, 1] and triggers an invalid-value warning in its sqrt.
ddm_errors = (np.asarray(predicted) != np.asarray(met_test_y)).astype(int)

# Adding stream elements to DDM and verifying if drift occurred
for i, error in enumerate(ddm_errors):
    ddm.add_element(int(error))
    if ddm.detected_warning_zone():
        print('Warning zone has been detected in data: ' + str(predicted[i]) + ' - of index: ' + str(i))
    if ddm.detected_change():
        print('Change detected in data: ' + str(predicted[i]) + ' - at index: ' + str(i))

  self.miss_std = np.sqrt(self.miss_prob * (1 - self.miss_prob) / float(self.sample_count))


In [None]:
# EDDM
from skmultiflow.drift_detection.eddm import EDDM
eddm = EDDM()

# EDDM tracks the distance between classification errors, so each stream
# element must be a 0/1 error indicator (1 = misclassified), not the
# predicted label itself.
eddm_errors = (np.asarray(predicted) != np.asarray(met_test_y)).astype(int)

# Adding stream elements to EDDM and verifying if drift occurred
for i, error in enumerate(eddm_errors):
    eddm.add_element(int(error))
    if eddm.detected_warning_zone():
        print('Warning zone has been detected in data: ' + str(predicted[i]) + ' - of index: ' + str(i))
    if eddm.detected_change():
        print('Change detected in data: ' + str(predicted[i]) + ' - at index: ' + str(i))