In [None]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
from itertools import compress
import random
from random import sample
import pickle
import autosklearn.classification
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.metrics
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_recall_fscore_support
import glob
from time import time
from sklearn.model_selection import train_test_split
from skmultiflow.meta import AdaptiveRandomForestClassifier,LearnPPNSEClassifier
from skmultiflow.drift_detection import EDDM, HDDM_A
import math

# Seeded NumPy random generator (seed 4711); nothing in the visible part of
# this file references it.
rng = np.random.default_rng(4711)

# Artifact


1.   Artifact with Classifier (`Artifact`)
2.   Pure Detector (`detectorEnsemble`)



In [None]:
class Artifact():
  """Classifier wrapper that watches its own error stream for drift.

  Every sample passed to `add_data` is buffered, predicted with the wrapped
  model, and the absolute prediction error is fed to an EDDM and an HDDM_A
  drift detector. Once either detector has signalled a change and at least
  `batch_size` samples were seen, the training corpus is re-selected via
  `set_training_corpus` and the model is reset and refitted on it.

  The input frames are expected to carry the columns `Text` (document text)
  and `Abgelehnt` (label; the model is refitted with classes `[0, 1]`).
  """

  def __init__(self, training_data, model, batch_size = 500):
    """Set up detectors, fit the TF-IDF vectorizer and the base corpus.

    Args:
      training_data: DataFrame with `Text` and `Abgelehnt` columns.
      model: classifier exposing `predict`, `reset` and `partial_fit`.
      batch_size: minimum number of samples between two refits.
    """
    self.eddm = EDDM()
    self.hddm = HDDM_A()
    self.training_data = training_data
    self.model = model
    self.training_size = len(training_data)
    self.corpus_save_DataFrame = pd.DataFrame()
    self.inner_PWI = None
    self.general_corpus = None
    self.data_buffer = training_data
    self.batch_size = batch_size
    self.batch_len = 0
    self.test_corpora = 2
    self.change_detected = False

    # vectorizer definition; the context manager closes the stop-word file
    with open("/LanguageDriftDetection/Data/Stopwords/stopwords-de.csv",
              "r", encoding="utf-8") as german_stop_words_import:
      german_stop_words = pd.read_csv(german_stop_words_import,
                                      names=['stopwords'])['stopwords'].to_list()
    self.vectorizer = TfidfVectorizer(stop_words=german_stop_words,
                                      ngram_range=(1,2), max_features=3000)
    self.vectorizer = self.vectorizer.fit(training_data.Text)

    self.set_training_corpus(training_data)


  def predict(self, x):
    """Return the model's prediction for one sample.

    Args:
      x: raw text (vectorized here) or an already vectorized 2-D array.

    Returns:
      The first element of `self.model.predict(x)`.
    """
    if isinstance(x, str):
      # must use the instance's fitted vectorizer; there is no module-level one
      x = self.vectorizer.transform([x]).toarray()

    return self.model.predict(x)[0]

  def add_data(self, x, y):
    """Buffer one labelled sample, update the detectors, refit if due.

    Args:
      x: raw text of the sample.
      y: true label of the sample.
    """
    new_row = pd.DataFrame(data={'Text':[x], 'Abgelehnt':[y]})
    self.data_buffer = pd.concat([self.data_buffer, new_row], ignore_index=True)
    self.batch_len += 1
    y_pred = self.predict(x)
    error = abs(y_pred - y)

    self.eddm.add_element(error)
    self.hddm.add_element(error)

    # the flag is sticky until the next refit
    if not self.change_detected and (self.eddm.detected_change() or self.hddm.detected_change()):
      self.change_detected = True

    if self.batch_len >= self.batch_size and self.change_detected:
      print('Refitting a batch of size ', self.batch_len)
      self.set_training_corpus(self.data_buffer)
      # restart the buffer from the (possibly re-selected) training data
      self.data_buffer = self.training_data
      x_train = self.vectorizer.fit_transform(self.training_data.Text).toarray()
      self.model.reset()
      self.model.partial_fit(x_train, self.training_data.Abgelehnt, [0,1])
      self.batch_len = 0
      self.change_detected = False


  #  sets the general corpus for the WI and calculates Inner WI for thresholding
  def set_training_corpus(self, corpus):
    """Initialise or update the general corpus and the inner PWI threshold.

    On the first call the given corpus becomes the saved base corpus. On later
    calls the saved base corpus is shuffled and split into `test_corpora`
    chunks; a chunk whose PWI is below the current threshold replaces the
    training data and lowers the threshold.

    Args:
      corpus: DataFrame with `Text` and `Abgelehnt` columns.
    """
    # initial setting of base corpus and PWI
    if self.corpus_save_DataFrame.empty:
      self.corpus_save_DataFrame = corpus
      self.test_corpora = int(len(self.corpus_save_DataFrame)/self.batch_size)
      self.general_corpus = self.splitPivotText(corpus)
      self.inner_PWI = self.calcPWI(corpus)
      return

    # updating base corpus with new training data
    if self.test_corpora <= 1:
      self.test_corpora = 2

    # adding new Corpus to Base Corpus to create Baseline
    # if the new data set is closer to the dictionary than the existing one no new data will be added to the training set to avoid overfitting
    compare_PWI = self.calcPWI(pd.concat([self.corpus_save_DataFrame, corpus]))
    if compare_PWI > self.inner_PWI:
      self.inner_PWI = compare_PWI
    chunks = np.array_split(self.corpus_save_DataFrame.sample(frac=1), self.test_corpora)

    for chunk in chunks:
      chunk_with_new = pd.concat([chunk, corpus])
      test_corpus = self.splitPivotText(chunk_with_new)

      # NOTE(review): the PWI is computed on the chunk alone while the stored
      # general corpus also contains `corpus` - confirm this is intended.
      test_PWI = self.calcPWI(chunk)

      if test_PWI < self.inner_PWI:
        print('New General Corpus set. Inner PWI at: ', test_PWI)
        self.general_corpus = test_corpus
        self.training_data = chunk.reset_index(drop=True)
        self.inner_PWI = test_PWI

  def calcPWI(self, corpus):
    """Return the mean per-word `WI_Time` between two halves of `corpus`.

    The corpus is split 50/50 with a fixed random state, both halves are
    turned into word-count tables, and the per-word values from
    `calcPWIFrame` are averaged.
    """
    general_Corpus, specialized_Corpus = train_test_split(corpus, test_size=0.5, random_state=4711)
    pwi_frame = self.calcPWIFrame(self.splitPivotText(general_Corpus), self.splitPivotText(specialized_Corpus))
    pwi = pwi_frame.WI_Time.sum() / pwi_frame.WI_Time.count()

    return pwi

  def splitPivotText(self, df):
    """Build a per-word count table from a labelled text frame.

    Args:
      df: DataFrame with `Text` and `Abgelehnt` columns.

    Returns:
      DataFrame with one row per distinct whitespace-separated token
      (`value`) and the number of its occurrences in documents labelled 1
      (`Removed`) and labelled 0 (`not_Removed`).
    """
    functionDF = df.copy()
    functionDF['Text'] = functionDF['Text'].astype(str)
    functionDF['Text'] = functionDF['Text'].apply(lambda x: x.split())
    splitFrame = pd.DataFrame(functionDF['Text'].to_list())
    splitFrame['Label'] = functionDF.reset_index().Abgelehnt

    # create one column with every word and their occurence with labels
    vocab = splitFrame.melt(id_vars='Label').drop(columns=['variable'])
    # turns the label column into two separate indicator columns and aggregates them
    vocab['Removed'] = np.where(vocab['Label'] == 1, 1, 0)
    # `== np.nan` is always False, which counted every token as not removed;
    # count only tokens from documents labelled 0
    vocab['not_Removed'] = np.where(vocab['Label'] == 0, 1, 0)
    vocab = vocab.drop(columns=['Label'])

    vocab['Removed'] = vocab.groupby('value')['Removed'].transform('sum')
    vocab['not_Removed'] = vocab.groupby('value')['not_Removed'].transform('sum')
    returnDF = vocab.drop_duplicates(subset=['value'])

    # shorter documents are padded with missing values by the DataFrame constructor
    returnDF = returnDF[returnDF['value'].notnull()]

    return returnDF



  def calcPWIFrame(self, generalCorpus, specializedCorpus):
    """Compute the per-word `WI_Time` of a specialized vs. a general corpus.

    Args:
      generalCorpus: word-count table as returned by `splitPivotText`.
      specializedCorpus: word-count table as returned by `splitPivotText`.

    Returns:
      The specialized table left-joined with the general one (missing counts
      filled with 0) plus a `WI_Time` column.
    """
    mergedPivot = specializedCorpus.set_index('value').join(generalCorpus.set_index('value'), how='left', lsuffix='_specialized', rsuffix='_corpus').fillna(0).reset_index()
    # total token counts of each corpus (both label classes)
    ts_time = specializedCorpus.Removed.sum() + specializedCorpus.not_Removed.sum()
    tg_time = generalCorpus.Removed.sum() + generalCorpus.not_Removed.sum()

    mergedPivot['WI_Time'] = mergedPivot.apply(lambda x: self.calculate_polarized_pwi(x['Removed_specialized'] + x['not_Removed_specialized'], x['Removed_corpus'] + x['not_Removed_corpus'], ts_time, tg_time),axis=1)

    return mergedPivot



  def calculate_polarized_pwi(self, ws, wg, ts, tg):
    """Return the ratio of relative word frequencies for one word.

    Args:
      ws: occurrences of the word in the specialized corpus.
      wg: occurrences of the word in the general corpus.
      ts: total token count of the specialized corpus.
      tg: total token count of the general corpus.

    Returns:
      `(ws/ts)/(wg/tg)` when the word occurs in both corpora, `ws/ts` when it
      occurs only in the specialized one, otherwise 0 (NaN is mapped to 0).
    """
    if ws > 0 and wg > 0:
      pwi = (ws/ts)/(wg/tg)

    elif ws > 0:
      pwi = ws/ts

    elif wg > 0:
      # ws is not positive here, so this evaluates to 0 for ws == 0
      pwi = ws / (wg/tg)

    else:
      pwi = 0

    if np.isnan(pwi):
      pwi = 0

    return pwi

# Detector

In [None]:
class detectorEnsemble():
  """Drift detector that is fed prediction errors computed by the caller.

  Each call to `add_data` passes one error value to an EDDM and an HDDM_A
  drift detector. Once either detector has signalled a change and at least
  `batch_size` errors were seen, the training corpus is re-selected via
  `set_training_corpus`, the vectorizer is refitted on it, and `add_data`
  returns True so the caller can retrain its own model.

  The input frames are expected to carry the columns `Text` (document text)
  and `Abgelehnt` (label). `model` is stored but not used by this class.
  """

  def __init__(self, training_data, model, batch_size = 500):
    """Set up detectors, fit the TF-IDF vectorizer and the base corpus.

    Args:
      training_data: DataFrame with `Text` and `Abgelehnt` columns.
      model: stored on the instance as `self.model`.
      batch_size: minimum number of samples between two refits.
    """
    self.eddm = EDDM()
    self.hddm = HDDM_A()
    self.training_data = training_data
    self.model = model
    self.training_size = len(training_data)
    self.corpus_save_DataFrame = pd.DataFrame()
    self.inner_PWI = None
    self.general_corpus = None
    self.data_buffer = training_data
    self.batch_size = batch_size
    self.batch_len = 0
    self.test_corpora = 2
    self.change_detected = False

    # vectorizer definition; the context manager closes the stop-word file
    with open("/LanguageDriftDetection/Data/Stopwords/stopwords-de.csv",
              "r", encoding="utf-8") as german_stop_words_import:
      german_stop_words = pd.read_csv(german_stop_words_import,
                                      names=['stopwords'])['stopwords'].to_list()
    self.vectorizer = TfidfVectorizer(stop_words=german_stop_words,
                                      ngram_range=(1,2), max_features=3000)
    self.vectorizer = self.vectorizer.fit(training_data.Text)

    self.set_training_corpus(training_data)


  def add_data(self, error, x=None, y=None):
    """Record one prediction error and report whether a refit was triggered.

    Args:
      error: prediction error of the sample, passed to both detectors.
      x: optional raw text of the sample; buffered together with `y` when
        given so it can enter the corpus on the next refit.
      y: optional true label of the sample.

    Returns:
      True when the training corpus and vectorizer were refreshed during this
      call, otherwise False.
    """
    if x is not None:
      new_row = pd.DataFrame(data={'Text':[x], 'Abgelehnt':[y]})
      self.data_buffer = pd.concat([self.data_buffer, new_row], ignore_index=True)
    self.batch_len += 1

    self.eddm.add_element(error)
    self.hddm.add_element(error)

    # the flag is sticky until the next refit
    if not self.change_detected and (self.eddm.detected_change() or self.hddm.detected_change()):
      self.change_detected = True

    if self.batch_len >= self.batch_size and self.change_detected:
      print('Refitting a batch of size ', self.batch_len)
      self.set_training_corpus(self.data_buffer)
      # restart the buffer from the (possibly re-selected) training data
      self.data_buffer = self.training_data
      # only the refit of the vectorizer matters here; no model is trained
      self.vectorizer.fit(self.training_data.Text)
      self.batch_len = 0
      self.change_detected = False

      return True

    return False


  def set_training_corpus(self, corpus):
    """Initialise or update the general corpus and the inner PWI threshold.

    On the first call the given corpus becomes the saved base corpus. On later
    calls the saved base corpus is shuffled and split into `test_corpora`
    chunks; a chunk whose PWI is below the current threshold replaces the
    training data and lowers the threshold.

    Args:
      corpus: DataFrame with `Text` and `Abgelehnt` columns.
    """
    # initial setting of base corpus and PWI
    if self.corpus_save_DataFrame.empty:
      self.corpus_save_DataFrame = corpus
      self.test_corpora = int(len(self.corpus_save_DataFrame)/self.batch_size)
      self.general_corpus = self.splitPivotText(corpus)
      self.inner_PWI = self.calcPWI(corpus)
      return

    # updating base corpus with new training data
    if self.test_corpora <= 1:
      self.test_corpora = 2

    # adding new Corpus to Base Corpus to create Baseline
    # if the new data set is closer to the dictionary than the existing one no new data will be added to the training set to avoid overfitting
    compare_PWI = self.calcPWI(pd.concat([self.corpus_save_DataFrame, corpus]))
    if compare_PWI > self.inner_PWI:
      self.inner_PWI = compare_PWI
    chunks = np.array_split(self.corpus_save_DataFrame.sample(frac=1), self.test_corpora)

    for chunk in chunks:
      chunk_with_new = pd.concat([chunk, corpus])
      test_corpus = self.splitPivotText(chunk_with_new)

      # NOTE(review): the PWI is computed on the chunk alone while the stored
      # general corpus also contains `corpus` - confirm this is intended.
      test_PWI = self.calcPWI(chunk)

      if test_PWI < self.inner_PWI:
        print('New General Corpus set. Inner PWI at: ', test_PWI)
        self.general_corpus = test_corpus
        self.training_data = chunk.reset_index(drop=True)
        self.inner_PWI = test_PWI

  def calcPWI(self, corpus):
    """Return the mean per-word `WI_Time` between two halves of `corpus`.

    The corpus is split 50/50 with a fixed random state, both halves are
    turned into word-count tables, and the per-word values from
    `calcPWIFrame` are averaged.
    """
    general_Corpus, specialized_Corpus = train_test_split(corpus, test_size=0.5, random_state=4711)
    pwi_frame = self.calcPWIFrame(self.splitPivotText(general_Corpus), self.splitPivotText(specialized_Corpus))
    pwi = pwi_frame.WI_Time.sum() / pwi_frame.WI_Time.count()

    return pwi

  def splitPivotText(self, df):
    """Build a per-word count table from a labelled text frame.

    Args:
      df: DataFrame with `Text` and `Abgelehnt` columns.

    Returns:
      DataFrame with one row per distinct whitespace-separated token
      (`value`) and the number of its occurrences in documents labelled 1
      (`Removed`) and labelled 0 (`not_Removed`).
    """
    functionDF = df.copy()
    functionDF['Text'] = functionDF['Text'].astype(str)
    functionDF['Text'] = functionDF['Text'].apply(lambda x: x.split())
    splitFrame = pd.DataFrame(functionDF['Text'].to_list())
    splitFrame['Label'] = functionDF.reset_index().Abgelehnt

    # create one column with every word and their occurence with labels
    vocab = splitFrame.melt(id_vars='Label').drop(columns=['variable'])
    # turns the label column into two separate indicator columns and aggregates them
    vocab['Removed'] = np.where(vocab['Label'] == 1, 1, 0)
    # `== np.nan` is always False, which counted every token as not removed;
    # count only tokens from documents labelled 0
    vocab['not_Removed'] = np.where(vocab['Label'] == 0, 1, 0)
    vocab = vocab.drop(columns=['Label'])

    vocab['Removed'] = vocab.groupby('value')['Removed'].transform('sum')
    vocab['not_Removed'] = vocab.groupby('value')['not_Removed'].transform('sum')
    returnDF = vocab.drop_duplicates(subset=['value'])

    # shorter documents are padded with missing values by the DataFrame constructor
    returnDF = returnDF[returnDF['value'].notnull()]

    return returnDF



  def calcPWIFrame(self, generalCorpus, specializedCorpus):
    """Compute the per-word `WI_Time` of a specialized vs. a general corpus.

    Args:
      generalCorpus: word-count table as returned by `splitPivotText`.
      specializedCorpus: word-count table as returned by `splitPivotText`.

    Returns:
      The specialized table left-joined with the general one (missing counts
      filled with 0) plus a `WI_Time` column.
    """
    mergedPivot = specializedCorpus.set_index('value').join(generalCorpus.set_index('value'), how='left', lsuffix='_specialized', rsuffix='_corpus').fillna(0).reset_index()
    # total token counts of each corpus (both label classes)
    ts_time = specializedCorpus.Removed.sum() + specializedCorpus.not_Removed.sum()
    tg_time = generalCorpus.Removed.sum() + generalCorpus.not_Removed.sum()

    mergedPivot['WI_Time'] = mergedPivot.apply(lambda x: self.calculate_polarized_pwi(x['Removed_specialized'] + x['not_Removed_specialized'], x['Removed_corpus'] + x['not_Removed_corpus'], ts_time, tg_time),axis=1)

    return mergedPivot



  def calculate_polarized_pwi(self, ws, wg, ts, tg):
    """Return the ratio of relative word frequencies for one word.

    Args:
      ws: occurrences of the word in the specialized corpus.
      wg: occurrences of the word in the general corpus.
      ts: total token count of the specialized corpus.
      tg: total token count of the general corpus.

    Returns:
      `(ws/ts)/(wg/tg)` when the word occurs in both corpora, `ws/ts` when it
      occurs only in the specialized one, otherwise 0 (NaN is mapped to 0).
    """
    if ws > 0 and wg > 0:
      pwi = (ws/ts)/(wg/tg)

    elif ws > 0:
      pwi = ws/ts

    elif wg > 0:
      # ws is not positive here, so this evaluates to 0 for ws == 0
      pwi = ws / (wg/tg)

    else:
      pwi = 0

    if np.isnan(pwi):
      pwi = 0

    return pwi