In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from itertools import compress
import random
from random import sample
import pickle
import autosklearn.classification
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.metrics
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_recall_fscore_support
import glob
from time import time
from sklearn.model_selection import train_test_split
import openpyxl
from skmultiflow.meta import AdaptiveRandomForestClassifier
from skmultiflow.drift_detection import DDM, ADWIN, EDDM, HDDM_A, PageHinkley

rng = np.random.default_rng(4711)

# Data Prep for Tests

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
commentsDF = pd.DataFrame(columns={'ID'})
for file in glob.glob("/content/drive/MyDrive/MA/Daten/*"):
     tempDF = pd.read_excel(file)
     commentsDF = pd.concat([commentsDF, tempDF], join='outer')

commentsDF = commentsDF.drop(columns=['ID', 'Active', 'CID', 'Sperrhinweis', 'Loginname'],axis=1)

commentsDF['Date'] =  pd.to_datetime(commentsDF['Date'], format='%Y-%m-%d %H:%M:%S')
commentsDF['Time'] = pd.to_datetime(commentsDF['Date']).dt.time
commentsDF['Date'] = pd.to_datetime(commentsDF['Date']).dt.date
commentsDF['Date'] = pd.to_datetime(commentsDF['Date'], format='%Y-%m-%d')


commentsDF['Text'] = commentsDF['Text'].astype(str)
commentsDF['Title'] = commentsDF['Title'].astype(str)

commentsDF = commentsDF.drop_duplicates(subset=['Text']) 
commentsDF = commentsDF.sort_values(by=['Date']).reset_index(drop=True)

In [None]:
# creates Sampled down version of dataset

dateRange = pd.period_range(start='2018-11-01', end='2020-06-01', freq='M')
timeSampledDF = pd.DataFrame(columns={'ID'})


for i in range(len(dateRange)):
  month = dateRange[i].month
  year = dateRange[i].year
  monthData = commentsDF[(commentsDF.Date.dt.month == dateRange[i].month) & (commentsDF.Date.dt.year == dateRange[i].year)]
  countMinor = len(monthData[monthData.Abgelehnt == 1])
  if countMinor < 600:
    countMinor = 600
  minorityDF = monthData[monthData.Abgelehnt == 1]
  majorityDF = monthData[monthData.Abgelehnt == 0].sample(axis='index', 
                                                          n=countMinor, 
                                                          random_state=4711,)
  timeSampledDF = pd.concat([timeSampledDF, majorityDF], join='outer')
  timeSampledDF = pd.concat([timeSampledDF, minorityDF], join='outer')

y_vals = commentsDF['Abgelehnt'].values
y_vals = y_vals.astype(int)


commentsDF['index1'] = commentsDF.index
timeSampledDF['index1'] = timeSampledDF.index
timeSampledDF.sort_values(by='Date',inplace=True)

commentsDF['Mask'] = commentsDF.index1.apply(lambda x: 1 if x in timeSampledDF.index1 else 0)
mask = np.array(commentsDF['Mask']).astype('bool')
y_vals_sample = timeSampledDF['Abgelehnt'].values
y_vals_sample = y_vals_sample.astype(int)


# EWMA

In [None]:
class EWMA():
    """The Exponentially Weighted Moving Average (EWMA) drift detection method class."""
    
    def __init__(self, min_instance=30, lambda_=0.2, c=1, w=0.5):
        '''
        The Exponentially Weighted Moving Average (EWMA) drift detection method class.
        :param: min_instance: quantity of instance to start detect a concept drift
        '''

        self.MINIMUM_NUM_INSTANCES = min_instance

        self.t = 1.0
        self.sum = 0.0
        self.sigma_xt = 0.0
        self.sigma_zt = 0.0
        self.z_t = 0.0
        self.lambda_ = lambda_
        self.L_t = c
        self.w = w
        
    def fit(self, classifier, vectorizer, W):
        '''
        method to fit to the current concept 
        '''
        
        for obs in W:
            x, y = vectorizer.transform(obs[:-1]).toarray(), obs[-1]
            
            try:
                yi = classifier.predict(x)
            except:
                yi = classifier.predict([x])
            
            pred = True
            if(yi != y):
                pred = False
                
            self.run(pred)
    
    def run(self, prediction):
      '''
      method to update the parameters of ewma
      :param: prediction: true if the prediction is correct, otherwise no
      '''
      prediction = 1 if prediction is False else 0

      warning_status = False
      drift_status = False

      # 1. UPDATING STATS
      self.sum += prediction
      self.sigma_xt = self.sum / self.t
      self.sigma_zt = math.sqrt(self.sigma_xt * (1.0 - self.sigma_xt) * self.lambda_ * (1.0 - math.pow(1.0 - self.lambda_, 2.0 * self.t)) / (2.0 - self.lambda_))
      self.t += 1

      self.z_t += self.lambda_ * (prediction - self.z_t)
      #L_t = 3.97 - 6.56 * self.sigma_xt + 48.73 * math.pow(self.sigma_xt, 3) - 330.13 * math.pow(self.sigma_xt, 5) + 848.18 * math.pow(self.sigma_xt, 7)

      # 2. UPDATING WARNING AND DRIFT STATUSES
      if self.t < self.MINIMUM_NUM_INSTANCES:
        return False, False

      if self.z_t > self.sigma_xt + self.L_t * self.sigma_zt:
        drift_status = True
      elif self.z_t > self.sigma_xt + self.w * self.L_t * self.sigma_zt:
        warning_status = True

      return warning_status, drift_status


    def detect(self, y_true, y_pred):
        '''
        method to monitor the index
        '''
        
        # checkint out the prediction of classifier
        pred = True
        if(y_true != y_pred):
            pred = False
        
        warning_level, change_level = self.run(pred)
        
        return warning_level, change_level

    def reset(self):
        '''
        method to reset the detector
        '''
        
        self.t = 1
        self.sum = 0
        self.sigma_xt = 0
        self.sigma_zt = 0
        self.z_t = 0

# Artifact


1.   Artifact with Classifier
2.   Pure Detector



In [None]:
class Artifact():

  def __init__(self, training_data, model, batch_size = 500):
    self.eddm = EDDM()
    self.hddm = HDDM_A()
    self.training_data = training_data
    self.model = model
    self.training_size = len(training_data)
    self.corpus_save_DataFrame = pd.DataFrame()
    self.inner_PWI = None
    self.general_corpus = None
    self.data_buffer = training_data
    self.batch_size = batch_size
    self.batch_len = 0 
    self.test_corpora = 2
    self.change_detected = False

    # vectorizer definition
    german_stop_words_import = open ("/content/drive/MyDrive/MA/stopwords-de.csv", 
                                 "r", encoding="utf-8")
    german_stop_words = pd.read_csv(german_stop_words_import, 
                                names=['stopwords'])['stopwords'].to_list()
    self.vectorizer = TfidfVectorizer(stop_words=german_stop_words, 
                             ngram_range=(1,2), max_features=3000)
    self.vectorizer = self.vectorizer.fit(training_data.Text)
    
    self.set_training_corpus(training_data)


  def predict(self, x):
    if isinstance(x, str):
      x = vectorizer.transform([x]).toarray()

    return self.model.predict(x)[0]

  def add_data(self, x, y):
    input = pd.DataFrame(data={'Text':[x], 'Abgelehnt':[y]})
    self.data_buffer = pd.concat([self.data_buffer, input], ignore_index=True)
    self.batch_len += 1
    y_pred = self.predict(x)
    error = abs(y_pred - y)
    adwinError = abs(abs(y_pred-y)-1)

    self.eddm.add_element(error)
    self.hddm.add_element(error)

    if (self.change_detected == False) & ((self.eddm.detected_change() or self.hddm.detected_change())):
      self.change_detected = True


    if (self.batch_len >= self.batch_size) & (self.change_detected == True):
      print('Refitting a batch of size ', self.batch_len)
      #test attempt without WI
      # training_size = len(self.training_data)
      # self.training_data = pd.concat([self.training_data, self.data_buffer])[:-training_size]
      # print(self.training_data)
      self.set_training_corpus(self.data_buffer)
      self.data_buffer = self.training_data
      x_train = self.vectorizer.fit_transform(self.training_data.Text).toarray()
      self.model.reset()
      self.model.partial_fit(x_train, self.training_data.Abgelehnt, [0,1])
      self.batch_len = 0
      self.change_detected = False

    
  #  sets the general corpus for the WI and calculates Inner WI for thresholding
  def set_training_corpus(self, corpus):
    
    # intital setting of base corpus and PWI
    if self.corpus_save_DataFrame.empty:
      self.corpus_save_DataFrame = corpus
      self.test_corpora = int(len(self.corpus_save_DataFrame)/self.batch_size)
      self.general_corpus = self.splitPivotText(corpus)
      self.inner_PWI = self.calcPWI(corpus)


    # updating base corpus with new training data
    else:
      if self.test_corpora <= 1:
        self.test_corpora = 2 

      # adding new Corpus to Base Corpus to create Baseline
      # if the new data set is closer to the dictionary than the existing one no new data will be added to the training set to avoid overfitting
      compare_PWI = self.calcPWI(pd.concat([self.corpus_save_DataFrame, corpus]))
      if compare_PWI > self.inner_PWI:
        self.inner_PWI = compare_PWI
      samples = np.array_split(self.corpus_save_DataFrame.sample(frac=1), self.test_corpora)

      for sample in samples:
        sample_corpus = pd.concat([sample, corpus])
        test_corpus = self.splitPivotText(sample_corpus)

        test_PWI = self.calcPWI(sample)

        if test_PWI < self.inner_PWI:
          print('New General Corpus set. Inner PWI at: ', test_PWI)
          self.general_corpus = test_corpus
          self.training_data = sample.reset_index(drop=True)
          self.inner_PWI = test_PWI

  def calcPWI(self, corpus):
    general_Corpus, specialized_Corpus = train_test_split(corpus, test_size=0.5, random_state=4711) 
    pwi_frame = self.calcPWIFrame(self.splitPivotText(general_Corpus), self.splitPivotText(specialized_Corpus))
    pwi = pwi_frame.WI_Time.sum() / pwi_frame.WI_Time.count()

    return pwi

  def splitPivotText(self, df):
    functionDF =  df.copy()
    functionDF['Text'] = functionDF['Text'].astype(str)
    functionDF['Text'] = functionDF['Text'].apply(lambda x: x.split())
    splitFrame = pd.DataFrame(functionDF['Text'].to_list())
    splitFrame['Label'] = functionDF.reset_index().Abgelehnt

    # create one column with every word and their occurence with labels
    vocab = splitFrame.melt(id_vars='Label').drop(columns=['variable'])
    # turns the label column into two separate columns and aggregates them
    vocab['Removed'] = np.where(vocab['Label'] == 1, 1,0)
    vocab['not_Removed'] = np.where(vocab['Label'] == np.nan, 0,1)
    vocab = vocab.drop(columns=['Label'])

    vocab['Removed'] = vocab.groupby('value')['Removed'].transform('sum')
    vocab['not_Removed'] = vocab.groupby('value')['not_Removed'].transform('sum')
    returnDF = vocab.drop_duplicates(subset=['value'])

    returnDF = returnDF[returnDF['value'].notnull()]

    return returnDF



  def calcPWIFrame(self, generalCorpus, specializedCorpus):

    mergedPivot = specializedCorpus.set_index('value').join(generalCorpus.set_index('value'), how='left', lsuffix='_specialized', rsuffix='_corpus').fillna(0).reset_index()
    ts_time = specializedCorpus.Removed.sum() + specializedCorpus.not_Removed.sum()
    tg_time = generalCorpus.Removed.sum() + generalCorpus.Removed.sum()  


    mergedPivot['WI_Time'] = mergedPivot.apply(lambda x: self.calculate_polarized_pwi(x['Removed_specialized'] + x['not_Removed_specialized'], x['Removed_corpus'] + x['not_Removed_corpus'], ts_time, tg_time),axis=1)
    
    return mergedPivot

 
 
  def calculate_polarized_pwi(self, ws, wg, ts, tg):

    if((ws > 0) & (wg > 0)):  
      pwi = (ws/ts)/(wg/tg)

    elif(ws > 0):
      pwi = ws/ts

    elif(wg > 0):
      pwi = ws / (wg/tg)

    else:
      pwi = 0

    if np.isnan(pwi):
      pwi = 0

    return pwi

# EWMA Artifact

In [None]:
class ArtifactEWMA():

  def __init__(self, training_data, model, batch_size = 500):
    self.eddm = EDDM()
    self.ewma = EWMA()
    self.training_data = training_data
    self.model = model
    self.training_size = len(training_data)
    self.corpus_save_DataFrame = pd.DataFrame()
    self.inner_PWI = None
    self.general_corpus = None
    self.data_buffer = training_data
    self.batch_size = batch_size
    self.batch_len = 0 


    # vectorizer definition
    german_stop_words_import = open ("/content/drive/MyDrive/MA/stopwords-de.csv", 
                                 "r", encoding="utf-8")
    german_stop_words = pd.read_csv(german_stop_words_import, 
                                names=['stopwords'])['stopwords'].to_list()
    self.vectorizer = TfidfVectorizer(stop_words=german_stop_words, 
                             ngram_range=(1,2), max_features=3000)
    self.vectorizer = self.vectorizer.fit(training_data.Text)
    
    self.set_training_corpus(training_data)
    self.ewma.fit(self.model, self.vectorizer, training_data.to_numpy())



  def predict(self, x):
    if isinstance(x, str):
      x = vectorizer.transform([x]).toarray()

    return self.model.predict(x)[0]

  def add_data(self, x, y):
    input = pd.DataFrame(data={'Text':[x], 'Abgelehnt':[y]})
    self.data_buffer = pd.concat([self.data_buffer, input], ignore_index=True)
    self.batch_len += 1
    y_pred = self.predict(x)
    error = abs(y_pred - y)
    adwinError = abs(abs(y_pred-y)-1)

    self.eddm.add_element(error)
    warning_level, change_level = self.ewma.run(error)

    # self.ph.add_element(error)

    if (self.batch_len >= self.batch_size) & ((self.eddm.detected_change() or (change_level == True))):
      print('Refitting a batch of size ', self.batch_len)
      # print(self.data_buffer)
      training_size = len(self.training_data)
      self.training_data = pd.concat([self.training_data, self.data_buffer])[:-training_size]
      # self.set_training_corpus(self.data_buffer)
      # self.data_buffer = self.training_data
      x_train = self.vectorizer.fit_transform(self.training_data.Text).toarray()
      # self.model.reset()
      self.model.fit(x_train, self.training_data.Abgelehnt, [0,1])
      self.batch_len = 0

    
  #  sets the general corpus for the WI and calculates Inner WI for thresholding
  def set_training_corpus(self, corpus):
    
    # intital setting of base corpus and PWI
    if self.corpus_save_DataFrame.empty:
      self.corpus_save_DataFrame = corpus
      general_Corpus_sample, specialized_Corpus_sample = train_test_split(corpus, test_size=0.5, random_state=4711)

      self.general_corpus = self.splitPivotText(corpus)
      general_PWI = self.calcPWIFrame(self.splitPivotText(general_Corpus_sample), self.splitPivotText(specialized_Corpus_sample))
      self.inner_PWI = general_PWI.WI_Time.sum() / general_PWI.WI_Time.count()


    # updating base corpus with new training data
    else:
      test_corpora = int(len(self.corpus_save_DataFrame)/self.batch_size)
      if self.test_corpora <= 1:
        self.test_corpora = 2 
      # print('Save Frame and Corpus', self.corpus_save_DataFrame, corpus)
      samples = np.array_split(self.corpus_save_DataFrame.sample(frac=1), self.test_corpora)

      for sample in samples:
        sample_corpus = pd.concat([self.sample, corpus])
        test_corpus = self.splitPivotText(sample_corpus)
        general_Corpus_sample, specialized_Corpus_sample = train_test_split(sample, test_size=0.5, random_state=4711)
        general_PWI = self.calcPWIFrame(self.splitPivotText(general_Corpus_sample), self.splitPivotText(specialized_Corpus_sample))

        test_PWI =  general_PWI.WI_Time.sum() / general_PWI.WI_Time.count()

        if test_PWI < self.inner_PWI:
          print('New General Corpus set. Inner PWI at: ', test_PWI)
          print(sample)
          self.general_corpus = test_corpus
          self.training_data = sample.reset_index(drop=True)
          self.inner_PWI = test_PWI



  def splitPivotText(self, df):
    functionDF =  df.copy()
    functionDF['Text'] = functionDF['Text'].astype(str)
    functionDF['Text'] = functionDF['Text'].apply(lambda x: x.split())
    splitFrame = pd.DataFrame(functionDF['Text'].to_list())
    splitFrame['Label'] = functionDF.reset_index().Abgelehnt

    # create one column with every word and their occurence with labels
    vocab = splitFrame.melt(id_vars='Label').drop(columns=['variable'])
    # turns the label column into two separate columns and aggregates them
    vocab['Removed'] = np.where(vocab['Label'] == 1, 1,0)
    vocab['not_Removed'] = np.where(vocab['Label'] == np.nan, 0,1)
    vocab = vocab.drop(columns=['Label'])

    vocab['Removed'] = vocab.groupby('value')['Removed'].transform('sum')
    vocab['not_Removed'] = vocab.groupby('value')['not_Removed'].transform('sum')
    returnDF = vocab.drop_duplicates(subset=['value'])

    returnDF = returnDF[returnDF['value'].notnull()]

    return returnDF



  def calcPWIFrame(self, generalCorpus, specializedCorpus):

    mergedPivot = specializedCorpus.set_index('value').join(generalCorpus.set_index('value'), how='left', lsuffix='_specialized', rsuffix='_corpus').fillna(0).reset_index()
    ts_time = specializedCorpus.Removed.sum() + specializedCorpus.not_Removed.sum()
    tg_time = generalCorpus.Removed.sum() + generalCorpus.Removed.sum()  


    mergedPivot['WI_Time'] = mergedPivot.apply(lambda x: self.calculate_polarized_pwi(x['Removed_specialized'] + x['not_Removed_specialized'], x['Removed_corpus'] + x['not_Removed_corpus'], ts_time, tg_time),axis=1)
    
    return mergedPivot

 
 
  def calculate_polarized_pwi(self, ws, wg, ts, tg):

    if((ws > 0) & (wg > 0)):  
      pwi = (ws/ts)/(wg/tg)

    elif(ws > 0):
      pwi = ws/ts

    elif(wg > 0):
      pwi = ws / (wg/tg)

    else:
      pwi = 0

    if np.isnan(pwi):
      pwi = 0

    return pwi

In [None]:
class detectorEnsemble():

  def __init__(self):
    self.eddm = EDDM()
    self.adwin = ADWIN()
    

# Test


In [None]:
tfidf_pickle = open ("/LanguageDriftDetection/Data/Preprocessing/preprocessed_balanced.pk", "rb")
preprocessed = pickle.load(tfidf_pickle)

train_batch = np.array(timeSampledDF.Date < "2019-06-01")
test = timeSampledDF['Date'] >= "2019-06-01"


german_stop_words_import = open ("/LanguageDriftDetection/Data/Stopwords/stopwords-de.csv", 
                                 "r", encoding="utf-8")
german_stop_words = pd.read_csv(german_stop_words_import, 
                                names=['stopwords'])['stopwords'].to_list()
vectorizer = TfidfVectorizer(stop_words=german_stop_words, 
                             ngram_range=(1,2), max_features=3000)

x_timed = list(compress(preprocessed, train_batch))

y_train_timed = y_vals_sample[train_batch]
x_train_timed = vectorizer.fit_transform(x_timed).toarray()

trainMask_3m = (timeSampledDF.Date < "2019-06-01") & (timeSampledDF.Date >= "2019-05-01")
x_3m = list(compress(preprocessed, trainMask_3m))

x_train_3m = vectorizer.fit_transform(x_3m).toarray()
y_train_3m = y_vals_sample[trainMask_3m]

timeSampledDF.Text = preprocessed
training_data = timeSampledDF[trainMask_3m].reset_index(drop=True).loc[:,['Text', 'Abgelehnt']]

In [None]:
model = AdaptiveRandomForestClassifier(drift_detection_method = None, warning_detection_method= None)
model.fit(x_train_3m, y_train_3m)

AdaptiveRandomForestClassifier(binary_split=False, disable_weighted_vote=False,
                               drift_detection_method=None, grace_period=50,
                               lambda_value=6, leaf_prediction='nba',
                               max_byte_size=33554432, max_features=55,
                               memory_estimate_period=2000000, n_estimators=10,
                               nb_threshold=0, no_preprune=False,
                               nominal_attributes=None,
                               performance_metric='acc', random_state=None,
                               remove_poor_atts=False, split_confidence=0.01,
                               split_criterion='info_gain',
                               stop_mem_management=False, tie_threshold=0.05,

In [None]:
# model tests per month
def testMonthly(df, y_vals) : 

  testMonths = pd.to_datetime(['06-19', '07-19', '08-19', '09-19', '10-19','11-19','12-19','01-20', '02-20','03-20','04-20','05-20','06-20'],
                            format='%m-%y')

  # df.sort_values(by='Date')
  output = pd.DataFrame()
  for i in range(len(testMonths)):
    monthMask = (df.Date.dt.month == testMonths[i].month) & (df.Date.dt.year == testMonths[i].year)
    y_test = y_vals[monthMask]
    y_pred = df.y_pred[monthMask]
    scores = precision_recall_fscore_support(y_test, y_pred, average="macro")
    roc_score = roc_auc_score(y_test,y_pred)
    accuracy = accuracy_score(y_test, y_pred) 

    resultDF = pd.DataFrame({'ROC-AUC': [roc_score], 'Precision': [scores[0]],
                           'Recall': [scores[1]], 'F1-Score': [scores[2]],
                           'Accuracy': accuracy})
    output = output.append(resultDF)
  # print('Results for {}: \n'.format(testMonths[i].strftime("%b %y")), results)
  output.index= testMonths
  return output

In [None]:
results = timeSampledDF[test]
results['y_pred'] = fullPred
testMonthly(results, results.Abgelehnt)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,ROC-AUC,Precision,Recall,F1-Score,Accuracy
2019-06-01,0.488625,0.475401,0.488625,0.409229,0.488625
2019-07-01,0.48623,0.483535,0.48623,0.464311,0.48623
2019-08-01,0.475155,0.442413,0.475155,0.388191,0.475155
2019-09-01,0.490345,0.475412,0.490345,0.399111,0.490345
2019-10-01,0.486486,0.415829,0.486486,0.350096,0.486486
2019-11-01,0.51348,0.51546,0.51348,0.49739,0.51348
2019-12-01,0.518092,0.520143,0.518092,0.505506,0.518092
2020-01-01,0.509053,0.511144,0.509053,0.484892,0.509053
2020-02-01,0.540816,0.559442,0.540816,0.501789,0.540816
2020-03-01,0.547112,0.607967,0.547112,0.472829,0.547112


Test 100


In [None]:
artifact = Artifact(training_data, model, 100)
test_data = timeSampledDF[test].reset_index(drop=True)
fullPred = []

for i in range(len(test_data)):
  fullPred.append(artifact.predict(test_data.loc[i, 'Text']))
  artifact.add_data(test_data.loc[i, 'Text'], test_data.loc[i, 'Abgelehnt'])

results = timeSampledDF[test]
results['y_pred'] = fullPred
testMonthly(results, results.Abgelehnt)

Refitting a batch of size  100
New General Corpus set. Inner PWI at:  0.16446290070544023
New General Corpus set. Inner PWI at:  0.07796179820764992
Refitting a batch of size  261
New General Corpus set. Inner PWI at:  0.11613262907546207
Refitting a batch of size  100
New General Corpus set. Inner PWI at:  0.22680285529630628
New General Corpus set. Inner PWI at:  0.17268482687925504
New General Corpus set. Inner PWI at:  0.16261169536146164
New General Corpus set. Inner PWI at:  0.13949853722235087
Refitting a batch of size  100
New General Corpus set. Inner PWI at:  0.25167102633815236
New General Corpus set. Inner PWI at:  0.1730536101973259
New General Corpus set. Inner PWI at:  0.16967672122569438
New General Corpus set. Inner PWI at:  0.16646316941934033
New General Corpus set. Inner PWI at:  0.14281932432856845
New General Corpus set. Inner PWI at:  0.11013645838938681
Refitting a batch of size  100
New General Corpus set. Inner PWI at:  0.370591692124549
New General Corpus set

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,ROC-AUC,Precision,Recall,F1-Score,Accuracy
2019-06-01,0.483454,0.452413,0.483454,0.382804,0.483454
2019-07-01,0.478158,0.456958,0.478158,0.404879,0.478158
2019-08-01,0.495652,0.474215,0.495652,0.363321,0.495652
2019-09-01,0.514483,0.514488,0.514483,0.514437,0.514483
2019-10-01,0.449324,0.446605,0.449324,0.442222,0.449324
2019-11-01,0.530025,0.549646,0.530025,0.478495,0.530025
2019-12-01,0.508224,0.528824,0.508224,0.401242,0.508224
2020-01-01,0.495822,0.48258,0.495822,0.377529,0.495822
2020-02-01,0.48105,0.455911,0.48105,0.394778,0.48105
2020-03-01,0.515957,0.535642,0.515957,0.438419,0.515957


Test 500

In [None]:
artifact = Artifact(training_data, model, 500)
test_data = timeSampledDF[test].reset_index(drop=True)
fullPred = []

for i in range(len(test_data)):
  fullPred.append(artifact.predict(test_data.loc[i, 'Text']))
  artifact.add_data(test_data.loc[i, 'Text'], test_data.loc[i, 'Abgelehnt'])

results = timeSampledDF[test]
results['y_pred'] = fullPred
testMonthly(results, results.Abgelehnt)

Refitting a batch of size  500
New General Corpus set. Inner PWI at:  0.2625330167909137
New General Corpus set. Inner PWI at:  0.23384615828770883
Refitting a batch of size  500
New General Corpus set. Inner PWI at:  0.2941441802199233
New General Corpus set. Inner PWI at:  0.259925050577095
New General Corpus set. Inner PWI at:  0.23203034329824304
Refitting a batch of size  500
New General Corpus set. Inner PWI at:  0.2894565972310512
New General Corpus set. Inner PWI at:  0.28401492661873623
New General Corpus set. Inner PWI at:  0.27756356937885074
New General Corpus set. Inner PWI at:  0.24248301579374207
New General Corpus set. Inner PWI at:  0.23142162637261451
Refitting a batch of size  500
New General Corpus set. Inner PWI at:  0.3311829422049995
New General Corpus set. Inner PWI at:  0.2635698865940111
New General Corpus set. Inner PWI at:  0.2632800448911131
New General Corpus set. Inner PWI at:  0.2611828730212823
New General Corpus set. Inner PWI at:  0.24530432511741745


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,ROC-AUC,Precision,Recall,F1-Score,Accuracy
2019-06-01,0.457084,0.456413,0.457084,0.454986,0.457084
2019-07-01,0.468661,0.419466,0.468661,0.372892,0.468661
2019-08-01,0.497516,0.497456,0.497516,0.494566,0.497516
2019-09-01,0.484138,0.481166,0.484138,0.462951,0.484138
2019-10-01,0.464189,0.462918,0.464189,0.459558,0.464189
2019-11-01,0.514706,0.516454,0.514706,0.501462,0.514706
2019-12-01,0.510691,0.51074,0.510691,0.510134,0.510691
2020-01-01,0.483983,0.483466,0.483983,0.479915,0.483983
2020-02-01,0.478134,0.476703,0.478134,0.469997,0.478134
2020-03-01,0.449088,0.447758,0.449088,0.445559,0.449088


Test 1000

In [None]:
artifact = Artifact(training_data, model, 1000)
test_data = timeSampledDF[test].reset_index(drop=True)
fullPred = []

for i in range(len(test_data)):
  fullPred.append(artifact.predict(test_data.loc[i, 'Text']))
  artifact.add_data(test_data.loc[i, 'Text'], test_data.loc[i, 'Abgelehnt'])

results = timeSampledDF[test]
results['y_pred'] = fullPred
testMonthly(results, results.Abgelehnt)

Refitting a batch of size  1000
New General Corpus set. Inner PWI at:  0.3379944452982139
New General Corpus set. Inner PWI at:  0.30337354094086505
Refitting a batch of size  1000
New General Corpus set. Inner PWI at:  0.3588321961383542
New General Corpus set. Inner PWI at:  0.31796845670202906
New General Corpus set. Inner PWI at:  0.28366835644080957
Refitting a batch of size  1000
New General Corpus set. Inner PWI at:  0.3650209503184012
New General Corpus set. Inner PWI at:  0.36355649587766364
New General Corpus set. Inner PWI at:  0.30328335288270286
New General Corpus set. Inner PWI at:  0.2863871869603261
Refitting a batch of size  1000
New General Corpus set. Inner PWI at:  0.2998526940867459
Refitting a batch of size  1000
New General Corpus set. Inner PWI at:  0.3045299741803459
New General Corpus set. Inner PWI at:  0.3013323434885485
Refitting a batch of size  1000
New General Corpus set. Inner PWI at:  0.287017415938112
Refitting a batch of size  1000
New General Corpus

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,ROC-AUC,Precision,Recall,F1-Score,Accuracy
2019-06-01,0.502068,0.514051,0.502068,0.367142,0.502068
2019-07-01,0.502374,0.514357,0.502374,0.371163,0.502374
2019-08-01,0.499379,0.499228,0.499379,0.473644,0.499379
2019-09-01,0.505517,0.508168,0.505517,0.46186,0.505517
2019-10-01,0.495946,0.477508,0.495946,0.366018,0.495946
2019-11-01,0.476716,0.476702,0.476716,0.476637,0.476716
2019-12-01,0.505757,0.532212,0.505757,0.378058,0.505757
2020-01-01,0.485376,0.47956,0.485376,0.445964,0.485376
2020-02-01,0.507289,0.511794,0.507289,0.455264,0.507289
2020-03-01,0.531915,0.538262,0.531915,0.511662,0.531915
