In [1]:
import pandas as pd
import numpy as np

In [3]:
df_ADNR_A01 = pd.read_csv('../data_ADNR/with0.1anomaly/allocatedJobs.csv')
df_ADNR_M01 = pd.read_csv('../data_ADNR/with0.1anomaly/destroyedJobs.csv')

In [4]:
def has_high_resource(row):
    # Check if any resource requirement is above 10
    return np.any(np.array(row['job ressource requirement']) > 10)

In [6]:
def convert_to_list(row):
    # Remove brackets and split on space
    items = row.replace('[', '').replace(']', '').split(' ')
    # Remove any empty strings caused by extra spaces
    items = [item for item in items if item != '']
    # Convert strings to integers
    items = [int(item) for item in items]
    return items

In [5]:
def has_high_length(row):
    return row['job Length'] >= 23

In [7]:
df_ADNR_A01['job ressource requirement'] = df_ADNR_A01['job ressource requirement'].apply(convert_to_list)
df_ADNR_M01['job ressource requirement'] = df_ADNR_M01['job ressource requirement'].apply(convert_to_list)

In [8]:
df_ADNR_A01['high_resource'] = df_ADNR_A01.apply(has_high_resource, axis=1)
df_ADNR_M01['high_resource'] = df_ADNR_M01.apply(has_high_resource, axis=1)

In [9]:
# Apply the function to both dataframes
df_ADNR_A01['high_length'] = df_ADNR_A01.apply(has_high_length, axis=1)
df_ADNR_M01['high_length'] = df_ADNR_M01.apply(has_high_length, axis=1)

In [10]:
# Create a new column that is True when either high_length or high_resource is True
df_ADNR_A01['high_length_or_resource'] = df_ADNR_A01['high_length'] | df_ADNR_A01['high_resource']
df_ADNR_M01['high_length_or_resource'] = df_ADNR_M01['high_length'] | df_ADNR_M01['high_resource']

In [11]:
allocated_high_length_or_resource = df_ADNR_A01['high_length_or_resource'].sum()
destroyed_high_length_or_resource = df_ADNR_M01['high_length_or_resource'].sum()

# Calculate the total number of jobs that satisfy either condition
total_high_length_or_resource = allocated_high_length_or_resource + destroyed_high_length_or_resource

# Calculate the proportion of these jobs that were destroyed
proportion_destroyed_length_or_resource = float(destroyed_high_length_or_resource) / total_high_length_or_resource

print("Proportion of jobs with either high length or high resource that were destroyed:", proportion_destroyed_length_or_resource)

('Proportion of jobs with either high length or high resource that were destroyed:', 0.7463471654003506)


In [14]:
TP = len(df_ADNR_M01[(df_ADNR_M01['high_resource'] == True) | (df_ADNR_M01['high_length'] == True)])

# False positives (non-anomalous jobs incorrectly identified as anomalies)
FP = len(df_ADNR_M01[(df_ADNR_M01['high_resource'] == False) & (df_ADNR_M01['high_length'] == False)])

# Precision calculation
precision = float(TP) / (TP + FP)
precision

0.31045380875202594

In [15]:
f1_score = 2 * (precision * proportion_destroyed_length_or_resource) / (precision+proportion_destroyed_length_or_resource)
f1_score

0.43850512218851945

In [17]:
df_ADNR_A03 = pd.read_csv('../data_ADNR/with0.3anomaly/allocatedJobs.csv')
df_ADNR_M03 = pd.read_csv('../data_ADNR/with0.3anomaly/destroyedJobs.csv')

In [18]:
df_ADNR_A03['job ressource requirement'] = df_ADNR_A03['job ressource requirement'].apply(convert_to_list)
df_ADNR_M03['job ressource requirement'] = df_ADNR_M03['job ressource requirement'].apply(convert_to_list)

df_ADNR_A03['high_resource'] = df_ADNR_A03.apply(has_high_resource, axis=1)
df_ADNR_M03['high_resource'] = df_ADNR_M03.apply(has_high_resource, axis=1)

df_ADNR_A03['high_length'] = df_ADNR_A03.apply(has_high_length, axis=1)
df_ADNR_M03['high_length'] = df_ADNR_M03.apply(has_high_length, axis=1)

df_ADNR_A03['high_length_or_resource'] = df_ADNR_A03['high_length'] | df_ADNR_A03['high_resource']
df_ADNR_M03['high_length_or_resource'] = df_ADNR_M03['high_length'] | df_ADNR_M03['high_resource']

allocated_high_length_or_resource = df_ADNR_A03['high_length_or_resource'].sum()
destroyed_high_length_or_resource = df_ADNR_M03['high_length_or_resource'].sum()

# Calculate the total number of jobs that satisfy either condition
total_high_length_or_resource = allocated_high_length_or_resource + destroyed_high_length_or_resource

# Calculate the proportion of these jobs that were destroyed
proportion_destroyed_length_or_resource = float(destroyed_high_length_or_resource) / total_high_length_or_resource

TN = len(df_ADNR_A03[(df_ADNR_A03['high_resource'] == False) & (df_ADNR_A03['high_length'] == False)])

# False negatives (anomalous jobs incorrectly identified as non-anomalies)
FN = len(df_ADNR_A03[(df_ADNR_A03['high_resource'] == True) | (df_ADNR_A03['high_length'] == True)])

# Recall calculation
recall = float(TP) / (TP + FN)
recall

TP = len(df_ADNR_M03[(df_ADNR_M03['high_resource'] == True) | (df_ADNR_M03['high_length'] == True)])

# False positives (non-anomalous jobs incorrectly identified as anomalies)
FP = len(df_ADNR_M03[(df_ADNR_M03['high_resource'] == False) & (df_ADNR_M03['high_length'] == False)])

# Precision calculation
precision = float(TP) / (TP + FP)
precision

f1_score = 2 * (precision * recall) / (precision+recall)
f1_score

print(proportion_destroyed_length_or_resource, precision, recall, f1_score)

(0.8489798512680353, 0.6895003097253768, 0.6172063798936684, 0.6513535034021003)


In [19]:
df_ADNR_A05 = pd.read_csv('../data_ADNR/with0.5anomaly/allocatedJobs.csv')
df_ADNR_M05 = pd.read_csv('../data_ADNR/with0.5anomaly/destroyedJobs.csv')

In [20]:
df_ADNR_A05['job ressource requirement'] = df_ADNR_A05['job ressource requirement'].apply(convert_to_list)
df_ADNR_M05['job ressource requirement'] = df_ADNR_M05['job ressource requirement'].apply(convert_to_list)

df_ADNR_A05['high_resource'] = df_ADNR_A05.apply(has_high_resource, axis=1)
df_ADNR_M05['high_resource'] = df_ADNR_M05.apply(has_high_resource, axis=1)

df_ADNR_A05['high_length'] = df_ADNR_A05.apply(has_high_length, axis=1)
df_ADNR_M05['high_length'] = df_ADNR_M05.apply(has_high_length, axis=1)

df_ADNR_A05['high_length_or_resource'] = df_ADNR_A05['high_length'] | df_ADNR_A05['high_resource']
df_ADNR_M05['high_length_or_resource'] = df_ADNR_M05['high_length'] | df_ADNR_M05['high_resource']

allocated_high_length_or_resource = df_ADNR_A05['high_length_or_resource'].sum()
destroyed_high_length_or_resource = df_ADNR_M05['high_length_or_resource'].sum()

# Calculate the total number of jobs that satisfy either condition
total_high_length_or_resource = allocated_high_length_or_resource + destroyed_high_length_or_resource

# Calculate the proportion of these jobs that were destroyed
proportion_destroyed_length_or_resource = float(destroyed_high_length_or_resource) / total_high_length_or_resource

TN = len(df_ADNR_A05[(df_ADNR_A05['high_resource'] == False) & (df_ADNR_A05['high_length'] == False)])

# False negatives (anomalous jobs incorrectly identified as non-anomalies)
FN = len(df_ADNR_A05[(df_ADNR_A05['high_resource'] == True) | (df_ADNR_A05['high_length'] == True)])

# Recall calculation
recall = float(TP) / (TP + FN)
recall

TP = len(df_ADNR_M05[(df_ADNR_M05['high_resource'] == True) | (df_ADNR_M05['high_length'] == True)])

# False positives (non-anomalous jobs incorrectly identified as anomalies)
FP = len(df_ADNR_M05[(df_ADNR_M05['high_resource'] == False) & (df_ADNR_M05['high_length'] == False)])

# Precision calculation
precision = float(TP) / (TP + FP)
precision

f1_score = 2 * (precision * recall) / (precision+recall)
f1_score

print(proportion_destroyed_length_or_resource, precision, recall, f1_score)

(0.9361011511605963, 0.9357656247052387, 0.7977662306635609, 0.8612731434886867)


In [21]:
df_AD_A01 = pd.read_csv('../data_AD/anomalyrate0.1/allocatedJobs.csv')
df_AD_M01 = pd.read_csv('../data_AD/anomalyrate0.1/destroyedJobs.csv')

In [22]:
df_AD_A01['job ressource requirement'] = df_AD_A01['job ressource requirement'].apply(convert_to_list)
df_AD_M01['job ressource requirement'] = df_AD_M01['job ressource requirement'].apply(convert_to_list)

df_AD_A01['high_resource'] = df_AD_A01.apply(has_high_resource, axis=1)
df_AD_M01['high_resource'] = df_AD_M01.apply(has_high_resource, axis=1)

df_AD_A01['high_length'] = df_AD_A01.apply(has_high_length, axis=1)
df_AD_M01['high_length'] = df_AD_M01.apply(has_high_length, axis=1)

df_AD_A01['high_length_or_resource'] = df_AD_A01['high_length'] | df_AD_A01['high_resource']
df_AD_M01['high_length_or_resource'] = df_AD_M01['high_length'] | df_AD_M01['high_resource']

allocated_high_length_or_resource = df_AD_A01['high_length_or_resource'].sum()
destroyed_high_length_or_resource = df_AD_M01['high_length_or_resource'].sum()

# Calculate the total number of jobs that satisfy either condition
total_high_length_or_resource = allocated_high_length_or_resource + destroyed_high_length_or_resource

# Calculate the proportion of these jobs that were destroyed
proportion_destroyed_length_or_resource = float(destroyed_high_length_or_resource) / total_high_length_or_resource

TN = len(df_AD_A01[(df_AD_A01['high_resource'] == False) & (df_AD_A01['high_length'] == False)])

# False negatives (anomalous jobs incorrectly identified as non-anomalies)
FN = len(df_AD_A01[(df_AD_A01['high_resource'] == True) | (df_AD_A01['high_length'] == True)])

# Recall calculation
recall = float(TP) / (TP + FN)
recall

TP = len(df_AD_M01[(df_AD_M01['high_resource'] == True) | (df_AD_M01['high_length'] == True)])

# False positives (non-anomalous jobs incorrectly identified as anomalies)
FP = len(df_AD_M01[(df_AD_M01['high_resource'] == False) & (df_AD_M01['high_length'] == False)])

# Precision calculation
precision = float(TP) / (TP + FP)
precision

f1_score = 2 * (precision * recall) / (precision+recall)
f1_score

print(proportion_destroyed_length_or_resource, precision, recall, f1_score)

(0.7760620410404228, 0.2877060858945654, 0.9382790776855127, 0.4403782507928032)


In [23]:
df_AD_A03 = pd.read_csv('../data_AD/anomalyrate0.3/allocatedJobs.csv')
df_AD_M03 = pd.read_csv('../data_AD/anomalyrate0.3/destroyedJobs.csv')

In [24]:
df_AD_A03['job ressource requirement'] = df_AD_A03['job ressource requirement'].apply(convert_to_list)
df_AD_M03['job ressource requirement'] = df_AD_M03['job ressource requirement'].apply(convert_to_list)

df_AD_A03['high_resource'] = df_AD_A03.apply(has_high_resource, axis=1)
df_AD_M03['high_resource'] = df_AD_M03.apply(has_high_resource, axis=1)

df_AD_A03['high_length'] = df_AD_A03.apply(has_high_length, axis=1)
df_AD_M03['high_length'] = df_AD_M03.apply(has_high_length, axis=1)

df_AD_A03['high_length_or_resource'] = df_AD_A03['high_length'] | df_AD_A03['high_resource']
df_AD_M03['high_length_or_resource'] = df_AD_M03['high_length'] | df_AD_M03['high_resource']

allocated_high_length_or_resource = df_AD_A03['high_length_or_resource'].sum()
destroyed_high_length_or_resource = df_AD_M03['high_length_or_resource'].sum()

# Calculate the total number of jobs that satisfy either condition
total_high_length_or_resource = allocated_high_length_or_resource + destroyed_high_length_or_resource

# Calculate the proportion of these jobs that were destroyed
proportion_destroyed_length_or_resource = float(destroyed_high_length_or_resource) / total_high_length_or_resource

TN = len(df_AD_A03[(df_AD_A03['high_resource'] == False) & (df_AD_A03['high_length'] == False)])

# False negatives (anomalous jobs incorrectly identified as non-anomalies)
FN = len(df_AD_A03[(df_AD_A03['high_resource'] == True) | (df_AD_A03['high_length'] == True)])

# Recall calculation
recall = float(TP) / (TP + FN)
recall

TP = len(df_AD_M03[(df_AD_M03['high_resource'] == True) | (df_AD_M03['high_length'] == True)])

# False positives (non-anomalous jobs incorrectly identified as anomalies)
FP = len(df_AD_M03[(df_AD_M03['high_resource'] == False) & (df_AD_M03['high_length'] == False)])

# Precision calculation
precision = float(TP) / (TP + FP)
precision

f1_score = 2 * (precision * recall) / (precision+recall)
f1_score

print(proportion_destroyed_length_or_resource, precision, recall, f1_score)

(0.8814262805395717, 0.7624228248157315, 0.7499171032561841, 0.7561182583748092)


In [25]:
df_AD_A05 = pd.read_csv('../data_AD/anomalyrate0.5/allocatedJobs.csv')
df_AD_M05 = pd.read_csv('../data_AD/anomalyrate0.5/destroyedJobs.csv')

In [26]:
df_AD_A05['job ressource requirement'] = df_AD_A05['job ressource requirement'].apply(convert_to_list)
df_AD_M05['job ressource requirement'] = df_AD_M05['job ressource requirement'].apply(convert_to_list)

df_AD_A05['high_resource'] = df_AD_A05.apply(has_high_resource, axis=1)
df_AD_M05['high_resource'] = df_AD_M05.apply(has_high_resource, axis=1)

df_AD_A05['high_length'] = df_AD_A05.apply(has_high_length, axis=1)
df_AD_M05['high_length'] = df_AD_M05.apply(has_high_length, axis=1)

df_AD_A05['high_length_or_resource'] = df_AD_A05['high_length'] | df_AD_A05['high_resource']
df_AD_M05['high_length_or_resource'] = df_AD_M05['high_length'] | df_AD_M05['high_resource']

allocated_high_length_or_resource = df_AD_A05['high_length_or_resource'].sum()
destroyed_high_length_or_resource = df_AD_M05['high_length_or_resource'].sum()

# Calculate the total number of jobs that satisfy either condition
total_high_length_or_resource = allocated_high_length_or_resource + destroyed_high_length_or_resource

# Calculate the proportion of these jobs that were destroyed
proportion_destroyed_length_or_resource = float(destroyed_high_length_or_resource) / total_high_length_or_resource

TN = len(df_AD_A05[(df_AD_A05['high_resource'] == False) & (df_AD_A05['high_length'] == False)])

# False negatives (anomalous jobs incorrectly identified as non-anomalies)
FN = len(df_AD_A05[(df_AD_A05['high_resource'] == True) | (df_AD_A05['high_length'] == True)])

# Recall calculation
recall = float(TP) / (TP + FN)
recall

TP = len(df_AD_M05[(df_AD_M05['high_resource'] == True) | (df_AD_M05['high_length'] == True)])

# False positives (non-anomalous jobs incorrectly identified as anomalies)
FP = len(df_AD_M05[(df_AD_M05['high_resource'] == False) & (df_AD_M05['high_length'] == False)])

# Precision calculation
precision = float(TP) / (TP + FP)
precision

f1_score = 2 * (precision * recall) / (precision+recall)
f1_score

print(proportion_destroyed_length_or_resource, precision, recall, f1_score)

(0.9440150765247849, 0.9415953063588205, 0.9050463306751041, 0.9229591274349627)
