# UCDD on spam evaluation

## Load and preprocess the spam dataset

In [3]:
import numpy as np
import pandas as pd

In [12]:
from sklearn.preprocessing import MinMaxScaler


# Load the spam dataset and split it positionally into a reference part
# (the first N_REFERENCE_ROWS rows) and a testing part (all remaining rows).
# The row order of the CSV is preserved; nothing is shuffled here.
N_REFERENCE_ROWS = 1468  # single source of truth for the reference/test split point

df = pd.read_csv("../Datasets_concept_drift/real_world_data/spam_dataset.csv")

print('df columns')
print(df.columns)

# Features are every column except 'Unnamed: 0' and the label column.
# NOTE(review): 'Unnamed: 0' is presumably a row index saved by an earlier
# to_csv call -- confirm against the dataset.
X = df.drop(columns=['Unnamed: 0', 'ACTUAL_LABEL'])
y = df["ACTUAL_LABEL"]

# .iloc makes the split explicitly positional (independent of index labels),
# and each name is bound once, directly to its final ndarray.
X_ref = X.iloc[:N_REFERENCE_ROWS].to_numpy()
X_test = X.iloc[N_REFERENCE_ROWS:].to_numpy()
# Labels are kept as column vectors of shape (n_rows, 1).
y_ref = y.iloc[:N_REFERENCE_ROWS].to_numpy().reshape(-1, 1)
y_test = y.iloc[N_REFERENCE_ROWS:].to_numpy().reshape(-1, 1)

print('\nreference data')
print(X_ref.shape)
print(X_ref)
print('\nreference labels')
print(y_ref.shape)
print('\ntesting data')
print(X_test.shape)
print(X_test)
print('\ntesting labels')
print(y_test.shape)

df columns
Index(['Unnamed: 0', 'aa', 'aba', 'abandon', 'abb', 'abbey', 'abbrevi',
       'abdomin', 'abduct', 'aberr',
       ...
       'zig', 'zimbabw', 'zinc', 'zip', 'zodiac', 'zone', 'zoo', 'zoom',
       'zowi', 'ACTUAL_LABEL'],
      dtype='object', length=10729)
columns with all same values
Index(['aba', 'abbey', 'abdomin', 'aberr', 'abid', 'abolish', 'abolit',
       'abort', 'abridg', 'absent',
       ...
       'yoga', 'yon', 'yorker', 'yuck', 'za', 'zed', 'zee', 'zodiac', 'zoo',
       'zowi'],
      dtype='object', length=2728)

reference data
(1468, 10727)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

reference labels
(1468, 1)

testing data
(2937, 10727)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

testing labels
(2937, 1)


## Split the dataset into batches

### Size=100

In [5]:
from eval_helpers import helpers

# Cut the testing and reference sets into fixed-size batches of 100 rows.
# Per the recorded output, rows that do not fill a whole batch are left out
# (2937 -> 2900 testing rows, 1468 -> 1400 reference rows).
X_test_batches_100, y_test_batches_100 = helpers.split_to_fixed_size_batches(
    X_test, y_test, batch_size=100
)
X_ref_batches_100, y_ref_batches_100 = helpers.split_to_fixed_size_batches(
    X_ref, y_ref, batch_size=100
)

# Report the batch counts for features and labels, one value per line.
print('# test batches 100:', len(X_test_batches_100), len(y_test_batches_100), sep='\n')
print('# ref batches 100:', len(X_ref_batches_100), len(y_ref_batches_100), sep='\n')

chunk size 100
number of chunks 29
number of data 2937
shape of X that can be split perfectly (2900, 10727)
number of resulting batches 29
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(100, 10727)
chunk size 100
number of chunks 14
number of data 1468
shape of X that can be split perfectly (1400, 10727)
number of resulting batches 14
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(100, 10727)
# test batches 100:
29
29
# ref batches 100:
14
14


### Size=50

In [6]:
from eval_helpers import helpers

# Cut the testing and reference sets into fixed-size batches of 50 rows.
# Per the recorded output, rows that do not fill a whole batch are left out
# (2937 -> 2900 testing rows, 1468 -> 1450 reference rows).
X_test_batches_50, y_test_batches_50 = helpers.split_to_fixed_size_batches(
    X_test, y_test, batch_size=50
)
X_ref_batches_50, y_ref_batches_50 = helpers.split_to_fixed_size_batches(
    X_ref, y_ref, batch_size=50
)

# Report the batch counts for features and labels, one value per line.
print('# test batches 50:', len(X_test_batches_50), len(y_test_batches_50), sep='\n')
print('# ref batches 50:', len(X_ref_batches_50), len(y_ref_batches_50), sep='\n')

chunk size 50
number of chunks 58
number of data 2937
shape of X that can be split perfectly (2900, 10727)
number of resulting batches 58
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(50, 10727)
chunk size 50
number of chunks 29
number of data 1468
shape of X that can be split perfectly (1450, 10727)
number of resulting batches 29
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(50, 10727)
# test batches 50:
58
58
# ref batches 50:
29
29


### Size=20

In [7]:
from eval_helpers import helpers

# Cut the testing and reference sets into fixed-size batches of 20 rows.
# Per the recorded output, rows that do not fill a whole batch are left out
# (2937 -> 2920 testing rows, 1468 -> 1460 reference rows).
X_test_batches_20, y_test_batches_20 = helpers.split_to_fixed_size_batches(
    X_test, y_test, batch_size=20
)
X_ref_batches_20, y_ref_batches_20 = helpers.split_to_fixed_size_batches(
    X_ref, y_ref, batch_size=20
)

# Report the batch counts for features and labels, one value per line.
print('# test batches 20:', len(X_test_batches_20), len(y_test_batches_20), sep='\n')
print('# ref batches 20:', len(X_ref_batches_20), len(y_ref_batches_20), sep='\n')


chunk size 20
number of chunks 146
number of data 2937
shape of X that can be split perfectly (2920, 10727)
number of resulting batches 146
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(20, 10727)
chunk size 20
number of chunks 73
number of data 1468
shape of X that can be split perfectly (1460, 10727)
number of resulting batches 73
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(20, 10727)
# test batches 20:
146
146
# ref batches 20:
73
73


## Evaluation with size 100 batches

In [9]:
from core import mssw_preprocessing
from eval_helpers import kmeans_verbose_helpers

# Run the project's MSSW preprocessing on the size-100 reference and testing
# batches. Only the first of the three returned values is kept; the other two
# are discarded here.
# NOTE(review): the recorded output of this cell ends with a warning that
# points at `np.divide(reference_data, column_sum)` inside the helper.
# Presumably some feature columns sum to zero (the feature matrix printed
# earlier is mostly zeros) -- confirm inside mssw_preprocess and check the
# result for NaN/inf before using it.
weighted_joined_reference_data, _, _ = mssw_preprocessing.mssw_preprocess(X_ref_batches_100, X_test_batches_100)

something


  return np.divide(reference_data, column_sum)
