## Preparation of real world data

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import math
from scipy.io import arff

In [3]:
def target_encoder(df, column, target, index=None):
    """Target-encode a categorical column with per-category means of the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``column`` and ``target``.
    column : str
        Name of the categorical column to encode.
    target : str
        Name of the column whose mean is computed per category.
    index : sequence of int, optional
        Positional (``iloc``) row numbers used to compute the category means.
        If None, every row of ``df`` is used.

    Returns
    -------
    pandas.Series
        ``df[column]`` with each category replaced by its mean target value.
        Categories that do not occur in the selected rows map to NaN.
    """
    # Encode the entire dataframe if no specific indices are supplied. The frame is
    # used directly here: passing df.index (labels) to iloc (positions) only works
    # when the frame happens to carry a default 0..n-1 index.
    fit_rows = df if index is None else df.iloc[index]
    category_means = fit_rows.groupby(column)[target].mean()

    return df[column].map(category_means)

def random_sample_data(x, y, percentage, random_state=42):
    """Draw a reproducible random subset of feature rows together with their labels.

    Parameters
    ----------
    x : array-like
        Feature rows; wrapped in a new DataFrame.
    y : array-like
        One label per row of ``x``; stored in a column named ``'y'``.
    percentage : float
        Fraction of rows to draw, passed to ``DataFrame.sample(frac=...)``.
    random_state : int, optional
        Seed passed to ``DataFrame.sample``. Defaults to 42, the value that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, with the feature columns followed by ``'y'``.
    """
    df = pd.DataFrame(x)
    df['y'] = y

    return df.sample(frac=percentage, random_state=random_state)

### Prepare Insects abrupt data

- Drifts at instance indices: 14,352; 19,500; 33,240; 38,682; 39,510

Souza, V. M. A., dos Reis, D. M., Maletzke, A. G., & Batista, G. E. A. P. A. (2020). Challenges in benchmarking stream learning algorithms with real-world data. Data Mining and Knowledge Discovery, 34(6), 1805–1858. https://doi.org/10.1007/s10618-020-00698-5

In [50]:
# loadarff returns a (records, metadata) pair; only the records go into the frame
insects_records, insects_meta = arff.loadarff("../Data_Pool/paper_data/INSECTS-abrupt_balanced_norm.arff")
data = pd.DataFrame(insects_records)
data.head()

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Att25,Att26,Att27,Att28,Att29,Att30,Att31,Att32,Att33,class
0,0.507066,0.153333,0.226092,0.302447,0.007239,0.36912,0.332436,0.017807,0.032819,0.033009,...,0.006855,0.017358,0.01343,0.009138,0.006768,0.007291,0.009224,0.036218,0.162955,b'ae-albopictus-female'
1,0.281661,0.355953,0.253196,0.340335,0.415631,0.503923,0.392029,0.003648,0.068381,0.011155,...,0.005631,0.014048,0.002431,0.007076,0.037682,0.003089,0.004207,0.004144,0.005044,b'ae-albopictus-female'
2,0.19375,0.257782,0.183339,0.247017,0.302133,0.363522,0.269729,0.293543,0.293002,0.029522,...,0.023837,0.013922,0.081406,0.413674,0.295615,0.120392,0.036566,0.032652,0.025776,b'cx-quinq-female'
3,0.514782,0.154867,0.016903,0.226084,0.297642,0.239111,0.248268,0.066745,0.11502,0.083407,...,0.020949,0.023019,0.021147,0.020813,0.019048,0.011606,0.013379,0.044839,0.123552,b'ae-albopictus-female'
4,0.774337,0.012549,0.105751,0.033302,0.01717,0.049754,0.1735,0.05522,0.044184,0.034923,...,0.034876,0.060708,0.048119,0.027417,0.015022,0.010218,0.008121,0.012539,0.018058,b'ae-aegypti-male'


In [51]:
# rename the target column to "label", then replace its class values with integer codes
le = LabelEncoder()
data = data.rename(columns={"class": "label"})
data['label'] = le.fit_transform(data['label'])

In [52]:
# Keep a random 10% of the initial training batch (the first 5% of the stream)
# for later retraining purposes.
initial_batch_size = math.trunc(len(data) * 0.05)
initial_batch = data.iloc[:initial_batch_size]
x = initial_batch.iloc[:, :-1].values  # every column except the last (the label)
y = initial_batch.iloc[:, -1].values   # the last column (the label)

data_sample = random_sample_data(x, y, percentage=0.1)
# the sample comes back with positional column names plus 'y'; restore the original names
data_sample.columns = data.columns

In [27]:
# training set: the leading 5% of the whole dataset
train_size = math.trunc(0.05 * len(data))
data_train = data.iloc[:train_size]

In [28]:
# create validation set (10% size of whole dataset, containing one drift from test set).
# Size of validation set is therefore 22142 - 16858 = 5284 rows.
# The window is chosen so that the second drift lies in the middle of the validation set.
# NOTE(review): the original comment placed the drift at offset 2647 within the validation set,
# but with the second drift at 19500 the offset is 19500 - 16858 = 2642 -- confirm which is intended.
data_val = data[16858:22142]
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the replacement.
data_train_and_validate = pd.concat([data_train, data_val])

In [29]:
# Disabled export: uncomment to write the full stream, the train+validation subset and the
# 10% retraining sample to ./Data_prep/ as CSV (without the index column).
# The feature-reduction cell further down reads these same three paths.
#data.to_csv("./Data_prep/insects_abrupt_train_test.csv", index = False)
#data_train_and_validate.to_csv("./Data_prep/insects_abrupt_train_val.csv", index = False)
#data_sample.to_csv("./Data_prep/insects_abrupt_10_sample.csv", index = False)

### Prepare Insects abrupt data (with feature reduction)

In [42]:
# columns to drop, determined by Shapley Contributions in Feature_Reduction.ipynb
dropped_features = [
    'Att5', 'Att6', 'Att8', 'Att10', 'Att11', 'Att12', 'Att13',
    'Att14', 'Att15', 'Att25', 'Att31', 'Att32', 'Att33',
]

# reload the three CSV files of the abrupt stream
data = pd.read_csv("./Data_prep/insects_abrupt_train_test.csv", encoding='cp1252')
data_train_and_validate = pd.read_csv("./Data_prep/insects_abrupt_train_val.csv", encoding='cp1252')
data_sample = pd.read_csv("./Data_prep/insects_abrupt_10_sample.csv", encoding='cp1252')

# remove the same features from every frame, in place
for frame in (data, data_train_and_validate, data_sample):
    frame.drop(columns=dropped_features, inplace=True)

#data.to_csv("./Data_prep/insects_abrupt_train_test_red.csv", index = False)
#data_train_and_validate.to_csv("./Data_prep/insects_abrupt_train_val_red.csv", index = False)
#data_sample.to_csv("./Data_prep/insects_abrupt_10_sample_red.csv", index = False)

### Prepare Insects incremental-abrupt-reoccurring data (balanced)

- Drifts at instance indices: 26,568; 53,364

Souza, V. M. A., dos Reis, D. M., Maletzke, A. G., & Batista, G. E. A. P. A. (2020). Challenges in benchmarking stream learning algorithms with real-world data. Data Mining and Knowledge Discovery, 34(6), 1805–1858. https://doi.org/10.1007/s10618-020-00698-5

In [18]:
# loadarff returns a (records, metadata) pair; only the records go into the frame
insects_records, insects_meta = arff.loadarff("../Data_Pool/paper_data/INSECTS-incremental-abrupt_balanced_norm.arff")
data = pd.DataFrame(insects_records)

In [19]:
# rename the target column to "label", then replace its class values with integer codes
le = LabelEncoder()
data = data.rename(columns={"class": "label"})
data['label'] = le.fit_transform(data['label'])
data.describe()

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Att25,Att26,Att27,Att28,Att29,Att30,Att31,Att32,Att33,label
count,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0,...,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0
mean,0.279467,0.248287,0.187459,0.221078,0.245371,0.309888,0.313936,0.048897,0.061732,0.054235,...,0.0461,0.03293,0.035914,0.044495,0.030234,0.046209,0.033957,0.038701,0.045209,2.5
std,0.129473,0.130165,0.104216,0.122271,0.158116,0.178702,0.113951,0.0519,0.055487,0.054829,...,0.067432,0.047196,0.0505,0.063509,0.044491,0.070968,0.054721,0.061966,0.069333,1.707836
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.181397,0.141399,0.148862,0.170729,0.06093,0.23396,0.232507,0.01393,0.022512,0.016756,...,0.009764,0.006439,0.007165,0.008902,0.006229,0.010307,0.00767,0.00967,0.011587,1.0
50%,0.266227,0.235301,0.198291,0.233794,0.277366,0.335274,0.288813,0.03239,0.047512,0.03788,...,0.019807,0.013495,0.014932,0.018254,0.012198,0.018496,0.013458,0.016119,0.019405,2.5
75%,0.344016,0.338492,0.252668,0.29453,0.346366,0.415892,0.370799,0.065146,0.082929,0.072373,...,0.049517,0.037112,0.041653,0.050271,0.032868,0.045663,0.03153,0.03587,0.043744,4.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0


In [20]:
# Keep a random 10% of the initial training batch (the first 5% of the stream)
# for later retraining purposes.
initial_batch_size = math.trunc(len(data) * 0.05)
initial_batch = data.iloc[:initial_batch_size]
x = initial_batch.iloc[:, :-1].values  # every column except the last (the label)
y = initial_batch.iloc[:, -1].values   # the last column (the label)

data_sample = random_sample_data(x, y, percentage=0.1)
# the sample comes back with positional column names plus 'y'; restore the original names
data_sample.columns = data.columns

In [21]:
# training set: the leading 5% of the whole dataset
train_size = math.trunc(0.05 * len(data))
data_train = data.iloc[:train_size]

In [22]:
# create validation set (about 10% size of whole dataset: 30000 - 22000 = 8000 rows),
# containing one drift from test set. Drift in validation set at 26568 - 22000 = 4568.
# NOTE(review): the original comment says the *second* drift is placed in the *middle* of the
# validation set, but 26568 is the first listed drift and offset 4568 is not the midpoint of
# the 8000-row window -- confirm the intended wording.
data_val = data[22000:30000] #drift in validation set at 4568
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the replacement.
data_train_and_validate = pd.concat([data_train, data_val])

In [23]:
# Disabled export: uncomment to write the full stream, the train+validation subset and the
# 10% retraining sample to ./Data_prep/ as CSV (without the index column).
# The feature-reduction cell further down reads these same three paths.
#data.to_csv("./Data_prep/insects_inc_abrupt_train_test.csv", index = False)
#data_train_and_validate.to_csv("./Data_prep/insects_inc_abrupt_train_val.csv", index = False)
#data_sample.to_csv("./Data_prep/insects_inc_abrupt_10_sample.csv", index = False)

### Prepare Insects incremental-abrupt-reoccurring data (balanced, with feature reduction) 

- Drifts at instance indices: 26,568; 53,364

Souza, V. M. A., dos Reis, D. M., Maletzke, A. G., & Batista, G. E. A. P. A. (2020). Challenges in benchmarking stream learning algorithms with real-world data. Data Mining and Knowledge Discovery, 34(6), 1805–1858. https://doi.org/10.1007/s10618-020-00698-5

In [24]:
# features removed for the reduced variant of the incremental-abrupt stream
dropped_features = [
    'Att5', 'Att9', 'Att10', 'Att11', 'Att12', 'Att13', 'Att14',
    'Att19', 'Att23', 'Att24', 'Att29', 'Att31', 'Att33',
]

# reload the three CSV files of the incremental-abrupt stream
data = pd.read_csv("./Data_prep/insects_inc_abrupt_train_test.csv", encoding='cp1252')
data_train_and_validate = pd.read_csv("./Data_prep/insects_inc_abrupt_train_val.csv", encoding='cp1252')
data_sample = pd.read_csv("./Data_prep/insects_inc_abrupt_10_sample.csv", encoding='cp1252')

# remove the same features from every frame, in place
for frame in (data, data_train_and_validate, data_sample):
    frame.drop(columns=dropped_features, inplace=True)

In [25]:
# Disabled export: uncomment to write the feature-reduced full stream, train+validation subset
# and 10% retraining sample to ./Data_prep/ as "*_red.csv" files (without the index column).
#data.to_csv("./Data_prep/insects_inc_abrupt_train_test_red.csv", index = False)
#data_train_and_validate.to_csv("./Data_prep/insects_inc_abrupt_train_val_red.csv", index = False)
#data_sample.to_csv("./Data_prep/insects_inc_abrupt_10_sample_red.csv", index = False)