# Experimental Setup of the paper
## 'Class Prior Estimation in Active Positive and Unlabeled Learning', IJCAI 2020

In [None]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

from scipy.io import arff
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random, math
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KernelDensity
from multiprocessing import Pool, freeze_support, cpu_count
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
import collections
from anomatools.models import SSDO
from sklearn.ensemble import IsolationForest
from TIcE import *
from Kernel_MPE_grad_threshold import *
from evaluate_CAPe import *
from CAPe import *

def ignore_warnings():
    """Install 'ignore' filters for FutureWarning and for the base Warning category."""
    import warnings

    # Same two filters, registered in the same order as before.
    for category in (FutureWarning, Warning):
        warnings.simplefilter(action='ignore', category=category)


# Apply the filters right away.
ignore_warnings()

#### In order to reproduce the experiments in the paper, run the following code for each of the benchmark datasets.

# WBC_norm_v02

In [None]:
# Seed NumPy's global RNG before anything else in this cell runs.
np.random.seed(331)

# Load the ARFF file (adjust the path to wherever the dataset is stored).
data = arff.loadarff('../files/csvfiles/Datasets/WBC_norm_v02.arff')
df = pd.DataFrame(data[0])

# The 'outlier' column holds bytes; decode it to text in place.
df['outlier'] = [raw.decode("utf-8") for raw in df['outlier'].values]

# Binary labels: 1 where 'outlier' equals 'yes', 0 otherwise.
y = np.array([int(label == 'yes') for label in df['outlier'].values], dtype=int)
real_cont = sum(y) / y.size          # fraction of examples labelled 1
class_prior = 1 - real_cont          # complementary fraction
data = df[df.columns[:9]].values     # first 9 columns, used as the input matrix

# --- experiment configuration ---
tmp_cont = 0.1      # first bet of 1 - class prior
k = 5               # number of new labels at each querying iteration
label_budget = min(150, 0.5 * len(data))
ntimes = int(label_budget // k)      # number of querying iterations
case = 2            # 0 = perfect oracle, 2 = imperfect oracle
name_ds = 'WBC'     # name attached to the final result
n_splits = 5        # splits in (stratified) cross-validation
n_iter = 5          # number of runs of the whole method

# Run the experiments. Per the original notes, this saves 2 files: F1 results and prior results.
df1 = get_f1scores_wdiff_priors(
    data, y, real_cont, tmp_cont, k, ntimes, name_ds, case, n_splits, n_iter
)

# Shuttle_withoutdupl_norm_v02

In [None]:
# Seed NumPy's global RNG before anything else in this cell runs.
np.random.seed(331)

# Load the ARFF file (adjust the path to wherever the dataset is stored).
data = arff.loadarff('../files/csvfiles/Datasets/Shuttle_withoutdupl_norm_v02.arff')
df = pd.DataFrame(data[0])

# The 'outlier' column holds bytes; decode it to text in place.
df['outlier'] = [raw.decode("utf-8") for raw in df['outlier'].values]

# Binary labels: 1 where 'outlier' equals 'yes', 0 otherwise.
y = np.array([int(label == 'yes') for label in df['outlier'].values], dtype=int)
real_cont = sum(y) / y.size          # fraction of examples labelled 1
class_prior = 1 - real_cont          # complementary fraction
data = df[df.columns[:9]].values     # first 9 columns, used as the input matrix

# Reduce the size: keep only the second index array of the first stratified split.
# NOTE(review): real_cont and class_prior above are computed on the full data set,
# before this subsampling - confirm that is intended.
skf2 = StratifiedKFold(n_splits=2, random_state=331, shuffle=True)
_, index = next(skf2.split(data, y))
data = data[index]
y = y[index]

# --- experiment configuration ---
tmp_cont = 0.1          # first bet of 1 - class prior
k = 5                   # number of new labels at each querying iteration
label_budget = min(150, 0.5 * len(data))
ntimes = int(label_budget // k)      # number of querying iterations
case = 2                # 0 = perfect oracle, 2 = imperfect oracle
name_ds = 'Shuttle'     # name attached to the final result
n_splits = 5            # splits in (stratified) cross-validation
n_iter = 5              # number of runs of the whole method

# Run the experiments. Per the original notes, this saves 2 files: F1 results and prior results.
df1 = get_f1scores_wdiff_priors(
    data, y, real_cont, tmp_cont, k, ntimes, name_ds, case, n_splits, n_iter
)

# Annthyroid_withoutdupl_norm_07

In [None]:
# Seed NumPy's global RNG before anything else in this cell runs.
np.random.seed(331)

# Load the ARFF file (adjust the path to wherever the dataset is stored).
data = arff.loadarff('../files/csvfiles/Datasets/Annthyroid_withoutdupl_norm_07.arff')
df = pd.DataFrame(data[0])

# The 'outlier' column holds bytes; decode it to text in place.
df['outlier'] = [raw.decode("utf-8") for raw in df['outlier'].values]

# Binary labels: 1 where 'outlier' equals 'yes', 0 otherwise.
y = np.array([int(label == 'yes') for label in df['outlier'].values], dtype=int)
real_cont = sum(y) / y.size          # fraction of examples labelled 1
class_prior = 1 - real_cont          # complementary fraction
data = df[df.columns[:21]].values    # first 21 columns, used as the input matrix

# Reduce the size: keep only the second index array of the first stratified split.
# NOTE(review): real_cont and class_prior above are computed on the full data set,
# before this subsampling - confirm that is intended.
skf2 = StratifiedKFold(n_splits=10, random_state=331, shuffle=True)
_, index = next(skf2.split(data, y))
data = data[index]
y = y[index]

# --- experiment configuration ---
tmp_cont = 0.1             # first bet of 1 - class prior
k = 5                      # number of new labels at each querying iteration
label_budget = min(150, 0.5 * len(data))
ntimes = int(label_budget // k)      # number of querying iterations
case = 2                   # 0 = perfect oracle, 2 = imperfect oracle
name_ds = 'Annthyroid'     # name attached to the final result
n_splits = 5               # splits in (stratified) cross-validation
n_iter = 5                 # number of runs of the whole method

# Run the experiments. Per the original notes, this saves 2 files: F1 results and prior results.
df1 = get_f1scores_wdiff_priors(
    data, y, real_cont, tmp_cont, k, ntimes, name_ds, case, n_splits, n_iter
)

# WDBC_withoutdupl_norm_v02

In [None]:
# Seed NumPy's global RNG before anything else in this cell runs.
np.random.seed(331)

# Load the ARFF file (adjust the path to wherever the dataset is stored).
data = arff.loadarff('../files/csvfiles/Datasets/WDBC_withoutdupl_norm_v02.arff')
df = pd.DataFrame(data[0])

# The 'outlier' column holds bytes; decode it to text in place.
df['outlier'] = [raw.decode("utf-8") for raw in df['outlier'].values]

# Binary labels: 1 where 'outlier' equals 'yes', 0 otherwise.
y = np.array([int(label == 'yes') for label in df['outlier'].values], dtype=int)
real_cont = sum(y) / y.size          # fraction of examples labelled 1
class_prior = 1 - real_cont          # complementary fraction
data = df[df.columns[:30]].values    # first 30 columns, used as the input matrix

# --- experiment configuration ---
tmp_cont = 0.1       # first bet of 1 - class prior
k = 5                # number of new labels at each querying iteration
label_budget = min(150, 0.5 * len(data))
ntimes = int(label_budget // k)      # number of querying iterations
case = 2             # 0 = perfect oracle, 2 = imperfect oracle
name_ds = 'WDBC'     # name attached to the final result
n_splits = 5         # splits in (stratified) cross-validation
n_iter = 5           # number of runs of the whole method

# Run the experiments. Per the original notes, this saves 2 files: F1 results and prior results.
df1 = get_f1scores_wdiff_priors(
    data, y, real_cont, tmp_cont, k, ntimes, name_ds, case, n_splits, n_iter
)

# Stamps_withoutdupl_norm_09

In [None]:
# Seed NumPy's global RNG before anything else in this cell runs.
np.random.seed(331)

# Load the ARFF file (adjust the path to wherever the dataset is stored).
data = arff.loadarff('../files/csvfiles/Datasets/Stamps_withoutdupl_norm_09.arff')
df = pd.DataFrame(data[0])

# The 'outlier' column holds bytes; decode it to text in place.
df['outlier'] = [raw.decode("utf-8") for raw in df['outlier'].values]

# Binary labels: 1 where 'outlier' equals 'yes', 0 otherwise.
y = np.array([int(label == 'yes') for label in df['outlier'].values], dtype=int)
real_cont = sum(y) / y.size          # fraction of examples labelled 1
class_prior = 1 - real_cont          # complementary fraction
data = df[df.columns[:9]].values     # first 9 columns, used as the input matrix

# --- experiment configuration ---
tmp_cont = 0.1         # first bet of 1 - class prior
k = 5                  # number of new labels at each querying iteration
label_budget = min(150, 0.5 * len(data))
ntimes = int(label_budget // k)      # number of querying iterations
case = 2               # 0 = perfect oracle, 2 = imperfect oracle
name_ds = 'Stamps'     # name attached to the final result
n_splits = 5           # splits in (stratified) cross-validation
n_iter = 5             # number of runs of the whole method

# Run the experiments. Per the original notes, this saves 2 files: F1 results and prior results.
df1 = get_f1scores_wdiff_priors(
    data, y, real_cont, tmp_cont, k, ntimes, name_ds, case, n_splits, n_iter
)

# Cardiotocography_withoutdupl_norm_05_v02

In [None]:
# Seed NumPy's global RNG before anything else in this cell runs.
np.random.seed(331)

# Load the ARFF file (adjust the path to wherever the dataset is stored).
data = arff.loadarff('../files/csvfiles/Datasets/Cardiotocography_withoutdupl_norm_05_v02.arff')
df = pd.DataFrame(data[0])

# The 'outlier' column holds bytes; decode it to text in place.
df['outlier'] = [raw.decode("utf-8") for raw in df['outlier'].values]

# Binary labels: 1 where 'outlier' equals 'yes', 0 otherwise.
y = np.array([int(label == 'yes') for label in df['outlier'].values], dtype=int)
real_cont = sum(y) / y.size          # fraction of examples labelled 1
class_prior = 1 - real_cont          # complementary fraction
data = df[df.columns[:21]].values    # first 21 columns, used as the input matrix

# Reduce the size: keep only the second index array of the first stratified split.
# NOTE(review): real_cont and class_prior above are computed on the full data set,
# before this subsampling - confirm that is intended.
skf2 = StratifiedKFold(n_splits=4, random_state=331, shuffle=True)
_, index = next(skf2.split(data, y))
data = data[index]
y = y[index]

# --- experiment configuration ---
tmp_cont = 0.1                   # first bet of 1 - class prior
k = 5                            # number of new labels at each querying iteration
label_budget = min(150, 0.5 * len(data))
ntimes = int(label_budget // k)  # number of querying iterations
case = 2                         # 0 = perfect oracle, 2 = imperfect oracle
name_ds = 'Cardiotocography'     # name attached to the final result
n_splits = 5                     # splits in (stratified) cross-validation
n_iter = 5                       # number of runs of the whole method

# Run the experiments. Per the original notes, this saves 2 files: F1 results and prior results.
df1 = get_f1scores_wdiff_priors(
    data, y, real_cont, tmp_cont, k, ntimes, name_ds, case, n_splits, n_iter
)

# Ionosphere_withoutdupl_norm

In [None]:
# Seed NumPy's global RNG before anything else in this cell runs.
np.random.seed(331)

# Load the ARFF file (adjust the path to wherever the dataset is stored).
data = arff.loadarff('../files/csvfiles/Datasets/Ionosphere_withoutdupl_norm.arff')
df = pd.DataFrame(data[0])

# The 'outlier' column holds bytes; decode it to text in place.
df['outlier'] = [raw.decode("utf-8") for raw in df['outlier'].values]

# Binary labels: 1 where 'outlier' equals 'yes', 0 otherwise.
y = np.array([int(label == 'yes') for label in df['outlier'].values], dtype=int)
real_cont = sum(y) / y.size          # fraction of examples labelled 1
class_prior = 1 - real_cont          # complementary fraction
data = df[df.columns[:32]].values    # first 32 columns, used as the input matrix

# --- experiment configuration ---
tmp_cont = 0.1             # first bet of 1 - class prior
k = 5                      # number of new labels at each querying iteration
label_budget = min(150, 0.5 * len(data))
ntimes = int(label_budget // k)      # number of querying iterations
case = 2                   # 0 = perfect oracle, 2 = imperfect oracle
name_ds = 'Ionosphere'     # name attached to the final result
n_splits = 5               # splits in (stratified) cross-validation
n_iter = 5                 # number of runs of the whole method

# Run the experiments. Per the original notes, this saves 2 files: F1 results and prior results.
df1 = get_f1scores_wdiff_priors(
    data, y, real_cont, tmp_cont, k, ntimes, name_ds, case, n_splits, n_iter
)

# Pima_withoutdupl_norm_20_v02

In [None]:
# Seed NumPy's global RNG before anything else in this cell runs.
np.random.seed(331)

# Load the ARFF file (adjust the path to wherever the dataset is stored).
data = arff.loadarff('../files/csvfiles/Datasets/Pima_withoutdupl_norm_20_v02.arff')
df = pd.DataFrame(data[0])

# The 'outlier' column holds bytes; decode it to text in place.
df['outlier'] = [raw.decode("utf-8") for raw in df['outlier'].values]

# Binary labels: 1 where 'outlier' equals 'yes', 0 otherwise.
y = np.array([int(label == 'yes') for label in df['outlier'].values], dtype=int)
real_cont = sum(y) / y.size          # fraction of examples labelled 1
class_prior = 1 - real_cont          # complementary fraction
data = df[df.columns[:8]].values     # first 8 columns, used as the input matrix

# --- experiment configuration ---
tmp_cont = 0.1       # first bet of 1 - class prior
k = 5                # number of new labels at each querying iteration
label_budget = min(150, 0.5 * len(data))
ntimes = int(label_budget // k)      # number of querying iterations
case = 2             # 0 = perfect oracle, 2 = imperfect oracle
name_ds = 'Pima'     # name attached to the final result
n_splits = 5         # splits in (stratified) cross-validation
n_iter = 5           # number of runs of the whole method

# Run the experiments. Per the original notes, this saves 2 files: F1 results and prior results.
df1 = get_f1scores_wdiff_priors(
    data, y, real_cont, tmp_cont, k, ntimes, name_ds, case, n_splits, n_iter
)

# PageBlocks_norm_10

In [None]:
# Seed NumPy's global RNG before anything else in this cell runs.
np.random.seed(331)

# Load the ARFF file (adjust the path to wherever the dataset is stored).
data = arff.loadarff('../files/csvfiles/Datasets/PageBlocks_norm_10.arff')
df = pd.DataFrame(data[0])

# The 'outlier' column holds bytes; decode it to text in place.
df['outlier'] = [raw.decode("utf-8") for raw in df['outlier'].values]

# Binary labels: 1 where 'outlier' equals 'yes', 0 otherwise.
y = np.array([int(label == 'yes') for label in df['outlier'].values], dtype=int)
real_cont = sum(y) / y.size          # fraction of examples labelled 1
class_prior = 1 - real_cont          # complementary fraction
data = df[df.columns[:10]].values    # first 10 columns, used as the input matrix

# Reduce the size: keep only the second index array of the first stratified split.
# NOTE(review): real_cont and class_prior above are computed on the full data set,
# before this subsampling - confirm that is intended.
skf2 = StratifiedKFold(n_splits=13, random_state=331, shuffle=True)
_, index = next(skf2.split(data, y))
data = data[index]
y = y[index]

# --- experiment configuration ---
tmp_cont = 0.1             # first bet of 1 - class prior
k = 5                      # number of new labels at each querying iteration
label_budget = min(150, 0.5 * len(data))
ntimes = int(label_budget // k)      # number of querying iterations
case = 2                   # 0 = perfect oracle, 2 = imperfect oracle
name_ds = 'PageBlocks'     # name attached to the final result
n_splits = 5               # splits in (stratified) cross-validation
n_iter = 5                 # number of runs of the whole method

# Run the experiments. Per the original notes, this saves 2 files: F1 results and prior results.
df1 = get_f1scores_wdiff_priors(
    data, y, real_cont, tmp_cont, k, ntimes, name_ds, case, n_splits, n_iter
)