# Bank marketing data set

In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split


# Load the semicolon-separated bank marketing data and drop incomplete rows.
data = pd.read_csv("../../Datasets/bank/bank-full.csv", sep=';').dropna()
# One-hot encode every categorical column; the remaining columns stay as they are.
data = pd.get_dummies(data, columns=['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'day', 'month', 'poutcome'])
# Turn the target column into integer category codes.
data['y'] = data['y'].astype('category').cat.codes
y = data.pop('y').values.astype(int)
X = data.values.astype('float32')
print(np.unique(y, return_counts=True), len(y))

# Random downsample majority class
# The number of majority rows to keep is derived from the data instead of
# being hard-coded, and the sampler is seeded so that the balanced subset is
# identical on every run.
n_minority = int((y == 1).sum())
majority_ids = np.arange(len(y))[y == 0].tolist()
sample = random.Random(2020).sample(majority_ids, n_minority)
# Both concatenations index with the ORIGINAL y, so y must be rebuilt last.
X = np.concatenate((X[sample,], X[y==1,]))
y = np.concatenate((y[sample], y[y==1]))
# axis=0: every feature column is rescaled to unit L2 norm.
X = normalize(X, norm='l2', axis=0)
print(X.shape, len(y))
print(np.unique(y, return_counts=True), len(y))

(array([0, 1]), array([39922,  5289])) 45211
(10578, 81) 10578
(array([0, 1]), array([5289, 5289])) 10578


# Save the balanced bank marketing data

In [26]:
# Persist the feature matrix together with its labels as a single CSV file.
# X and y are taken from the notebook namespace as left by an earlier cell.
out_path = "../../Datasets/bank/bank-data.csv"
data = pd.DataFrame(X)
data['y'] = y  # labels go into an extra column named 'y'
data.to_csv(out_path)

# Banknote data set 

In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split


# Banknote authentication data: headerless CSV, class label in column 4.
data = pd.read_csv(
    "../../Datasets/banknote_authentication.txt",
    sep=',',
    header=None,
)
data = data.dropna()

# Split off the label column; what remains in `data` are the features.
y = data.pop(4).values
y = y.astype(int)
X = data.values.astype(np.float32)

# axis=0: every feature column is rescaled to unit L2 norm.
X = normalize(X, norm='l2', axis=0)

class_counts = np.unique(y, return_counts=True)
print(class_counts, len(y), X.shape)

(array([0, 1]), array([762, 610])) 1372 (1372, 4)


In [2]:
from sklearn.model_selection import train_test_split

# Index lists for 20 random 70/30 train/test partitions of the samples in y.
ids_train = []
ids_test = []

for i in range(20):
    # Each repetition is seeded with its own index: the 20 partitions differ
    # from one another but can be regenerated exactly on a re-run.
    id_train, id_test = train_test_split(np.arange(len(y)), test_size=.3, random_state=i)
    ids_train.append(id_train)
    ids_test.append(id_test)

In [3]:
# Quick look at the splits: class counts among the first ten training
# indices of each of the 20 partitions.
for i in range(20):
    first_ten = ids_train[i][:10]
    counts = np.unique(y[first_ten], return_counts=True)
    print(counts)

(array([0, 1]), array([7, 3]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([5, 5]))
(array([0, 1]), array([8, 2]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([5, 5]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([5, 5]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([3, 7]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([2, 8]))
(array([0, 1]), array([5, 5]))
(array([0, 1]), array([4, 6]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([5, 5]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([6, 4]))


In [4]:
# Store the banknote train and test index lists as compressed .npz archives.
split_files = (
    ('splits/trsplitbanknote', ids_train),
    ('splits/tstsplitbanknote', ids_test),
)
for target, split_ids in split_files:
    np.savez_compressed(target, split_ids)

# Heart disease data set

In [55]:
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split


# Cleveland heart-disease data: headerless CSV, diagnosis in column 13.
data = pd.read_csv(
    "../../Datasets/processed.cleveland.data",
    sep=',',
    header=None,
)
data = data.dropna()

y = data.pop(13).values
y = y.astype(int)
X = data.values.astype(np.float32)
# axis=0: every feature column is rescaled to unit L2 norm.
X = normalize(X, norm='l2', axis=0)

# Label distribution before binarisation.
print(np.unique(y, return_counts=True), len(y))

# Collapse every positive label into class 1 to get a binary problem.
positive = y > 0
y[positive] = 1
print(np.unique(y, return_counts=True), len(y))

(array([0, 1, 2, 3, 4]), array([160,  54,  35,  35,  13])) 297
(array([0, 1]), array([160, 137])) 297


# Weather in Australia dataset

In [8]:
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split


# Australian weather data: keep complete rows only and drop the date,
# location and wind-direction columns.
data = pd.read_csv("../../Datasets/weatherAUS.csv").dropna()
data = data.drop(['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'], axis=1)

# Encode the two Yes/No columns as 1/0.  The result is assigned back to the
# frame: calling replace(..., inplace=True) on a column selection acts on an
# intermediate object, which pandas deprecates and which leaves `data`
# untouched once Copy-on-Write is enabled.
yes_no = {'No': 0, 'Yes': 1}
data['RainToday'] = data['RainToday'].replace(yes_no)
data['RainTomorrow'] = data['RainTomorrow'].replace(yes_no)

# Target: RainTomorrow; everything left in `data` is a feature.
y = data.pop('RainTomorrow').values.astype(int)
X = data.values.astype('float32')
# axis=0: every feature column is rescaled to unit L2 norm.
X = normalize(X, norm='l2', axis=0)
print(np.unique(y, return_counts=True), len(y))

(array([0, 1]), array([43993, 12427])) 56420


In [27]:
from sklearn.model_selection import train_test_split

# Index lists for 20 random 70/30 train/test partitions of the samples in y.
ids_train = []
ids_test = []

for i in range(20):
    # Each repetition is seeded with its own index: the 20 partitions differ
    # from one another but can be regenerated exactly on a re-run.
    id_train, id_test = train_test_split(np.arange(len(y)), test_size=.3, random_state=i)
    ids_train.append(id_train)
    ids_test.append(id_test)

In [28]:
# Quick look at the splits: class counts among the first ten training
# indices of each of the 20 partitions.
for i in range(20):
    first_ten = ids_train[i][:10]
    counts = np.unique(y[first_ten], return_counts=True)
    print(counts)

(array([0, 1]), array([8, 2]))
(array([0, 1]), array([9, 1]))
(array([0, 1]), array([8, 2]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([9, 1]))
(array([0, 1]), array([9, 1]))
(array([0, 1]), array([8, 2]))
(array([0, 1]), array([8, 2]))
(array([0, 1]), array([9, 1]))
(array([0, 1]), array([8, 2]))
(array([0, 1]), array([9, 1]))
(array([0, 1]), array([9, 1]))
(array([0, 1]), array([9, 1]))
(array([0, 1]), array([7, 3]))
(array([0, 1]), array([5, 5]))
(array([0, 1]), array([8, 2]))
(array([0, 1]), array([6, 4]))
(array([0, 1]), array([8, 2]))
(array([0, 1]), array([9, 1]))
(array([0, 1]), array([9, 1]))


In [None]:
# Store the weather train and test index lists as compressed .npz archives.
np.savez_compressed('splits/trsplitweather', ids_train)
# The test split goes to its own weather file; it was previously written to
# 'splits/tstsplitbanknote', which overwrote the banknote test split.
np.savez_compressed('splits/tstsplitweather', ids_test)

# Breast cancer dataset

In [11]:
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split


# Wisconsin breast-cancer data: headerless CSV; column 0 is dropped and
# column 10 holds the class label.
data = pd.read_csv("../../Datasets/breast-cancer-wisconsin.data", sep=',', header=None).dropna()
data = data.drop(0, axis=1)

# Recode the class label from {2, 4} to {0, 1}.  The result is assigned back:
# replace(..., inplace=True) on a column selection acts on an intermediate
# object and leaves `data` untouched once pandas Copy-on-Write is enabled.
data[10] = data[10].replace({2: 0, 4: 1})

y = data.pop(10).values.astype(int)
X = data.values.astype('float32')
# axis=0: every feature column is rescaled to unit L2 norm.
X = normalize(X, norm='l2', axis=0)
print(np.unique(y, return_counts=True), len(y))

(array([0, 1]), array([444, 239])) 683


# Titanic

In [27]:
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split


# Titanic data: drop the identifier / free-text columns and 'Embarked'.
data = pd.read_csv("../../Datasets/titanic.csv")
data = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)

# Encode 'Sex' as 0 (female) / 1 (male).  The result is assigned back:
# replace(..., inplace=True) on a column selection acts on an intermediate
# object and leaves `data` untouched once pandas Copy-on-Write is enabled.
data['Sex'] = data['Sex'].replace({'female': 0, 'male': 1})

# Rows with missing values are removed only after the unused columns are
# gone, so gaps in those dropped columns do not cost any samples.
data = data.dropna()
y = data.pop('Survived').values.astype(int)
X = data.values.astype('float32')
# axis=0: every feature column is rescaled to unit L2 norm.
X = normalize(X, norm='l2', axis=0)
print(np.unique(y, return_counts=True), len(y))

(array([0, 1]), array([424, 290])) 714


# Pima

In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split


# Pima diabetes data: the 'Outcome' column is the label, the rest are features.
data = pd.read_csv("../../Datasets/diabetes.csv")
data = data.dropna()

y = data.pop('Outcome').values
y = y.astype(int)
X = data.values.astype(np.float32)
# axis=0: every feature column is rescaled to unit L2 norm.
X = normalize(X, norm='l2', axis=0)

class_counts = np.unique(y, return_counts=True)
print(class_counts, len(y))

(array([0, 1]), array([500, 268])) 768


# Sonar

In [11]:
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split


# Sonar data: headerless CSV, class label ('R' / 'M') in column 60.
data = pd.read_csv("../../Datasets/sonar.all-data", sep=',', header=None).dropna()

# Encode the label as R -> 0, M -> 1.  The result is assigned back:
# replace(..., inplace=True) on a column selection acts on an intermediate
# object and leaves `data` untouched once pandas Copy-on-Write is enabled.
data[60] = data[60].replace({'R': 0, 'M': 1})

y = data.pop(60).values.astype(int)
X = data.values.astype('float32')
# axis=0: every feature column is rescaled to unit L2 norm.
X = normalize(X, norm='l2', axis=0)
print(np.unique(y, return_counts=True), len(y))

(array([0, 1]), array([ 97, 111])) 208


# QSAR

In [22]:
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split


# QSAR biodegradation data: headerless, semicolon-separated; class label
# ('NRB' / 'RB') in column 41.
data = pd.read_csv("../../Datasets/biodeg.csv", sep=';', header=None).dropna()

# Encode the label as NRB -> 0, RB -> 1.  The result is assigned back:
# replace(..., inplace=True) on a column selection acts on an intermediate
# object and leaves `data` untouched once pandas Copy-on-Write is enabled.
data[41] = data[41].replace({'NRB': 0, 'RB': 1})

y = data.pop(41).values.astype(int)
X = data.values.astype('float32')
# axis=0: every feature column is rescaled to unit L2 norm.
X = normalize(X, norm='l2', axis=0)
print(np.unique(y, return_counts=True), len(y))

(array([0, 1]), array([699, 356])) 1055


# Save 20 random split indexes

In [23]:
from data_gen import *
from sklearn.model_selection import train_test_split


# Create and store 20 splits: a 33% test set, then 50 labeled training
# examples with the remainder of the training part kept as the unlabeled pool.
# NOTE(review): `Datasets` presumably comes from the star import of data_gen - confirm.
for i in range(20):
    print('save split : ', i)
    # The outer split is seeded with the repetition index so that the stored
    # splits can be regenerated; it was previously unseeded even though the
    # inner split below already had a fixed seed.
    id_split, id_test = train_test_split(np.arange(len(y)), test_size=.33, random_state=i)
    id_l, id_u = train_test_split(id_split, train_size=50, random_state=2020)
    # Class balance of the 50 labeled examples.
    print(np.unique(y[id_l], return_counts=True), len(y[id_l]))
    Datasets.save_obj(id_test, 'qsar_test_'+str(i))
    Datasets.save_obj(id_l, 'qsar_l_'+str(i))
    Datasets.save_obj(id_u, 'qsar_u_'+str(i))

save split :  0
(array([0, 1]), array([34, 16])) 50
save split :  1
(array([0, 1]), array([34, 16])) 50
save split :  2
(array([0, 1]), array([33, 17])) 50
save split :  3
(array([0, 1]), array([31, 19])) 50
save split :  4
(array([0, 1]), array([37, 13])) 50
save split :  5
(array([0, 1]), array([36, 14])) 50
save split :  6
(array([0, 1]), array([35, 15])) 50
save split :  7
(array([0, 1]), array([34, 16])) 50
save split :  8
(array([0, 1]), array([37, 13])) 50
save split :  9
(array([0, 1]), array([36, 14])) 50
save split :  10
(array([0, 1]), array([33, 17])) 50
save split :  11
(array([0, 1]), array([35, 15])) 50
save split :  12
(array([0, 1]), array([35, 15])) 50
save split :  13
(array([0, 1]), array([33, 17])) 50
save split :  14
(array([0, 1]), array([34, 16])) 50
save split :  15
(array([0, 1]), array([36, 14])) 50
save split :  16
(array([0, 1]), array([30, 20])) 50
save split :  17
(array([0, 1]), array([35, 15])) 50
save split :  18
(array([0, 1]), array([30, 20])) 50
sav

In [27]:
# Sizes of the most recent split: test set vs. labeled + unlabeled training part.
n_test = len(id_test)
n_train = len(id_l) + len(id_u)
n_test, n_train

(349, 706)

# Evaluate baseline models

In [25]:
from data_gen import *
from self_learning import msla
from sklearn.metrics import accuracy_score

# Evaluate the self-learning baseline (msla) on the 20 stored QSAR splits.
# NOTE: despite its name, err_SLA collects accuracy_score values, not error rates.
err_SLA = []

for i in range(20):
    tag = str(i)
    id_l = Datasets.load_obj('qsar_l_' + tag)
    id_u = Datasets.load_obj('qsar_u_' + tag)
    id_test = Datasets.load_obj('qsar_test_' + tag)
    # Only the labeled part contributes labels; the unlabeled pool is passed
    # as features alone.
    H0, Xl, yl, Xu, yu, thetas, _ = msla(X[id_l,], y[id_l], X[id_u,], random_state=2020)
    test_pred = H0.predict(X[id_test,])
    err_SLA.append(accuracy_score(y[id_test], test_pred))

# Mean and standard deviation over the 20 splits.
print(np.mean(err_SLA), np.std(err_SLA))

0.65243553008596 0.01885682909314944


In [26]:
from data_gen import *
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

# Supervised LinearSVC baselines on the 20 stored QSAR splits:
#   scr_all - trained on the labeled AND unlabeled indices, all with true labels
#   scr_50  - trained on the labeled indices only
scr_all = []
scr_50 = []

for i in range(20):
    tag = str(i)
    id_l = Datasets.load_obj('qsar_l_' + tag)
    id_u = Datasets.load_obj('qsar_u_' + tag)
    id_all = np.concatenate((id_l, id_u))
    id_test = Datasets.load_obj('qsar_test_' + tag)
    X_test, y_test = X[id_test,], y[id_test]

    # Fully supervised reference model.
    svm_all = LinearSVC(random_state=2020)
    svm_all.fit(X[id_all,], y[id_all])
    scr_all.append(svm_all.score(X_test, y_test))

    # Model restricted to the labeled subset.
    svm_50 = LinearSVC(random_state=2020)
    svm_50.fit(X[id_l,], y[id_l])
    scr_50.append(svm_50.score(X_test, y_test))

print('50 labeled data : ', np.mean(scr_50), np.std(scr_50))
print('all labeled data : ', np.mean(scr_all), np.std(scr_all))

50 labeled data :  0.6525787965616047 0.018764627110973188
all labeled data :  0.8001432664756447 0.028218404555866106


In [67]:
from data_gen import *
from linear_models import *
from sklearn.metrics import accuracy_score

# Halfspace baselines on the first stored heart split:
#   scr_all - trained on the labeled AND unlabeled indices, all with true labels
#   scr_50  - trained on the labeled indices only
# NOTE(review): X and y are not loaded in this cell; they must already hold
# the heart-disease data when it runs - confirm before re-executing.
scr_all = []
scr_50 = []

for i in range(1):
    tag = str(i)
    id_l = Datasets.load_obj('heart_l_' + tag)
    id_u = Datasets.load_obj('heart_u_' + tag)
    id_all = np.concatenate((id_l, id_u))
    id_test = Datasets.load_obj('heart_test_' + tag)
    X_test, y_test = X[id_test,], y[id_test]

    # Fully supervised reference model.
    LTF_all = Halfspace()
    LTF_all.fit(X[id_all,], y[id_all])
    scr_all.append(LTF_all.score(X_test, y_test))

    # Model restricted to the labeled subset.
    LTF_50 = Halfspace()
    LTF_50.fit(X[id_l,], y[id_l])
    scr_50.append(LTF_50.score(X_test, y_test))

print('50 labeled data : ', np.mean(scr_50), np.std(scr_50))
print('all labeled data : ', np.mean(scr_all), np.std(scr_all))

50 labeled data :  0.7373737373737373 0.0
all labeled data :  0.8181818181818182 0.0


In [66]:
from data_gen import *
from linear_models import *
from sklearn.metrics import accuracy_score

# Second run of the Halfspace baselines on the first stored heart split
# (same code as the preceding cell, kept as a separate execution).
# NOTE(review): X and y are not loaded in this cell; they must already hold
# the heart-disease data when it runs - confirm before re-executing.
scr_all = []
scr_50 = []

for i in range(1):
    split_no = str(i)
    id_l = Datasets.load_obj('heart_l_' + split_no)
    id_u = Datasets.load_obj('heart_u_' + split_no)
    id_all = np.concatenate((id_l, id_u))
    id_test = Datasets.load_obj('heart_test_' + split_no)

    # Train on every training index (labeled + unlabeled pool).
    LTF_all = Halfspace()
    LTF_all.fit(X[id_all,], y[id_all])
    acc_all = LTF_all.score(X[id_test,], y[id_test])
    scr_all.append(acc_all)

    # Train on the labeled indices only.
    LTF_50 = Halfspace()
    LTF_50.fit(X[id_l,], y[id_l])
    acc_50 = LTF_50.score(X[id_test,], y[id_test])
    scr_50.append(acc_50)

print('50 labeled data : ', np.mean(scr_50), np.std(scr_50))
print('all labeled data : ', np.mean(scr_all), np.std(scr_all))

50 labeled data :  0.7575757575757576 0.0
all labeled data :  0.8383838383838383 0.0
