## Mediamill dataset

In [1]:
from sklearn.preprocessing import normalize
from xclib.data import data_utils
import pandas as pd
import numpy as np

features, tabels, num_samples, num_features, num_labels = data_utils.read_data('../../Datasets/Mediamill/Mediamill_data.txt')
trSplit = pd.read_csv("../../Datasets/Mediamill/mediamill_trSplit.txt", header=None, sep=' ').dropna()
tstSplit = pd.read_csv("../../Datasets/Mediamill/mediamill_tstSplit.txt", header=None, sep=' ').dropna()
X = features.todense()
X = X.astype('float32')
X = normalize(X, norm='l2', axis=0)
labels = tabels.todense()
np.savez_compressed('../mediamill', x=X, lab=labels)
num_samples, num_features, num_labels

(43907, 120, 101)

In [5]:
msq = np.argsort(np.ravel(np.sum(labels, axis=0, dtype=int)))
np.unique(np.ravel(labels[:,msq[-2]]), return_counts=True)

(array([0., 1.], dtype=float32), array([15969, 27938]))

In [7]:
y = np.ravel(labels[:,msq[-2]]).astype(int)

In [10]:
len(tstSplit[0]), len(tSplit[0])

12914

In [8]:
from sklearn.svm import LinearSVC

err = []
for i in range(10):
    clf = LinearSVC(random_state=2020)
    clf.fit(X[trSplit[i].values[:50]-1,], y[trSplit[i].values[:50]-1])
    err.append(clf.score(X[tstSplit[i].values-1,], y[tstSplit[i].values-1]))
print(np.mean(err), np.std(err))

0.6364797893758711 0.0015737673505691015


In [11]:
from sklearn.svm import LinearSVC

err = []
for i in range(10):
    clf = LinearSVC(random_state=2020)
    clf.fit(X[trSplit[i].values-1,], y[trSplit[i].values-1])
    err.append(clf.score(X[tstSplit[i].values-1,], y[tstSplit[i].values-1]))
print(np.mean(err), np.std(err))

0.6934412265758092 0.0016118645760910245


In [10]:
from linear_models import *

err = []
for i in range(10):
    ltf = Halfspace()
    ltf.fit(X[trSplit[i].values[:50]-1,], y[trSplit[i].values[:50]-1])
    err.append(ltf.score(X[tstSplit[i].values-1,], y[tstSplit[i].values-1]))
print(np.mean(err), np.std(err))

0.6646662536781787 0.031555376716985


In [12]:
from data_gen import *
from self_learning import msla
from sklearn.metrics import accuracy_score

err_SLA = []

for i in range(10):
    id_l = trSplit[i].values[:50]-1
    id_u = trSplit[i].values[:50]-1
    id_test = tstSplit[i].values-1
    H0, Xl, yl, Xu, yu, thetas, _ = msla(X[id_l,], y[id_l], X[id_u,], random_state=2020)
    err_SLA.append(accuracy_score(y[id_test], H0.predict(X[id_test,])))
print(np.mean(err_SLA), np.std(err_SLA))

0.6364797893758711 0.0015737673505691015


## Bibtex dataset

In [2]:
from sklearn.preprocessing import normalize
from xclib.data import data_utils
import pandas as pd
import numpy as np

features, tabels, num_samples, num_features, num_labels = data_utils.read_data('../../Datasets/Bibtex/Bibtex_data.txt')
trSplit = pd.read_csv("../../Datasets/Bibtex/bibtex_trSplit.txt", header=None, sep=' ').dropna()
tstSplit = pd.read_csv("../../Datasets/Bibtex/bibtex_tstSplit.txt", header=None, sep=' ').dropna()
X = features.todense()
X = X.astype('float32')
X = normalize(X, norm='l2', axis=0)
labels = tabels.todense()
np.savez_compressed('../bibtex', x=X, lab=labels)
num_samples, num_features, num_labels

(7395, 1836, 159)

In [9]:
msq = np.argsort(np.ravel(np.sum(labels, axis=0, dtype=int)))
np.unique(np.ravel(labels[:,msq[-1]]), return_counts=True)

(array([0., 1.], dtype=float32), array([6353, 1042]))

In [8]:
y = np.ravel(labels[:,msq[-1]]).astype(int)
len(tstSplit[0]), len(trSplit[0])

(2515, 4880)

In [9]:
from sklearn.svm import LinearSVC

err = []
for i in range(10):
    clf = LinearSVC(random_state=2020)
    clf.fit(X[trSplit[i].values[:50]-1,], y[trSplit[i].values[:50]-1])
    err.append(clf.score(X[tstSplit[i].values-1,], y[tstSplit[i].values-1]))
print(np.mean(err), np.std(err))

0.8589264413518887 0.004564085837703927


In [10]:
from sklearn.svm import LinearSVC

err = []
for i in range(10):
    clf = LinearSVC(random_state=2020)
    clf.fit(X[trSplit[i].values-1,], y[trSplit[i].values-1])
    err.append(clf.score(X[tstSplit[i].values-1,], y[tstSplit[i].values-1]))
print(np.mean(err), np.std(err))

0.9999204771371769 0.00015904572564613416


In [11]:
from linear_models import *

err = []
for i in range(10):
    ltf = Halfspace()
    ltf.fit(X[trSplit[i].values[:50]-1,], y[trSplit[i].values[:50]-1])
    err.append(ltf.score(X[tstSplit[i].values-1,], y[tstSplit[i].values-1]))
print(np.mean(err), np.std(err))

0.8610735586481113 0.007838550476565639


In [12]:
from linear_models import *

err = []
for i in range(10):
    ltf = Halfspace()
    ltf.fit(X[trSplit[i].values-1,], y[trSplit[i].values-1])
    err.append(ltf.score(X[tstSplit[i].values-1,], y[tstSplit[i].values-1]))
print(np.mean(err), np.std(err))

0.9650099403578528 0.002943420361562099


In [13]:
from data_gen import *
from self_learning import msla
from sklearn.metrics import accuracy_score

err_SLA = []

for i in range(10):
    id_l = trSplit[i].values[:50]-1
    id_u = trSplit[i].values[:50]-1
    id_test = tstSplit[i].values-1
    H0, Xl, yl, Xu, yu, thetas, _ = msla(X[id_l,], y[id_l], X[id_u,], random_state=2020)
    err_SLA.append(accuracy_score(y[id_test], H0.predict(X[id_test,])))
print(np.mean(err_SLA), np.std(err_SLA))

0.8588866799204771 0.004559407122135116


## Delicious dataset

In [3]:
from sklearn.preprocessing import normalize
from xclib.data import data_utils
import pandas as pd
import numpy as np

features, tabels, num_samples, num_features, num_labels = data_utils.read_data('../../Datasets/Delicious/Delicious_data.txt')
trSplit = pd.read_csv("../../Datasets/Delicious/delicious_trSplit.txt", header=None, sep=' ').dropna()
tstSplit = pd.read_csv("../../Datasets/Delicious/delicious_tstSplit.txt", header=None, sep=' ').dropna()
X = features.todense()
X = X.astype('float32')
X = normalize(X, norm='l2', axis=0)
labels = tabels.todense()
np.savez_compressed('../delicious', x=X, lab=labels)
num_samples, num_features, num_labels

(16105, 500, 983)

In [13]:
msq = np.argsort(np.ravel(np.sum(labels, axis=0, dtype=int)))
np.unique(np.ravel(labels[:,msq[-1]]), return_counts=True)

(array([0., 1.], dtype=float32), array([9610, 6495]))

In [3]:
y = np.ravel(labels[:,msq[-1]]).astype(int)
len(tstSplit[0]), len(trSplit[0])

(3185, 12920)

In [6]:
from sklearn.svm import LinearSVC

err = []
for i in range(10):
    clf = LinearSVC(random_state=2020)
    clf.fit(X[trSplit[i].values[:50]-1,], y[trSplit[i].values[:50]-1])
    err.append(clf.score(X[tstSplit[i].values-1,], y[tstSplit[i].values-1]))
print(np.mean(err), np.std(err))

0.6004395604395604 0.00620394180560496


In [7]:
from sklearn.svm import LinearSVC

err = []
for i in range(10):
    clf = LinearSVC(random_state=2020)
    clf.fit(X[trSplit[i].values-1,], y[trSplit[i].values-1])
    err.append(clf.score(X[tstSplit[i].values-1,], y[tstSplit[i].values-1]))
print(np.mean(err), np.std(err))

0.6505494505494507 0.005035308055693424
