In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import scipy as sc
from scipy.sparse.linalg import gmres 
import timeit
from sklearn import metrics  as m
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
import plotly.graph_objects as go
import plotly 
from plotly.subplots import make_subplots 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression 
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
from sklearn.neighbors import KNeighborsClassifier 

# 0. Processing WII/UWave datasets

In [2]:
Z_MII = pd.read_csv('data/UWaveGestureLibraryX_TRAIN.txt',header = None)#  AllGestureWiimoteZ_TRAIN  
X_MII = pd.read_csv('data/UWaveGestureLibraryX_TRAIN.txt',header = None)
Y_MII = pd.read_csv('data/UWaveGestureLibraryX_TRAIN.txt',header = None)

In [3]:
Z_MIIT = pd.read_csv('data/UWaveGestureLibraryX_TEST.txt',header = None)#  AllGestureWiimoteZ_TEST 
X_MIIT = pd.read_csv('data/UWaveGestureLibraryX_TEST.txt',header = None)
Y_MIIT = pd.read_csv('data/UWaveGestureLibraryX_TEST.txt',header = None)

In [4]:
Z_ALL = pd.concat([Z_MII,Z_MIIT])
X_ALL = pd.concat([X_MII,X_MIIT])
Y_ALL = pd.concat([Y_MII,Y_MIIT])

In [5]:
Z_ALL = Z_ALL.reset_index().drop(columns=['index'])
X_ALL = X_ALL.reset_index().drop(columns=['index'])
Y_ALL = Y_ALL.reset_index().drop(columns=['index'])

In [6]:
Z_ALL = Z_ALL[0].values
X_ALL = X_ALL[0].values
Y_ALL = Y_ALL[0].values

In [7]:
list_all = [Z_ALL, X_ALL, Y_ALL]
list_ready = []

In [8]:
for ALL_ in list_all:
    Y_general = []
    for i in ALL_:
        Y_row = []
        for i in i.strip().split(" "):
            if i != '': 
                Y_row.append(float(i)) 
        Y_general.append(Y_row)
    list_ready.append(np.array(Y_general))

In [9]:
df_z = pd.DataFrame(list_ready[0] )
df_x = pd.DataFrame(list_ready[1] )
df_y = pd.DataFrame(list_ready[2] )

In [10]:
y_train_all = df_z[0].values

In [11]:
df_z = df_z.drop(columns=[0])
df_x = df_x.drop(columns=[0])
df_y = df_y.drop(columns=[0])

In [12]:
df_z = df_z[np.arange(1, df_z.count(axis=1).max()+1)]
df_x = df_x[np.arange(1, df_z.count(axis=1).max()+1)]
df_y = df_y[np.arange(1, df_z.count(axis=1).max()+1)]

In [13]:
df_z = df_z.fillna(value=0)
df_x = df_x.fillna(value=0)
df_y = df_y.fillna(value=0)

In [26]:
X_all = np.concatenate([df_z.values, df_x.values, df_y.values], axis=1)

In [15]:
y_train_all = y_train_all - 1

In [18]:
Y_ = np.copy(y_train_all)

# 1. Transforming data into graph structure

In [20]:
def generate_adj(X_array, nnodes, metric='minkowski', n_neighbours=25):
    Af = np.zeros((nnodes, nnodes))
    for feature in X_array:
        nbrs = NearestNeighbors(n_neighbors=n_neighbours, metric=metric ).fit(feature)
        distances, indices = nbrs.kneighbors(feature)
        for i in range(nnodes):
            Af[i, indices[i]] = 1
            Af[indices[i], i ] = 1
    return Af 

In [21]:
def calc_A_hat(adj_matrix: sp.spmatrix, delta, sigma, MMx) -> sp.spmatrix:
    nnodes = adj_matrix.shape[0]
    A = adj_matrix + sp.eye(nnodes)#Ω#@ D_invsqrt_corr
    D_vec = np.sum(A, axis=1).A1 
    lsigma = sigma - 1
    rsigma = - sigma
    wsigma = -2*sigma + 1
    
    D_l = sp.diags(np.power(D_vec, lsigma)) 
    D_r= sp.diags(np.power(D_vec, rsigma ) )
    Dw = sp.diags(np.power(D_vec, wsigma ) )
    S_ = MMx@ Dw 
    
    return S_ , D_l@A@D_r  - delta* S_

# 2. optimization parts for PRPCA algorithm

In [22]:
def IP(A, Z, Y, iter_,  alpha):
    A = np.copy(A)
    Z = np.copy(Z)
    Y = np.copy(Y) 
    start = timeit.default_timer()
    for _ in range(iter_):
        Z =  alpha * AHAT@Z   + (1-alpha) * Y
        Z = normalize(Z,'l1')
    print('time(s):', timeit.default_timer() - start)
    return Z

In [23]:
def GMRES(A, Y, alpha, k, tol):
    A = np.copy(A)
    Y = np.copy(Y) 
    predicts = []
    for j in range(k): 
        temp_ = gmres(A, (1-alpha)*Y[:,j], tol=tol)[0] 
        predicts.append([temp_])
    return np.concatenate(predicts).T

# 3. Computation of PRPCA, PaSVM and PaLR

In [179]:
from sklearn.model_selection import train_test_split
X_list = [[df_x.values, df_y.values, df_z.values] ]
names = ['all_values'] 
         
seed = 0
sigma = 1
nl= 20
alpha= 0.9  
iter_ = 10
tol = 1e-03 
delta = 1e-03
beta = 0.9 
Z_collect = []
nnodes = X_all.shape[0]
for ind_, X in enumerate(X_list):
    mean_acc_prpca = []
    mean_acc_svm = []
    mean_acc_lr = []
    for seed in np.arange(1): 
        Xall = np.concatenate(X, axis=1) 
        MMx = np.zeros((nnodes, nnodes))
        nnodes = Xall.shape[0] 
        w1 = 0.001/len(X) 
        # covariance for each sensor space
        for x in X:
            Xn = np.copy(x) 
            Xn = Xn - np.median(Xn, axis=0)
            S =  np.dot(Xn, Xn.T) / (Xn.shape[0] - 1 )
            MMx +=  w1*S 
        # adjacency matrix for each sensor space
        Af = generate_adj(X, nnodes, n_neighbours=25, metric='euclidean' )
        mmc, AHAT = calc_A_hat(Af,  delta, sigma, MMx ) 
        rex = (np.identity(nnodes)  - alpha * AHAT )
        # random split dataset for training
        rs = np.random.RandomState(seed=seed) 
        ind0lab = rs.choice(np.where(Y_ == 0 )[0], nl, replace=False)
        ind1lab = rs.choice(np.where(Y_ == 1 )[0], nl, replace=False)
        ind2lab = rs.choice(np.where(Y_ == 2 )[0], nl, replace=False)
        ind3lab = rs.choice(np.where(Y_ == 3 )[0], nl, replace=False)
        ind4lab = rs.choice(np.where(Y_ == 4 )[0], nl, replace=False) 
        ind5lab = rs.choice(np.where(Y_ == 5 )[0], nl, replace=False) 
        ind6lab = rs.choice(np.where(Y_ == 6 )[0], nl, replace=False)
        ind7lab = rs.choice(np.where(Y_ == 7 )[0], nl, replace=False) 
        all_lab = np.concatenate([ ind0lab, ind1lab, ind2lab, ind3lab,  ind4lab, ind5lab,
                         ind6lab, ind7lab ]) 
        y_train = np.zeros((X_all.shape[0], 8)) 
        for i in all_lab:
            y_train[i, int(Y_[i])] =  1  
        # PPRPCA training
        Z = GMRES(A=rex, Y=y_train, alpha=alpha, k=y_train.shape[1], tol=tol) 
        # training of PaSVM and PaLR
        X_glob = np.concatenate([Xall, Z],axis=-1 )
        Z_glob = np.argmax(Z , axis=-1)
        mean_acc_prpca.append(m.accuracy_score(Y_  , np.argmax(np.array(Z), axis=1)   ))  
        # PRPCA self-labelling
        X_train, X_test, z_train, z_test = train_test_split(X_glob, Z_glob, test_size=0.3, random_state=1) 
        X_train, X_test, y_train, y_test = train_test_split(X_glob, Y_, test_size=0.3, random_state=1) 
        # PaSVM
        svc = SVC(gamma='auto')
        svc.fit( X_train, z_train)
        scv_predict = svc.predict( X_test)
        mean_acc_svm.append(m.accuracy_score(y_test, scv_predict)) 
        # PaLR
        clf = LogisticRegression(random_state=0) 
        clf.fit(X_train, z_train)
        scv_predict = clf.predict(X_test)
        mean_acc_lr.append(m.accuracy_score(y_test, scv_predict))
    print(names[ind_], 'prlg', np.mean(mean_acc_lr)) 
    print(names[ind_], 'prsvc', np.mean(mean_acc_svm)) 
    print(names[ind_], 'prpca', np.mean(mean_acc_prpca)) 

all_values prpca 0.6967396158999554
all_values prlg 0.6287202380952381
all_values prsvc 0.6949404761904762
all_values prpca 0.6967396158999554



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



# 4. Computation of LP, KNN, SVM and LR 

In [178]:
nl = 20
nnodes = X_all.shape[0]
rs = np.random.RandomState(seed=0) 
ind0lab = rs.choice(np.where(Y_ == 0 )[0], nl, replace=False)
ind1lab = rs.choice(np.where(Y_ == 1 )[0], nl, replace=False)
ind2lab = rs.choice(np.where(Y_ == 2 )[0], nl, replace=False)
ind3lab = rs.choice(np.where(Y_ == 3 )[0], nl, replace=False)
ind4lab = rs.choice(np.where(Y_ == 4 )[0], nl, replace=False) 
ind5lab = rs.choice(np.where(Y_ == 5 )[0], nl, replace=False) 
ind6lab = rs.choice(np.where(Y_ == 6 )[0], nl, replace=False)
ind7lab = rs.choice(np.where(Y_ == 7 )[0], nl, replace=False) 
all_lab = np.concatenate([ ind0lab, ind1lab, ind2lab, ind3lab,  ind4lab, ind5lab, ind6lab, ind7lab, ]) 
y_train = np.zeros((X_all.shape[0], 8)) 
for i in all_lab:
    y_train[i, int(Y_[i])] =  1 
    
neigh = KNeighborsClassifier(n_neighbors=25, metric='euclidean')
neigh.fit(X_all[all_lab], Y_[all_lab])
yknn_predict = neigh.predict(X_all)
print('knn', m.accuracy_score(Y_, yknn_predict))
y_lp = np.ones(nnodes)*-1
y_lp[all_lab] = Y_[all_lab]
 
lp = LabelPropagation(kernel='rbf', gamma=2, n_neighbors=25)
lp.fit(X_all, y_lp)
yLp_predict = lp.predict(X_all)
print('lp',m.accuracy_score(Y_, yLp_predict) )
 
clf = LogisticRegression(random_state=0) 
clf.fit(X_all[all_lab], Y_[all_lab])
scv_predict = clf.predict(X_all)
print('lg',m.accuracy_score(Y_, scv_predict) )
 
svc = SVC(gamma='auto')
svc.fit(X_all[all_lab], Y_[all_lab])
scv_predict = svc.predict(X_all)
print('svm',m.accuracy_score(Y_, scv_predict) )

knn 0.5895489057615007



invalid value encountered in true_divide


max_iter=1000 was reached without convergence.



lp 0.12483251451540867



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



lg 0.5587315765966949
svm 0.6873604287628405
