In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt

from keras.models import load_model


import tensorflow as tf
from collections import defaultdict


import sys, os
# add directories in src/ to path
sys.path.insert(0, 'SpectralNet-master/src/applications/')
sys.path.insert(0, 'SpectralNet-master/src/')
from spectralnet import run_net
from core.data import get_data

from sklearn.neighbors import LSHForest
import joblib



# '''
# spectralnet.py: contains run function for spectralnet
# '''
import sys, os, pickle
import traceback
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'

from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import normalized_mutual_info_score as nmi

import keras.backend as K
from keras.models import Model, load_model
from keras.layers import Input, Lambda
from keras.optimizers import RMSprop

from core import train
from core import costs
from core import networks
from core.layer import stack_layers
from core.util import get_scale, print_accuracy, get_cluster_sols, LearningHandler, make_layer_list, train_gen, get_y_preds


Using TensorFlow backend.


In [2]:
def spectralNet_FromWeights(data,params,siamWeightsPath,specWeightsPath):
    """Rebuild the siamese / spectral networks and restore saved weights into them.

    The networks are constructed the same way the regular training code does,
    each one is trained for a single epoch, and only then are the stored
    weights loaded on top (``load_weights(..., by_name=True)``). Afterwards the
    full data set (train + val + test) is embedded, k-means is fitted on that
    embedding and the clustering accuracy is printed.

    Parameters
    ----------
    data : dict
        Must provide ``data['spectral']`` with the keys ``'train_and_test'``,
        ``'train_unlabeled_and_labeled'`` and ``'val_unlabeled_and_labeled'``;
        ``data['siamese']['train_and_test']`` is read when the affinity
        setting contains ``'siamese'``.
    params : mapping
        Hyperparameters (``n_clusters``, ``batch_size``, ``arch``,
        ``affinity``, ``scale_nbr``, ``n_nbrs``, ``spec_*`` and, for the
        siamese case, ``siam_*``).
    siamWeightsPath : str
        Weight file loaded into the siamese net (only when
        ``params['affinity'] == 'siamese'``).
    specWeightsPath : str
        Weight file loaded into the spectral net.

    Returns
    -------
    tuple
        ``(km, siamese_net, spectral_net)``: the clustering object returned by
        ``get_cluster_sols``, the siamese net (``None`` when the affinity is
        not ``'siamese'``) and the spectral net.
    """
    # ---- unpack data -----------------------------------------------------
    spectral_data = data['spectral']
    x_train, y_train, x_val, y_val, x_test, y_test = spectral_data['train_and_test']
    (x_train_unlabeled, y_train_unlabeled,
     x_train_labeled, y_train_labeled) = spectral_data['train_unlabeled_and_labeled']
    (x_val_unlabeled, y_val_unlabeled,
     x_val_labeled, y_val_labeled) = spectral_data['val_unlabeled_and_labeled']

    affinity = params['affinity']
    if 'siamese' in affinity:
        pairs_train, dist_train, pairs_val, dist_val = data['siamese']['train_and_test']

    # everything stacked together: used for the input shape and the final evaluation
    x = np.concatenate((x_train, x_val, x_test), axis=0)
    y = np.concatenate((y_train, y_val, y_test), axis=0)

    if len(x_train_labeled):
        labels_column = y_train_labeled.reshape(-1, 1)
        y_train_labeled_onehot = OneHotEncoder().fit_transform(labels_column).toarray()
    else:
        # no labeled samples: empty one-hot block with one column per distinct label
        y_train_labeled_onehot = np.empty((0, len(np.unique(y))))

    # ---- set up inputs ---------------------------------------------------
    n_clusters = params['n_clusters']

    # true-label placeholder (not used in unsupervised training)
    y_true = tf.placeholder(tf.float32, shape=(None, n_clusters), name='y_true')

    default_batch = params['batch_size']
    batch_sizes = {
        'Unlabeled': default_batch,
        'Labeled': default_batch,
        'Orthonorm': params.get('batch_size_orthonorm', default_batch),
    }

    input_shape = x.shape[1:]

    # the three inputs SpectralNet expects
    inputs = {
        'Unlabeled': Input(shape=input_shape, name='UnlabeledInput'),
        'Labeled': Input(shape=input_shape, name='LabeledInput'),
        'Orthonorm': Input(shape=input_shape, name='OrthonormInput'),
    }

    # ---- siamese net: build, train one epoch, then restore weights --------
    siamese_net = None
    if affinity == 'siamese':
        siamese_net = networks.SiameseNet(inputs, params['arch'], params.get('siam_reg'), y_true)

        # the literal 1 is the epoch count: train just enough to build the
        # model before the saved weights are loaded over it
        siamese_net.train(
            pairs_train, dist_train, pairs_val, dist_val,
            params['siam_lr'], params['siam_drop'], params['siam_patience'],
            1, params['siam_batch_size'])
        siamese_net.net.load_weights(siamWeightsPath, by_name=True)

    # ---- spectral net: build, train one epoch, then restore weights -------
    spectral_net = networks.SpectralNet(
        inputs, params['arch'],
        params.get('spec_reg'), y_true, y_train_labeled_onehot,
        n_clusters, affinity, params['scale_nbr'],
        params['n_nbrs'], batch_sizes, siamese_net, x_train, len(x_train_labeled))

    spectral_net.train(
        x_train_unlabeled, x_train_labeled, x_val_unlabeled,
        params['spec_lr'], params['spec_drop'], params['spec_patience'],
        1)

    spectral_net.net.load_weights(specWeightsPath, by_name=True)

    print("finished training")

    # ---- evaluate ----------------------------------------------------------
    # final embedding of the complete data set
    x_spectralnet = spectral_net.predict(x)

    # fit k-means on the embedding, then re-predict with the fitted object
    _, km = get_cluster_sols(
        x_spectralnet, ClusterClass=KMeans, n_clusters=n_clusters,
        init_args={'n_init':10})
    kmeans_assignments = km.predict(x_spectralnet)

    # result is not returned; the call is kept so behaviour matches the original
    _aligned_labels, _ = get_y_preds(kmeans_assignments, y, n_clusters)
    print_accuracy(kmeans_assignments, y, n_clusters)

    return km, siamese_net, spectral_net #,x_spectralnet, y_spectralnet

In [3]:

# Notebook-wide hyperparameter store: any key that was never set reads as None
# instead of raising KeyError.
params = defaultdict(lambda: None)

# --- dataset-level settings ------------------------------------------------
# (original note: change to dset = mnist and codespace = True)
general_params = dict(
    dset='new',                 # dataset key (original options: reuters / mnist)
    val_set_fraction=0.1,       # fraction of training set to use as validation
    precomputedKNNPath='',      # path for precomputed nearest neighbors
                                # (with indices and saved as a pickle or h5py file)
    siam_batch_size=128,        # minibatch size for siamese net
)
params.update(general_params)

# Currently disabled options:
#         'train_labeled_fraction':True,
#         'val_labeled_fraction':True,

# --- network / training settings -------------------------------------------
my_params = dict(
    n_clusters=26,              # number of clusters in data
    use_code_space=False,       # enable / disable code space embedding
    affinity='siamese',         # affinity type: siamese / knn
    n_nbrs=10,                  # number of nonzero entries (neighbors) to use for
                                # the graph Laplacian affinity matrix
    scale_nbr=2,                # neighbor used to determine the scale of the gaussian
                                # graph Laplacian; calculated by taking the median
                                # distance of the (scale_nbr)th neighbor, over a set
                                # of size batch_size sampled from the dataset
    siam_k=2,                   # threshold where, for all k <= siam_k closest
                                # neighbors to x_i, (x_i, k) is considered a
                                # 'positive' pair by the siamese net
    siam_ne=50,                 # number of training epochs for siamese net
    spec_ne=150,                # number of training epochs for spectral net
    siam_lr=1e-3,               # initial learning rate for siamese net
    spec_lr=1e-3,               # initial learning rate for spectral net
                                # (original note: hardcoded in network.py?)
    siam_patience=10,           # early stopping patience for siamese net
    spec_patience=20,           # early stopping patience for spectral net
    siam_drop=0.1,              # learning rate scheduler decay for siamese net
    spec_drop=0.1,              # learning rate scheduler decay for spectral net
    batch_size=1024,            # batch size for spectral net
    siam_reg=None,              # regularization parameter for siamese net
    spec_reg=None,              # regularization parameter for spectral net
    siam_n=None,                # subset of the dataset used to construct training
                                # pairs for siamese net
    siamese_tot_pairs=600000,   # total number of pairs for siamese net
    # network architecture. if different architectures are desired for siamese
    # net and spectral net, 'siam_arch' and 'spec_arch' keys can be used
    arch=[
        dict(type='relu', size=1024),
        dict(type='relu', size=1024),
        dict(type='relu', size=512),
        dict(type='relu', size=10),
    ],
    use_approx=False,           # enable / disable approximate nearest neighbors
    use_all_data=False,         # enable to use all data for training (no test set)
)
params.update(my_params)

In [4]:
def train_and_save(outName= 'data/testModel', dataIn='data/refData.npz', n_train=22000):
    """Train SpectralNet on a reference data set and save the network weights.

    The saved weight files are the ones ``spectralNet_FromWeights`` loads.
    Network hyperparameters are taken from the module-level ``params``.

    Parameters
    ----------
    outName : str
        Path prefix for the weight files; ``{outName}_spectral_net.tf`` and
        ``{outName}_siamese_net.tf`` are written. The parent directory is
        created if it does not exist.
    dataIn : str
        ``.npz`` archive whose arrays are, in stored order: cluster labels,
        protein names, feature matrix, feature names.
    n_train : int, optional
        Rows ``[0, n_train)`` are used for training and the rest for testing.
        The default of 22000 is the split used for the original data.
    """
    # Make sure the weights can be written *before* spending time on training.
    out_dir = os.path.dirname(outName)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # NOTE: allow_pickle=True can execute arbitrary code while unpickling;
    # only load archives from a trusted source.
    # The context manager closes the archive once the arrays are in memory.
    with np.load(dataIn, allow_pickle=True) as archive:
        # relies on the order in which the arrays were saved
        y_start, _y_name_start, X_train_start, _featNames = [archive[f] for f in archive.files]

    # drop the last 8 feature columns (original note: remove phi values and length)
    X_train_start = X_train_start[:, :-8]

    # train / test split by row position
    X_train, X_test = X_train_start[:n_train, :], X_train_start[n_train:, :]
    y_train, y_test = y_start[:n_train], y_start[n_train:]

    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

    # organize the data into the dictionary format expected by run_net
    new_dataset_data = (X_train, X_test, y_train, y_test)
    ata = get_data(params, new_dataset_data)

    # train the networks; only the two models are needed here
    siamese_net_model, spectral_net_model, _x_spectralnet, _y_spectralnet, _km = run_net(ata, params)

    # save the weights for loading by spectralNet_FromWeights
    spectral_net_model.net.save_weights(f'{outName}_spectral_net.tf')
    # presumably run_net returns no siamese model for a non-siamese affinity
    # (spectralNet_FromWeights uses None in that case); guard so the spectral
    # weights saved above are not followed by an AttributeError
    if siamese_net_model is not None:
        siamese_net_model.net.save_weights(f'{outName}_siamese_net.tf')

In [6]:
# Train on the bundled test data set and write the weight files under testData/.
train_and_save(
    outName='testData/testNet',
    dataIn='testData/test_clusterBcov.npz',
)

(22000, 12) (22000,) (5877, 12) (5877,)
computing k=2 nearest neighbors...
creating pairs...
ks 19800 2 2 2
Iter: 0/19800
Iter: 10000/19800
computing k=2 nearest neighbors...
creating pairs...
ks 2200 2 2 2
Iter: 0/2200
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch: 0, loss=387.535047, val_loss=1.869559
Epoch: 1, loss=35.080703, val_loss=0.385769
Epoch: 2, loss=6.610832, val_loss=0.074796
Epoch: 3, loss=2.332759, val_loss=0.005220
Epoch: 4, loss

Epoch: 65, loss=0.027122, val_loss=0.113087
Epoch: 66, loss=0.015474, val_loss=0.097391
Epoch: 67, loss=0.025710, val_loss=0.106015
Epoch: 68, loss=0.033795, val_loss=0.150148
Epoch: 69, loss=0.017013, val_loss=0.145170
Epoch: 70, loss=0.034197, val_loss=0.122890
Epoch: 71, loss=0.017697, val_loss=0.140515
Epoch: 72, loss=0.013563, val_loss=0.141752
Epoch: 73, loss=0.018711, val_loss=0.173549
Epoch: 74, loss=0.023249, val_loss=0.110035
Epoch: 75, loss=0.028166, val_loss=0.098514
Epoch: 76, loss=0.026381, val_loss=0.127882
Epoch: 77, loss=0.028938, val_loss=0.161364
Epoch: 78, loss=0.024517, val_loss=0.165915
Epoch: 79, loss=0.036589, val_loss=0.095407
Epoch: 80, loss=0.024128, val_loss=0.123871
Epoch: 81, loss=0.017409, val_loss=0.144919
Epoch: 82, loss=0.030628, val_loss=0.134340
Epoch: 83, loss=0.025710, val_loss=0.132847
Epoch: 84, loss=0.020977, val_loss=0.096508
Epoch: 85, loss=0.028765, val_loss=0.124472
Epoch: 86, loss=0.020990, val_loss=0.096492
Epoch: 87, loss=0.017053, val_lo



In [12]:
def load_and_predict(out_name='specNet_predicted',refData='data/refData.npz', specWeightsPath='data/SpecNet_bCov4H_cluster26_NN10_weights_jul1.tf', 
                siamWeightsPath='data/Siamese_SpecNet_bCov4H_cluster26_NN10_weights_jul1.tf',
                n_train=22000, direc='data/', name='to_predict'):
    """Restore a trained SpectralNet and assign clusters to new and reference data.

    The reference data used for training is reloaded and split exactly as in
    training so the networks can be rebuilt (see ``spectralNet_FromWeights``)
    and so the original samples can be re-predicted with consistent cluster
    numbers. Network hyperparameters come from the module-level ``params``.

    Parameters
    ----------
    out_name : str
        Base name of the output files written into ``direc``:
        ``{out_name}.npz`` (new data) and ``{out_name}_original_clusters.npz``
        (reference data). Each stores the assignments under the key ``data``.
    refData : str
        ``.npz`` archive used for training; arrays in stored order are cluster
        labels, protein names, feature matrix, feature names.
    specWeightsPath, siamWeightsPath : str
        Weight files for the spectral and siamese networks.
    n_train : int, optional
        Rows ``[0, n_train)`` of the reference data form the training split.
        Must match the value used during training (default 22000).
    direc : str, optional
        Directory holding the data to predict and receiving the outputs.
    name : str, optional
        Base name (without ``.npz``) of the archive to predict; its first
        array is used as the feature matrix.
    """
    # NOTE: allow_pickle=True can execute arbitrary code while unpickling;
    # only load archives from a trusted source.
    # The context manager closes the archive once the arrays are in memory.
    with np.load(refData, allow_pickle=True) as ref_archive:
        # relies on the order in which the arrays were saved
        y_start, _y_name_start, X_train_start, _featNames = [ref_archive[f] for f in ref_archive.files]

    # drop the last 8 feature columns (original note: remove phi values and length)
    X_train_start = X_train_start[:, :-8]

    # train / test split by row position
    X_train, X_test = X_train_start[:n_train, :], X_train_start[n_train:, :]
    y_train, y_test = y_start[:n_train], y_start[n_train:]

    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

    # organize the data into the dictionary format expected by the network code
    new_dataset_data = (X_train, X_test, y_train, y_test)
    ata = get_data(params, new_dataset_data)

    # due to the special layers and old format, remake the network per the regular
    # training code and then load the weights after one epoch of training
    km, _siamese_net_model, spectral_net_model = spectralNet_FromWeights(
        ata, params, siamWeightsPath, specWeightsPath)

    # load the data to predict: only the first stored array is needed
    with np.load(os.path.join(direc, f'{name}.npz'), allow_pickle=True) as new_archive:
        new_features = new_archive[new_archive.files[0]]

    # predict and assign clusters for the new data
    x_spec = spectral_net_model.predict(new_features)
    clusters_assignments = km.predict(x_spec)

    # save the data to give back to clustering class (new python environment easier to use)
    np.savez_compressed(os.path.join(direc, f'{out_name}.npz'), data=clusters_assignments)

    # re-predict the initial data to get cluster numbers consistent with the
    # assignments of the new data
    x_spec_orig = spectral_net_model.predict(X_train_start)
    cluster_assignments_orig = km.predict(x_spec_orig)

    np.savez_compressed(os.path.join(direc, f'{out_name}_original_clusters.npz'),
                        data=cluster_assignments_orig)


# Rebuild the networks from the weights saved by train_and_save and predict clusters.
load_and_predict(
    refData='testData/test_clusterBcov.npz',
    siamWeightsPath='testData/testNet_siamese_net.tf',
    specWeightsPath='testData/testNet_spectral_net.tf',
)
    
    


(22000, 12) (22000,) (5877, 12) (5877,)
computing k=2 nearest neighbors...
creating pairs...
ks 19800 2 2 2
Iter: 0/19800
Iter: 10000/19800
computing k=2 nearest neighbors...
creating pairs...
ks 2200 2 2 2
Iter: 0/2200
Epoch 1/1
Epoch: 0, loss=419.167578, val_loss=2.478963
finished training
confusion matrix: 
[[2345    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    2    0    0  371    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0 1625    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0]
 [   0    2    0    0 2103    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0

In [4]:
#the data is fed to the siamese net to predict neighbors
#that prediction is then used by the spectral net to predict graph Laplacian embeddings (TODO: confirm)
#k-means then organizes the data in the graph Laplacian space for ideal clustering

In [5]:
#testData for training
# tf.test.is_gpu_available()
# NOTE: allow_pickle=True can execute arbitrary code while unpickling; only load trusted archives.
# The context manager closes the archive once its arrays have been read into memory.
with np.load('data/refData.npz', allow_pickle=True) as rr:
    #arrays are unpacked in the order they were saved:
    #y_start is the assigned cluster labels from real spectral clustering
    #y_name_start is the name of the protein
    #X_train_start is the feature matrix for spectral clustering
    #(original note: midpoint distance + dihedral angles between helices)
    y_start , y_name_start, X_train_start, featNames  = [rr[f] for f in rr.files]

X_train_start = X_train_start[:,:-8] #drop the last 8 columns (original note: remove phi values and length)

#Warning! train/test split is hard coded here for the original data
X_test = X_train_start[22000:,:]
y_test = y_start[22000:]

X_train = X_train_start[:22000,:]
y_train = y_start[:22000]

#run this to organize the data into the appropriate dictionary formats
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

new_dataset_data = (X_train, X_test, y_train, y_test)
ata = get_data(params,new_dataset_data)

(22000, 12) (22000,) (5877, 12) (5877,)
computing k=2 nearest neighbors...
creating pairs...
ks 19800 2 2 2
Iter: 0/19800
Iter: 10000/19800
computing k=2 nearest neighbors...
creating pairs...
ks 2200 2 2 2
Iter: 0/2200


In [6]:
#train net with this code
#siamese_net_model, spectral_net_model, x_spectralnet, y_spectralnet,km = run_net(ata, params)

In [7]:
#save the weights for loading by spectralNet_FromWeights
# spectral_net_model.net.save_weights('data/SpecNet_bCov4H_cluster26_NN10_weights_jul1.tf')
# siamese_net_model.net.save_weights('data/Siamese_SpecNet_bCov4H_cluster26_NN10_weights_jul1.tf')

In [8]:
#due to the special layers and old format, remake the network per the regular training code
#and load the saved weights into it afterwards
specWeightsPath='data/SpecNet_bCov4H_cluster26_NN10_weights_jul1.tf'
siamWeightsPath='data/Siamese_SpecNet_bCov4H_cluster26_NN10_weights_jul1.tf'

km, siamese_net_model, spectral_net_model = spectralNet_FromWeights(
    ata,
    params,
    siamWeightsPath,
    specWeightsPath,
)

W0813 00:27:16.303197 10756 deprecation_wrapper.py:119] From C:\Users\Crimson_King\anaconda3\envs\specNetGPU\lib\site-packages\keras\backend\tensorflow_backend.py:488: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0813 00:27:16.305198 10756 deprecation_wrapper.py:119] From C:\Users\Crimson_King\anaconda3\envs\specNetGPU\lib\site-packages\keras\backend\tensorflow_backend.py:63: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0813 00:27:16.306198 10756 deprecation_wrapper.py:119] From C:\Users\Crimson_King\anaconda3\envs\specNetGPU\lib\site-packages\keras\backend\tensorflow_backend.py:3626: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0813 00:27:16.344207 10756 deprecation.py:506] From C:\Users\Crimson_King\anaconda3\envs\specNetGPU\lib\site-packages\keras\backend\tensorflow_backend.py:1238: calling reduce_sum_v1 (from tensorflow.python.ops.math_ops) with keep_di

Epoch 1/1


W0813 00:27:19.672891 10756 deprecation_wrapper.py:119] From SpectralNet-master/src\core\networks.py:137: The name tf.train.RMSPropOptimizer is deprecated. Please use tf.compat.v1.train.RMSPropOptimizer instead.

W0813 00:27:19.766862 10756 deprecation.py:506] From C:\Users\Crimson_King\anaconda3\envs\specNetGPU\lib\site-packages\tensorflow\python\training\rmsprop.py:119: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Epoch: 0, loss=444.169343, val_loss=2.219822
finished training
confusion matrix: 
[[   0 2345    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    1    0  471    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [   0  536    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    2    0    0    0    0    0 2103    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0  955    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0 1625    0    0    0]
 [   0    0    0    0    0    0    0    0    0    1 1453    0    0    0
     0    0    0    0    0    0    0

In [9]:
#load the data to predict
direc = 'data/'
name = 'to_predict'
# NOTE: allow_pickle=True can execute arbitrary code while unpickling; only load trusted archives.
# The context manager closes the archive once its arrays have been read into memory.
with np.load(f'{direc}{name}.npz', allow_pickle=True) as rr:
    data = [rr[f] for f in rr.files]

In [10]:
#predict and assign clusters for new data
#data[0] is the first array stored in the loaded archive; it is embedded by the
#spectral net and the embedding is then assigned to clusters by the fitted k-means object
x_spec = spectral_net_model.predict(data[0])
clusters_assignments = km.predict(x_spec)



In [11]:
#save the data to give back to clustering class (new python environment easier to use)
#NOTE: this rebinds `name`, which earlier held the base name of the input archive ('to_predict')
name='clusters_specNet'
#assignments are stored under the key 'data' in {direc}{name}.npz
np.savez_compressed(f'{direc}{name}.npz',data = clusters_assignments)


In [12]:

#re-predict the initial data to get consistent cluster numbers
#for the original dataset assignments

#X_train_start is the full reference feature matrix (train and test rows together)
x_spec_orig = spectral_net_model.predict(X_train_start)
cluster_assignments_orig = km.predict(x_spec_orig)

name_orig = 'original_clusters'

#assignments are stored under the key 'data' in {direc}{name_orig}.npz
np.savez_compressed(f'{direc}{name_orig}.npz', data=cluster_assignments_orig)

In [13]:
#confirm the accuracy of the reloaded model on the held-out test rows
x_spectralnet = spectral_net_model.predict(X_test)
kmeans_assignments = km.predict(x_spectralnet)

# get_y_preds was previously called twice with identical arguments; call it once.
# presumably it relabels the k-means cluster ids to best match y_test -- confirm in core.util
y_pred, confusion_matrix = get_y_preds(kmeans_assignments, y_test, params['n_clusters'])
y_spectralnet = y_pred  # name kept so anything that still reads `y_spectralnet` keeps working

# calculate the accuracy
np.mean(y_pred == y_test)

0.9555895865237366