In [1]:
import matplotlib.pyplot as plt

import threading

import os
import time

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.python.client import device_lib

from sklearn.datasets import make_classification

def get_available_gpus():
    """Return the names of the local devices whose type is 'GPU'.

    The complete list of local devices reported by ``device_lib`` (every
    device type, not only GPUs) is printed as a side effect.

    Returns:
        list: The ``name`` attribute of each device with ``device_type == 'GPU'``,
        in the order the devices were reported.
    """
    devices = device_lib.list_local_devices()
    print(devices)

    gpu_names = []
    for device in devices:
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

def make_data(filepath, n_obs, n_dim, seed, K):
    """Generate a synthetic classification dataset and save it with ``np.savez``.

    Any file already present at ``filepath`` is removed first. Then ``n_obs``
    samples with ``n_dim`` features (all informative, none redundant) spread
    over ``K`` classes with one cluster per class are drawn with
    ``make_classification`` and stored under the keys ``X`` and ``Y``.

    Note: per the NumPy documentation ``np.savez`` appends ``.npz`` to a
    filename that does not already end with it, so pass a path ending in
    ``.npz`` for the clean-up and the write to target the same file.

    Args:
        filepath: Destination path of the archive.
        n_obs: Number of samples to generate.
        n_dim: Number of features.
        seed: Random state forwarded to ``make_classification``.
        K: Number of classes.

    Returns:
        bool: ``True`` once the archive has been written.

    Raises:
        OSError: If an existing ``filepath`` cannot be removed for a reason
            other than the file being absent (permissions, path is a
            directory, ...). Previously such failures were silently reported
            as "file not found".
    """
    try:
        os.remove(filepath)
    except FileNotFoundError:
        # Nothing to clean up: this is the expected case on a first run.
        # Only this error is swallowed; anything else (including
        # KeyboardInterrupt) must not be mislabelled and then followed by
        # an expensive data generation.
        print('file not found')

    (X, Y) = make_classification(n_samples            = n_obs    ,
                                 n_features           = n_dim    ,
                                 n_informative        = n_dim    ,
                                 n_redundant          = 0        ,
                                 n_classes            = K        ,
                                 n_clusters_per_class = 1        ,
                                 shuffle              = True     ,
                                 class_sep            = 1.5      ,
                                 random_state         = seed      )

    np.savez(filepath, X=X, Y=Y)

    return True

  return f(*args, **kwds)


In [2]:
# Synthetic dataset: number of observations, number of features, and the
# number of classes (also used as the number of k-means centers).
n_obs, n_dim, K = 200000000, 2, 3

# GPUs to spread the computation over; the helper also prints every local device.
GPU_names = get_available_gpus()

# Number of k-means update steps, and the random seed for data generation.
n_max_iters = 20
seed = 800594

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8439826486129689063
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 11314364416
locality {
  bus_id: 1
}
incarnation: 197756180581825395
physical_device_desc: "device: 0, name: Tesla K40m, pci bus id: 0000:04:00.0, compute capability: 3.5"
, name: "/device:GPU:1"
device_type: "GPU"
memory_limit: 11314364416
locality {
  bus_id: 1
}
incarnation: 2035865274393290917
physical_device_desc: "device: 1, name: Tesla K40m, pci bus id: 0000:05:00.0, compute capability: 3.5"
, name: "/device:GPU:2"
device_type: "GPU"
memory_limit: 11314364416
locality {
  bus_id: 1
}
incarnation: 10900420732246120775
physical_device_desc: "device: 2, name: Tesla K40m, pci bus id: 0000:08:00.0, compute capability: 3.5"
, name: "/device:GPU:3"
device_type: "GPU"
memory_limit: 11312372122
locality {
  bus_id: 1
}
incarnation: 7006096846743645924
physical_device_desc: "device: 3, name: Tesla K40m, pci bus id: 0000:09

In [None]:
# Write the synthetic dataset to disk; the next cell reads it back.
make_data(filepath='test-data.npz', n_obs=n_obs, n_dim=n_dim, seed=seed, K=K)

In [3]:
# Read features and labels back from the archive; the context manager closes
# the file once both arrays have been pulled out.
with np.load('test-data.npz') as data:
    data_X, data_Y = data['X'], data['Y']

# Byte budget per batch (2 GiB) and the bytes taken by one observation
# (bytes per element times number of features); a later cell divides one by
# the other to size the batches.
maxsize = 2 * 1024 ** 3
size_of_each = data_X.dtype.itemsize * data_X.shape[1]

# The first K observations serve as the starting centers.
initial_centers = data_X[:K]

In [4]:
# Placeholder with the dtype and shape of the full array; it is fed with
# data_X when the iterator is initialised.
data_placeholder = tf.placeholder(data_X.dtype, data_X.shape)

dataset = tf.data.Dataset.from_tensor_slices(data_placeholder)

# Observations per batch under the `maxsize` byte budget. Integer floor
# division keeps this a plain Python int (np.floor returned a float): the
# value is used as a batch size here and as a tensor dimension in
# distributed_kmeans, both of which are integer quantities.
num_items = int(maxsize // size_of_each)
dataset = dataset.batch(num_items)

iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()

In [9]:
def distributed_kmeans(GPU_names, batch_data, max_iters, sess):
    """Build a multi-GPU k-means graph for one batch and run its update steps.

    The batch is split evenly along axis 0 into one part per GPU. Each GPU
    assigns its observations to the nearest centroid and computes per-cluster
    means; the CPU then combines the per-GPU results, weighted by the
    per-cluster counts, into new global centroids.

    Reads the module-level names ``K`` (number of centers), ``initial_centers``
    (starting centroid values) and ``num_items`` (static number of rows in a
    batch).

    Args:
        GPU_names: Device names, one per partition of the batch.
        batch_data: 2-D tensor holding the observations of one batch.
        max_iters: Number of centroid update steps to run.
        sess: Session used to initialise the variables and run the updates.

    Returns:
        The ``tf.Variable`` holding the global centroids (K x M). Its value in
        ``sess`` reflects ``max_iters`` update steps; evaluate it with
        ``sess.run`` to obtain the numbers.
    """
    partial_directions = []
    partial_values = []

    with tf.name_scope('global'):
        with tf.device('/cpu:0'):
            # Pin the static row count so that the per-GPU parts have a known
            # N, which is needed below as a Python value for tf.tile/tf.reshape.
            batch_data.set_shape((int(num_items), batch_data.get_shape()[1]))
            print(batch_data)
            parts = tf.split(batch_data, len(GPU_names), 0)
            print(parts)
            global_centroids = tf.Variable(initial_centers)

    for GPU_num, GPU_name in enumerate(GPU_names):
        X_mat = parts[GPU_num]
        (N, M) = X_mat.get_shape().as_list()

        with tf.name_scope('scope_' + str(GPU_num)):
            with tf.device(GPU_name):
                ####
                # In the comments we denote:
                # => N = Number of Observations
                # => M = Number of Dimensions
                # => K = Number of Centers
                ####
                # Data for GPU GPU_num to cluster. Stored in a variable, so
                # its value is captured when the variables are initialised.
                X = tf.Variable(X_mat)

                # Expand centroids and points to the common shape N x K x M
                # so they can be subtracted element-wise.
                rep_centroids = tf.reshape(tf.tile(global_centroids, [N, 1]), [N, K, M])
                rep_points = tf.reshape(tf.tile(X, [1, K]), [N, K, M])

                # Squared distances, shape N x K. The square root is skipped
                # on purpose: argmin of the squared distance equals argmin of
                # the distance, so taking it would be wasted computation.
                sum_squares = tf.reduce_sum(
                    tf.square(tf.subtract(rep_points, rep_centroids)), axis=2)

                # Index of the closest centroid for each point: vector of length N.
                best_centroids = tf.argmin(sum_squares, axis=1)

                # Mean of the points assigned to each centroid; each entry is 1 x M.
                # NOTE(review): a centroid with no assigned points presumably
                # yields a NaN mean here — confirm and handle if it can occur.
                means = []
                for c in range(K):
                    members = tf.reshape(tf.where(tf.equal(best_centroids, c)), [1, -1])
                    means.append(tf.reduce_mean(tf.gather(X, members), axis=[1]))

                # K x M matrix of this GPU's per-cluster means.
                new_centroids = tf.concat(means, 0)

            with tf.device('/cpu:0'):
                # Number of points this GPU assigned to each centroid (length K).
                y_count = tf.cast(
                    tf.bincount(tf.to_int32(best_centroids), maxlength=K, minlength=K),
                    dtype=tf.float64)

                # Means scaled back to per-cluster sums, shape M x K, so that
                # results from different GPUs can simply be added.
                partial_mu = tf.multiply(tf.transpose(new_centroids), y_count)

                partial_directions.append(y_count)
                partial_values.append(partial_mu)

    with tf.name_scope('global'):
        with tf.device('/cpu:0'):
            sum_direction = tf.add_n(partial_directions)
            sum_mu = tf.add_n(partial_values)

            # Total sums divided by total counts give the new centers (K x M).
            rep_sum_direction = tf.reshape(tf.tile(sum_direction, [M]), [M, K])
            new_centers = tf.transpose(tf.div(sum_mu, rep_sum_direction))

            update_centroid = tf.group(global_centroids.assign(new_centers))

    init = tf.global_variables_initializer()
    sess.run(init)

    # Actually execute the update op. Previously the loop only rebound Python
    # names to the graph objects, so the centroids never left their initial
    # values, and it iterated over a global instead of `max_iters`.
    for _ in range(max_iters):
        sess.run(update_centroid)

    return global_centroids
    
    

In [None]:
# Session options: soft placement on, GPU memory grown on demand, BFC allocator.
config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allocator_type = 'BFC'
config.gpu_options.allow_growth = True

# Created here but not run in this cell.
init = tf.global_variables_initializer()

with tf.Session(config=config) as sess:
    # Feed the in-memory array into the placeholder and start the iterator.
    sess.run(iterator.initializer, feed_dict={data_placeholder: data_X})

    # Pull batches until the iterator signals that the data is exhausted.
    while True:
        try:
            item = sess.run(next_element)
        except tf.errors.OutOfRangeError:
            break
        else:
            # Disabled alternative that clusters each batch instead of printing it:
            # item = sess.run(distributed_kmeans(GPU_names, next_element, n_max_iters, sess))
            print(item)