# The functions in this file are used to generate datasets for machine-learning problems.


In [2]:
import tensorflow as tf
import numpy as np
import copy
import os  # For starting up tensorboard from inside python
import matplotlib.pyplot as PLT
import scipy.cluster.hierarchy as SCH  # Needed for dendrograms
import numpy.random as NPR

## SESSION HANDLING

In [None]:
def gen_initialized_session(dir='probeview'):
    """Create a tf.Session, attach a probe stream to it, and initialize all global variables.

    dir: directory passed to viewprep; also stored on the session as `viewdir`.
    Returns the new session.
    """
    session = tf.Session()
    # Two extra slots are hung on the session object: the stream returned by viewprep, and its directory.
    session.probe_stream = viewprep(session, dir=dir)
    session.viewdir = dir
    initializer = tf.global_variables_initializer()
    session.run(initializer)
    return session

In [None]:
def copy_session(sess1):
    """Open a new tf.Session that shares sess1's probe stream and view directory.

    The probe stream object itself is shared (not duplicated) and is reopened before returning.
    Nothing else is transferred from sess1 by this function.
    """
    clone = tf.Session()
    stream = sess1.probe_stream
    clone.probe_stream = stream
    stream.reopen()
    clone.viewdir = sess1.viewdir
    return clone

In [None]:
# Simple evaluator of a TF operator.
def tfeval(operators):
    """Evaluate TF operator(s) in a throw-away session and return the resulting value(s).

    operators: whatever tf.Session.run accepts as fetches (a single operator or a list of them).
    Returns the output of sess.run(operators): one output value per operator.
    The session is always closed, even when initialization or evaluation raises.
    """
    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer())
        return sess.run(operators)
    finally:
        # Release the session's resources on both the success and the error path.
        sess.close()

# TENSORBOARD SUPPORT

In [None]:
# This creates the main data for tensorboard viewing: the graph and variable histories.
def viewprep(session, dir='probeview',flush=120,queue=10):
    """Clear the log directory and open a tf.summary.FileWriter on it for the session's graph.

    session: object whose `graph` attribute is handed to the writer.
    dir:     log directory (cleared first via clear_tensorflow_log).
    flush:   forwarded as the writer's flush_secs.
    queue:   forwarded as the writer's max_queue.
    Returns the FileWriter.
    """
    # Clear first: without this, the directory fills up with unusable files.
    clear_tensorflow_log(dir)
    writer = tf.summary.FileWriter(dir, session.graph, max_queue=queue, flush_secs=flush)
    return writer

To view probes, the function graph, etc., do this at the command line:
	$ tensorboard --logdir=probeview
Then open a Chrome browser and go to site:  localhost:6006


In [None]:
def gen_segmented_vector_cases(vectorlen,count,minsegs,maxsegs,poptargs=True):
    """Generate `count` cases of the form [segmented vector, target].

    vectorlen: length handed to gen_segmented_vector.
    count:     number of cases to generate.
    minsegs, maxsegs: inclusive range from which each case's segment count is drawn at random.
    poptargs:  when True the target is a one-hot vector with (maxsegs - minsegs + 1) slots;
               when False the target is the raw segment count.
    Returns a list of [vector, target] pairs.
    """
    num_classes = maxsegs - minsegs + 1
    cases = []
    for _ in range(count):
        numsegs = NPR.randint(minsegs, maxsegs + 1)  # randint excludes the upper bound, hence +1
        vector = gen_segmented_vector(vectorlen, numsegs)
        if poptargs:
            # The target has one slot per possible count in [minsegs, maxsegs], so the slot index must be
            # offset by minsegs; using numsegs directly overruns the vector whenever minsegs > 0.
            # NOTE(review): assumes int_to_one_hot(k, size) turns on position k of a length-`size` vector - confirm.
            target = int_to_one_hot(numsegs - minsegs, num_classes)
        else:
            target = numsegs
        cases.append([vector, target])
    return cases

# PRIMITIVE DATA VIEWING

In [None]:
def segment_count(vect,onval=1,offval=0):
    """Count the places in vect where an `onval` element directly follows an `offval` element.

    The position before the first element is treated as holding `offval`, so a vector that starts
    with `onval` contributes one to the count.
    """
    total = 0
    previous = offval
    for current in vect:
        starts_segment = (current == onval and previous == offval)
        if starts_segment:
            total += 1
        previous = current
    return total

In [3]:
# This produces a string consisting of the binary vector followed by the segment count surrounded by a few symbols
# and/or blanks.  These strings are useful to use as labels during dendrogram plots, for example.
def segmented_vector_string(v,pre='** ',post=' **'):
    """Build a label string: the binarized vector, then `pre`, the segment count of v, and `post`.

    Each element of v is rendered as '1' when it is greater than zero and as '0' otherwise.
    The segment count is computed by segment_count on the original (un-binarized) v.
    """
    bits = ''.join('1' if x > 0 else '0' for x in v)
    return bits + pre + str(segment_count(v)) + post

# PRIMITIVE DATA VIEWING

In [4]:
def show_results(grabbed_vals,grabbed_vars=None,dir='probeview'):
    """Print the grabbed values, each labelled with its variable's name when variables are supplied.

    grabbed_vals: values to print.
    grabbed_vars: optional objects with a `.name` attribute, parallel to grabbed_vals.
                  When omitted (None) the values are printed without labels.
    dir:          accepted for interface compatibility; not used here.
    """
    # The default of None previously crashed the comprehension; showvars already copes with names=None.
    names = [x.name for x in grabbed_vars] if grabbed_vars is not None else None
    showvars(grabbed_vals, names=names, msg="The Grabbed Variables:")

In [5]:
def showvars(vals,names=None,msg=""):
    """Print a header message and then every value, each followed by a blank line.

    vals:  iterable of values to print.
    names: optional sequence of labels indexed in parallel with vals; when given (and non-empty),
           each value is preceded by a line of the form '   <name> = '.
    msg:   header text, printed after one leading blank line.
    """
    print("\n" + msg)
    for index, value in enumerate(vals):
        if names:
            print("   " + names[index] + " = ")
        print(value, end="\n\n")

In [6]:
# Very simple printing of a matrix using the 'style' format for each element.
def pp_matrix(m,style='{:.3f}'):
    """Print a 2-D matrix one row per line, formatting every element with `style`.

    m:     object with a two-element `.shape` that supports m[r][c] indexing.
    style: format string applied to each element; each formatted element is followed by one space.
    Output starts with a blank line and ends with a newline after the last row.
    """
    row_count, col_count = m.shape
    for r in range(row_count):
        print()  # move to a fresh line before each row
        row_text = ''.join(style.format(m[r][c]) + ' ' for c in range(col_count))
        print(row_text, end='')
    print()

#  DATA PLOTTING ROUTINES 

In [8]:
def simple_plot(yvals,xvals=None,xtitle='X',ytitle='Y',title='Y = F(X)'):
    """Line-plot yvals against xvals on the current pyplot axes and set axis labels and title.

    yvals:  values for the vertical axis.
    xvals:  values for the horizontal axis; when None, the indices 0..len(yvals)-1 are used.
    xtitle, ytitle, title: axis labels and plot title.
    """
    if xvals is None:
        xvals = list(range(len(yvals)))
    PLT.plot(xvals, yvals)
    PLT.xlabel(xtitle)
    PLT.ylabel(ytitle)
    PLT.title(title)
    PLT.draw()

In [9]:
# Each history is a list of pairs (timestamp, value).
def plot_training_history(error_hist,validation_hist=[],xtitle="Epoch",ytitle="Error",title="History",fig=True):
    """Plot a training-error history and, optionally, a validation history on the same axes.

    error_hist, validation_hist: sequences of (timestamp, value) pairs; an empty history is skipped.
                                 (The list default is never mutated here.)
    xtitle, ytitle, title: axis labels and plot title, applied to the shared axes.
    fig: when true, a new figure is opened first; otherwise the current figure is drawn on.
    """
    PLT.ion()
    if fig: PLT.figure()
    # No PLT.hold(True): pyplot.hold was removed in matplotlib 3.0, and successive plot calls
    # already overlay on the same axes by default.
    for history in (error_hist, validation_hist):
        if len(history) > 0:
            # Pass the titles on every call; otherwise the second plot would reset the labels
            # to simple_plot's defaults ('X', 'Y', 'Y = F(X)').
            simple_plot([p[1] for p in history], [p[0] for p in history],
                        xtitle=xtitle, ytitle=ytitle, title=title)
    PLT.ioff()

In [10]:
# alpha = transparency
def simple_scatter_plot(points,alpha=0.5,radius=3):
    """Scatter-plot a collection of 2-D points, colouring them by cycling through a fixed palette.

    points: sequence of points; after conversion to an array and transposition, row 0 is used as the
            x coordinates and row 1 as the y coordinates.
    alpha:  marker transparency.
    radius: marker radius; the marker area passed to scatter is pi * radius**2.
    """
    palette = ['red','green','blue','magenta','brown','yellow','orange','brown','purple','black']
    a = np.array(points).transpose()
    # scatter needs one colour per point, so repeat the palette to match the point count.
    # Passing the raw 10-entry list fails in current matplotlib unless there are exactly 10 points.
    point_colors = [palette[i % len(palette)] for i in range(len(a[0]))]
    PLT.scatter(a[0],a[1],c=point_colors,alpha=alpha,s=np.pi*radius**2)
    PLT.draw()

This is Hinton's classic plot of a matrix (which may represent snapshots of weights or a time series of activation values).  Each value is represented by a red (positive) or blue (negative) square whose size reflects the absolute value.  This works best when maxsize is hardwired to 1.  The transpose (trans) arg defaults to true so that matrices are plotted with rows along a horizontal plane, with the 0th row on top.

The 'colors' argument, a list, is ordered as follows: background, positive-value, negative-value, box-edge. If you do not want to draw box edges, just use 'None' as the 4th color.  A gray-scale combination that mirrors Hinton's original version is ['gray','white','black',None]


In [12]:
def hinton_plot(matrix, maxval=None, maxsize=1, fig=None,trans=True,scale=True, title='Hinton plot',
                colors=['gray','red','blue','white']):
    """Draw a Hinton diagram: one square per matrix entry, sized by magnitude and coloured by sign.

    matrix:  2-D numpy array of values.
    maxval:  magnitude that maps to the largest square; defaults to the largest absolute entry.
    maxsize: upper bound on the (pre-square-root) box size; when falsy it is derived from maxval
             as the next power of two.
    fig:     figure to draw on; a new one is created when omitted.
    trans:   transpose the matrix before plotting.
    scale:   when true, sizes are scaled relative to maxval (with a floor of 0.01); otherwise the raw
             absolute value, capped at maxsize, is used.
    title:   figure title.
    colors:  [background, positive-value, negative-value, box-edge]; never mutated here.
    """
    hfig = fig if fig else PLT.figure()
    hfig.suptitle(title,fontsize=18)
    if trans: matrix = matrix.transpose()
    if maxval is None: maxval = np.abs(matrix).max()
    # An all-zero matrix (or an explicit maxval of 0) would make the scaling below divide by zero and
    # yield NaN sizes; fall back to 1 so that zero entries get the minimum box size instead.
    if maxval == 0: maxval = 1
    if not maxsize: maxsize = 2**np.ceil(np.log(maxval)/np.log(2))

    axes = hfig.gca()
    axes.clear()
    axes.patch.set_facecolor(colors[0])  # This is the background color.  Hinton uses gray
    axes.set_aspect('auto','box')  # Options: ('equal'), ('equal','box'), ('auto'), ('auto','box')..see matplotlib docs
    axes.xaxis.set_major_locator(PLT.NullLocator()); axes.yaxis.set_major_locator(PLT.NullLocator())

    ymax = (matrix.shape[1] - 1)* maxsize
    for (x, y), val in np.ndenumerate(matrix):
        color = colors[1] if val > 0 else colors[2]  # Hinton uses white = pos, black = neg
        if scale: size = max(0.01,np.sqrt(min(maxsize,maxsize*np.abs(val)/maxval)))
        else: size = np.sqrt(min(np.abs(val),maxsize))  # The original version did not include scaling
        bottom_left = [x - size / 2, (ymax - y) - size / 2] # (ymax - y) to invert: row 0 at TOP of diagram
        blob = PLT.Rectangle(bottom_left, size, size, facecolor=color, edgecolor=colors[3])
        axes.add_patch(blob)
    axes.autoscale_view()
    PLT.draw()
    PLT.pause(.001)

This graphically displays a matrix with color codes for positive, negative, small positive and small negative, with the latter 2 defined by the 'cutoff' argument.  The transpose (trans) arg defaults to True so that matrices are plotted with rows along a horizontal plane, with the 0th row on top. Colors denote: [positive, small positive, small negative, negative]

In [13]:
def display_matrix(matrix,fig=None,trans=True,scale=True, title='Matrix',tform='{:.3f}',tsize=12,
                   cutoff=0.1,colors=['red','yellow','grey','blue']):
    """Show a matrix as a grid of formatted numbers in colour-coded text boxes.

    matrix: 2-D numpy array of values.
    fig:    figure to draw on; a new one is created when omitted.
    trans:  transpose the matrix before plotting.
    scale:  accepted but not used by this function.
    title:  figure title.
    tform:  format string applied to each value.
    tsize:  text size.
    cutoff: threshold separating "large" from "small" magnitudes.
    colors: [positive, small positive, small negative, negative] box colours.
    """
    figure = fig if fig else PLT.figure()
    figure.suptitle(title, fontsize=18)
    if trans:
        matrix = matrix.transpose()
    axes = figure.gca()
    axes.clear()
    axes.patch.set_facecolor('white')  # background colour
    axes.set_aspect('auto', 'box')
    for axis in (axes.xaxis, axes.yaxis):
        axis.set_major_locator(PLT.NullLocator())  # no tick marks on either axis

    top_row = matrix.shape[1] - 1
    for (x, y), val in np.ndenumerate(matrix):
        if val > 0:
            shade = colors[0] if val > cutoff else colors[1]
        elif val < -cutoff:
            shade = colors[3]
        else:
            shade = colors[2]
        left = x - 1/2
        bottom = (top_row - y) - 1/2  # (top_row - y) inverts the axis so index 0 is drawn at the TOP
        # A blank rectangle is added before each text box on purpose: per the original author's note,
        # omitting these rectangles results in only one text box being plotted.
        axes.add_patch(PLT.Rectangle([left, bottom], 1, 1, facecolor='white', edgecolor='white'))
        axes.text(left + 0.5, bottom + 0.5, tform.format(val),
                  bbox=dict(facecolor=shade, alpha=0.5, edgecolor='white'), ha='center', va='center',
                  color='black', size=tsize)
    axes.autoscale_view()
    PLT.draw()
    PLT.pause(1)

# Principal Component Analysis (PCA)
This performs the basic operations outlined in "Python Machine Learning" (pp.128-135).  It begins with
an N x K array whose rows are cases and columns are features.  It then computes the covariance matrix (of features),
which is then used to compute the eigenvalues and eigenvectors.  The eigenvectors corresponding to the largest
(absolute value) eigenvalues are then combined to produce a transformation matrix, which is applied to the original
N cases to produce N new cases, each with J (ideally J << K) features.  This is UNSUPERVISED dimension reduction.


In [14]:
def pca(features,target_size,bias=True,rowvar=False):
    """Reduce the cases in `features` to `target_size` dimensions via principal component analysis.

    features:    array (or nested sequence) of cases; with the default rowvar=False each row is a
                 case and each column a feature.
    target_size: number of components (output features) to keep.
    bias, rowvar: forwarded to np.cov.
    Returns the cases projected onto the eigenvectors with the largest-magnitude eigenvalues.
    """
    farray = features if isinstance(features,np.ndarray) else np.array(features)
    cov_mat = np.cov(farray,rowvar=rowvar,bias=bias) # rowvar=False => each var's values are in a COLUMN.
    # A covariance matrix is symmetric, so use the symmetric solver: eigh always returns real
    # eigenvalues/eigenvectors, whereas the general eig can return complex ones from round-off.
    # The sign of an individual component may differ from what eig produced; eigenvector sign is arbitrary.
    eigen_vals, eigen_vecs = np.linalg.eigh(cov_mat)
    return gen_dim_reduced_data(farray,target_size,eigen_vals, eigen_vecs)

Use the highest magnitude eigenvalues (and their eigenvectors) as the basis for feature-vector transformations that reduce the dimensionality of the data.  feature_array is N x M, where 
	N = # cases 
	M = # features


In [15]:
def gen_dim_reduced_data(feature_array,target_size,eigen_values,eigen_vectors):
    """Project feature_array onto the `target_size` eigenvectors with the largest |eigenvalue|.

    feature_array: cases to transform, one per row.
    target_size:   number of eigenvectors (output columns) to keep.
    eigen_values:  sequence of eigenvalues.
    eigen_vectors: 2-D array whose i-th COLUMN is the eigenvector for eigen_values[i].
    Returns np.dot(feature_array, W), where the columns of W are the selected eigenvectors.
    """
    # Rank eigenvalue indices by magnitude, largest first; ties keep their original order.
    ranked = sorted(range(len(eigen_values)), key=(lambda i: np.abs(eigen_values[i])), reverse=True)
    chosen = [eigen_vectors[:, i] for i in ranked[:target_size]]
    projection = np.array(chosen).transpose()
    return np.dot(feature_array, projection)

# DENDROGRAM

In [16]:
# Options:
# orientation = top, bottom, left, right (refers to location of the root of the tree)
# mode = single, average, complete, centroid, ward, median
# metric = euclidean, cityblock (manhattan), hamming, cosine, correlation ... (see scipy.spatial.distance.pdist for the full list)
def dendrogram(features,labels,metric='euclidean',mode='average',ax=None,title='Dendrogram',orient='top',lrot=90.0):
    """Cluster the feature vectors hierarchically and draw the resulting dendrogram.

    features: observations to cluster, passed to scipy's linkage.
    labels:   one label per observation, shown at the leaves.
    metric:   distance metric used by linkage.
    mode:     linkage method (single, average, complete, centroid, ward, median).
    ax:       axes to draw on; the current pyplot axes are used when omitted.
    title:    axes title.
    orient:   location of the tree's root: top, bottom, left or right.
    lrot:     rotation of the leaf labels in degrees.
    """
    ax = ax if ax else PLT.gca()
    cluster_history = SCH.linkage(features,method=mode,metric=metric)
    # Hand the target axes to scipy; otherwise the tree is drawn on the *current* axes, which need
    # not be the `ax` that receives the title and label below.
    SCH.dendrogram(cluster_history,labels=labels,orientation=orient,leaf_rotation=lrot,ax=ax)
    ax.figure.tight_layout()
    ax.set_title(title)
    # With the root on the left or right, the distance scale runs along the x axis.
    if orient in ('left','right'): ax.set_xlabel(metric + ' distance')
    else: ax.set_ylabel(metric + ' distance')
    PLT.show()