In [4]:
%matplotlib inline
%pylab inline
import pylab
pylab.rcParams['figure.figsize'] = (10.0, 10.0)
from tsne import bh_sne # thi is the correct tsne to use.  It's the one discussed btnw
import sklearn.manifold
from matplotlib import pyplot as plt
import pandas
import scipy
import numpy as np
import os
import gc
from time import gmtime, strftime
import seaborn as sns
from os import listdir
from os.path import isfile, join
import math
from scipy.stats.stats import pearsonr
import random as rand
from sklearn.preprocessing import normalize
from collections import defaultdict
def memo(f):
    memo = {}
    def helper(x):
        if x not in memo:            
            memo[x] = f(x)
        return memo[x]
    return helper

def zero_to_one(array):
    array = array - np.min(array)
    array = array/np.max(array)
    return np.nan_to_num(array)


@memo
def load_dataset(path, scale=True):
    gc.collect()
    data = pandas.read_hdf(path, '/df')
    df = pandas.DataFrame(data)
    if scale:
        for label in df._get_numeric_data().columns:
            if label != 'hcad':
                df[label] = df[label].astype(float)
                df[label] = zero_to_one(df[label])
                df[label][df[label] > 1] = 1.0
    df['hcad'] = df['hcad'].astype(int)
    df = df.replace([np.inf, -np.inf], 1)
    
    return df.sort(['hcad']).fillna(0)
    
    


def tsne(df_data, dest_folder, n = None, file_tag= "", embedded_dimensions=2, perplexity = 50):
    result_2d = {}
    result_2d['hcad'] = df_data['hcad'][:n]
    df_data = df_data.drop('hcad', 1) # don't embed the hcad number!
    df_data = np.array(df_data)[:n]
    embedding = bh_sne(np.array(df_data)[:n], perplexity=perplexity)
#     embedding = bh_sne(np.array(df_data))

    result_2d['x'] = zero_to_one(embedding[:, 0])
    result_2d['y'] = zero_to_one(embedding[:, 1])
    result_2d = pandas.DataFrame.from_dict(result_2d)
    #name = file_tag+"_"+"_".join(df_data.columns)[:40] + "_n:"+str(len(result))
    #result.to_pickle(dest_folder+name)
    return embedding

def hist_2d(vis_x,vis_y):
    hh, locx, locy = scipy.histogram2d(vis_x, vis_y, bins=[200,200])
    fig = plt.figure(frameon=False)
    fig.set_size_inches(30,30)
    plt.imshow(np.flipud(hh.T),cmap='jet', interpolation='none', shape = (1,1))
    plt.colorbar()
    
def get_where_img0_is_1(pddf):
    img0_metadata = (META.loc[META['img0'] == 1])
    return pddf.loc[pddf['hcad'].isin(list(img0_metadata['hcad']))]

def pairwise_plot(pddf, sqrt = False):
    if sqrt:
        pddf = np.sqrt(pddf)
    axes = pandas.tools.plotting.scatter_matrix(pddf, alpha=0.2)
    plt.tight_layout()
    plt.show()
    

def fast_show_ratio_plot(xy_points, y_data, log = False, normalize_buckets=True):
    if log:
        y_data = np.log(y_data)
    fig = plt.figure(frameon=False)
    fig.set_size_inches(3,3)
    plt.hist(y_data)
    plt.show()

    buckets = defaultdict(list)
    resolution = 200
    x = np.array(xy_points['x'])
    y = np.array(xy_points['y'])
    H, xedges, yedges = numpy.histogram2d(x,y, bins=resolution, weights = y_data)
    H_nums, dummy2, dummy1 = numpy.histogram2d(x,y, bins=resolution)
    plt.show()
    fig = plt.figure(frameon=False)
    fig.set_size_inches(12,12)
    if normalize_buckets:
        H=H/H_nums
    H[H_nums == 0.0] = numpy.nan
#     if log:
#         H = np.log(H)
    

    plt.imshow(H, 
               interpolation='nearest', cmap=cm.gist_rainbow)
    plt.colorbar()
    plt.show()
    return np.nan_to_num(H)

def colored_scatter(xy_points, y_data):
        fig = plt.figure(frameon=False)
        fig.set_size_inches(20,20)
        plt.scatter(xy_points['x'], xy_points['y'], c=y_data,  marker='x', facecolor='b', cmap='jet')
        plt.colorbar()
        plt.show()
        
def load_mega_hcad():
    # hcad = load_dataset("/home/isaac/Dropbox/data_for_brian/hcad_features/hcad_df.hd")
    hcad_data = [load_dataset("/home/isaac/Dropbox/data_for_brian/hcad_features/hcad_df_100.hd"),
     load_dataset("/home/isaac/Dropbox/data_for_brian/hcad_features/hcad_df_200.hd"),
     load_dataset("/home/isaac/Dropbox/data_for_brian/hcad_features/hcad_df_400.hd"),
    load_dataset("/home/isaac/Dropbox/data_for_brian/hcad_features/hcad_df_1000.hd")]

    mega_hcad = {}

    for column in hcad_data[0]:
        for index, dataset in enumerate(hcad_data):
            mega_hcad[column+"_"+str(index)] = dataset[column]
    mega_hcad = pandas.DataFrame.from_dict(mega_hcad).as_matrix()
    y_data_np = Y_DATA.as_matrix()
    X_train = np.expand_dims(mega_hcad[:600000], axis=1)
    y_train = y_data_np[:600000, 1]
    print("y train",y_train.shape)
    X_val = np.expand_dims(mega_hcad[600000:700000], axis=1)
    y_val = y_data_np[600000:700000, 1]
    X_test = np.expand_dims(mega_hcad[700000:], axis=1)
    y_test = y_data_np[700000:, 1]
    return X_train, y_train, X_val, y_val, X_test, y_test

# tsne_embed = pandas.read_pickle("/home/isaac/Desktop/devika/gitignored/img1_hcad/_mean_accrued_depr_pct_std_accrued_depr_p_n:104878")
# hist_2d(np.array(tsne_embed['x']),np.array(tsne_embed['y']))



Populating the interactive namespace from numpy and matplotlib


### load all the data at 200m

In [6]:
hcad = load_dataset("/home/isaac/Dropbox/data_for_brian/hcad_features/hcad_df_200.hd")
# hcad = hcad[['hcad', 'mean_accrued_depr_pct', 'mean_bld_val', 'mean_land_val','mean_quality','mean_rcnld', 'mean_tot_mkt_val','mean_year_built','mean_year_remodeled']]
META = load_dataset("/home/isaac/Dropbox/data_for_brian/meta/df_meta.hd")
WIND = load_dataset("/home/isaac/Dropbox/data_for_brian/wind_features/hcad_interp_withoutpartial_rad200_hist16x16.mat.hd")
TERRAIN = load_dataset("/home/isaac/Dropbox/data_for_brian/terrain_features/dsmgrid/terrain_200.hd")

Y_DATA = load_dataset("/home/isaac/Dropbox/data_for_brian/y_df.hd")
img0_y_data = get_where_img0_is_1(Y_DATA)


# @memo
img0_terrain_data = get_where_img0_is_1(TERRAIN)
img0_wind_data = get_where_img0_is_1(WIND)
img0_hcad_data = get_where_img0_is_1(hcad)
img0_metadata = (META.loc[META['img0'] == 1])
# print get_where_img0_is_1(WIND)
def plot_on_map(pddf, meta = META):
    for col in pddf.columns:
        print("\n\n\n",col)
        xy = pandas.DataFrame.from_dict({'x': -meta['pointx'],'y': meta['pointy']})
        print("linear plot")
        fast_show_ratio_plot(xy,np.array(pddf[col]))
        print("log plot")
        fast_show_ratio_plot(xy,np.array(pddf[col]), log = True)
        colored_scatter(xy,np.array(pddf[col]))

Opening /home/isaac/Dropbox/data_for_brian/meta/df_meta.hd in read-only mode
Opening /home/isaac/Dropbox/data_for_brian/wind_features/hcad_interp_withoutpartial_rad200_hist16x16.mat.hd in read-only mode
Opening /home/isaac/Dropbox/data_for_brian/terrain_features/dsmgrid/terrain_200.hd in read-only mode
Opening /home/isaac/Dropbox/data_for_brian/y_df.hd in read-only mode


In [5]:
Y_DATA = load_dataset("/home/isaac/Dropbox/data_for_brian/y_df.hd")


Opening /home/isaac/Dropbox/data_for_brian/y_df.hd in read-only mode


### make an embedding

In [None]:
embedding_2d = tsne(img0_hcad_data ,"/home/isaac/Desktop/devika/gitignored/6_dimensions_hcad_img0/", 
                 file_tag ="hcad_img0", n=None, embedded_dimensions=2)
fast_show_ratio_plot(pandas.DataFrame.from_dict({'x': embedding_2d[:,0],
                                             'y': embedding_2d[:,1]}), np.array(img0_y_data['y200_mean']))
# print embedding_2d

### kmeans cluster the data

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
kmeans = KMeans(init='k-means++', n_clusters=30)
kmeans.fit(embedding_2d)
print kmeans.labels_

# np.random.shuffle(kmeans.labels_)
colored_scatter(pandas.DataFrame.from_dict({'x': - img0_metadata ['pointx'][:len(kmeans.labels_)],
                                            'y': img0_metadata['pointy'][:len(kmeans.labels_)]}), kmeans.labels_)
fast_show_ratio_plot(pandas.DataFrame.from_dict({'x': - img0_metadata ['pointx'][:len(kmeans.labels_)],
                                                 'y': img0_metadata['pointy'][:len(kmeans.labels_)]}),kmeans.labels_)

for label in range(max(kmeans.labels_)+1):
    print "class", label, "damage:",np.mean(np.array(img0_y_data['y200_mean'])[numpy.where(kmeans.labels_==label)])

In [None]:
print pandas.read_pickle("/home/isaac/Desktop/devika/gitignored/testing_refactored_code/_mean_accrued_depr_pct_std_accrued_depr_p_n:100")

In [None]:
from sklearn.datasets import load_digits
digits = load_digits()

In [None]:
# print digits

In [None]:
images = digits['images']
tsne(images.reshape(1797,64), "/home/isaac/Desktop/devika/gitignored/digits_test/")

In [None]:
# print np.array(tsne_embed['x'])[0]
# show_ratio_plot(tsne_embed, hcad['mean_bld_val'])
# print get_meta()
# print Y_DATA

plot_on_map(img0_hcad_data, meta = img0_metadata)

 ### create a [point, 100m, 200m, ... , point, 100m, 200m, ...] data table

In [7]:


# print load_mega_hcad()[0].shape
import theano
import theano.tensor as T
# theano.config.optimizer='fast_compile'
# theano.config.exception_verbosity='high'

import lasagne
from __future__ import print_function

import sys
import os
import time



def build_cnn(input_var=None):
    # As a third model, we'll create a CNN of two convolution + pooling stages
    # and a fully-connected hidden layer in front of the output layer.

    # Input layer, as usual:
    network = lasagne.layers.InputLayer(shape=(None,1, 260),
                                        input_var=input_var)
    # This time we do not apply input dropout, as it tends to work less well
    # for convolutional layers.

    # Convolutional layer with 32 kernels of size 5x5. Strided and padded
    # convolutions are supported as well; see the docstring.
    network = lasagne.layers.Conv1DLayer(
            network, num_filters=32, filter_size=5,
            nonlinearity=lasagne.nonlinearities.rectify,
            W=lasagne.init.GlorotUniform())
    # Expert note: Lasagne provides alternative convolutional layers that
    # override Theano's choice of which implementation to use; for details
    # please see http://lasagne.readthedocs.org/en/latest/user/tutorial.html.

    # Max-pooling layer of factor 2 in both dimensions:
#     network = lasagne.layers.MaxPool1DLayer(network, pool_size=2)

    # Another convolution with 32 5x5 kernels, and another 2x2 pooling:
#     network = lasagne.layers.Conv2DLayer(
#             network, num_filters=32, filter_size=(5, 5),
#             nonlinearity=lasagne.nonlinearities.rectify)
#     network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))

    # A fully-connected layer of 256 units with 50% dropout on its inputs:
    network = lasagne.layers.DenseLayer(
            lasagne.layers.dropout(network, p=.5),
            num_units=50,
            nonlinearity=lasagne.nonlinearities.rectify)

    # And, finally, the 10-unit output layer with 50% dropout on its inputs:
    network = lasagne.layers.DenseLayer(
            lasagne.layers.dropout(network, p=.5),
            num_units=1,
            nonlinearity=lasagne.nonlinearities.softmax)

    return network


# ############################# Batch iterator ###############################
# This is just a simple helper function iterating over training data in
# mini-batches of a particular size, optionally in random order. It assumes
# data is available as numpy arrays. For big datasets, you could load numpy
# arrays as memory-mapped files (np.load(..., mmap_mode='r')), or write your
# own custom data iteration function. For small datasets, you can also copy
# them to GPU at once for slightly improved performance. This would involve
# several changes in the main program, though, and is not demonstrated here.

def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    assert len(inputs) == len(targets)
    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield inputs[excerpt], targets[excerpt]


# ############################## Main program ################################
# Everything else will be handled in our main program now. We could pull out
# more functions to better separate the code, but it wouldn't make it any
# easier to read.

def main(num_epochs=500):
    # Load the dataset
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_mega_hcad()
    #print(X_train)
    #print(y_train)

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor3('inputs')
    target_var = T.ivector('targets')

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")

    network = build_cnn(input_var)

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.squared_error(prediction, target_var) ## changed from categorical cross entropy
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
            loss, params, learning_rate=0.01, momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                            target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, target_var], loss, updates=updates, allow_input_downcast=True)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, 500, shuffle=True):
            inputs, targets = batch
            print(inputs.shape)
            print(targets)
            train_err += train_fn(inputs.transpose(), targets)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))

main()

Loading data...
y train (600000,)
Building model and compiling functions...
Starting training...
(500, 1, 260)
[ 1.  0.  0.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  1.  0.  0.  1.  0.  1.  1.  1.
  1.  1.  0.  0.  0.  0.  1.  1.  1.  0.  0.  0.  0.  0.  1.  0.  0.  0.
  1.  0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.
  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  0.
  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.
  0.  1.  0.  0.  0.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  1.  0.  0.  0.  1.  0.
  0.  0.  0.  0.  0.  0.  1.  0.  0.  1.  1.  0.  0.  0.  0.  0.  0.  0.
  1.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  1.  1.  0.  0.
  0.  0.  0. 

ValueError: GpuElemwise. Input dimension mis-match. Input 1 (indices start at 0) has shape[2] == 256, but the output's size on that axis is 496.
Apply node that caused the error: GpuElemwise{Composite{((i0 + Abs(i0)) * i1)},no_inplace}(GpuElemwise{Add}[(0, 0)].0, GpuElemwise{Composite{Cast{float32}(LT(i0, i1))}}[(0, 0)].0)
Toposort index: 51
Inputs types: [CudaNdarrayType(float32, 3D), CudaNdarrayType(float32, 3D)]
Inputs shapes: [(260, 32, 496), (260, 32, 256)]
Inputs strides: [(15872, 496, 1), (8192, 256, 1)]
Inputs values: ['not shown', 'not shown']
Outputs clients: [[GpuFlatten{2}(GpuElemwise{Composite{((i0 + Abs(i0)) * i1)},no_inplace}.0)]]

Debugprint of the apply node: 
GpuElemwise{Composite{((i0 + Abs(i0)) * i1)},no_inplace} [@A] <CudaNdarrayType(float32, 3D)> ''   
 |GpuElemwise{Add}[(0, 0)] [@B] <CudaNdarrayType(float32, 3D)> ''   
 | |GpuSubtensor{::, ::, int64} [@C] <CudaNdarrayType(float32, 3D)> ''   
 | | |Rebroadcast{?,?,1} [@D] <CudaNdarrayType(float32, (False, False, True, False))> ''   
 | | | |Rebroadcast{?,?,0} [@E] <CudaNdarrayType(float32, 4D)> ''   
 | | |   |GpuDnnConv{algo='small', inplace=True} [@F] <CudaNdarrayType(float32, (False, False, True, False))> ''   
 | | |     |GpuContiguous [@G] <CudaNdarrayType(float32, (False, False, True, False))> ''   
 | | |     | |GpuDimShuffle{0,1,x,2} [@H] <CudaNdarrayType(float32, (False, False, True, False))> ''   
 | | |     |   |GpuFromHost [@I] <CudaNdarrayType(float32, 3D)> ''   
 | | |     |     |inputs [@J] <TensorType(float32, 3D)>
 | | |     |GpuContiguous [@K] <CudaNdarrayType(float32, (False, False, True, False))> ''   
 | | |     | |GpuDimShuffle{0,1,x,2} [@L] <CudaNdarrayType(float32, (False, False, True, False))> ''   
 | | |     |   |W [@M] <CudaNdarrayType(float32, 3D)>
 | | |     |GpuAllocEmpty [@N] <CudaNdarrayType(float32, (False, False, True, False))> ''   
 | | |     | |Shape_i{0} [@O] <TensorType(int64, scalar)> ''   
 | | |     | | |inputs [@J] <TensorType(float32, 3D)>
 | | |     | |Shape_i{0} [@P] <TensorType(int64, scalar)> ''   
 | | |     | | |W [@M] <CudaNdarrayType(float32, 3D)>
 | | |     | |TensorConstant{1} [@Q] <TensorType(int64, scalar)>
 | | |     | |Elemwise{Composite{((((i0 + i1) - i2) // i3) + i3)}}[(0, 2)] [@R] <TensorType(int64, scalar)> ''   
 | | |     |   |Shape_i{2} [@S] <TensorType(int64, scalar)> ''   
 | | |     |   | |inputs [@J] <TensorType(float32, 3D)>
 | | |     |   |TensorConstant{0} [@T] <TensorType(int8, scalar)>
 | | |     |   |Shape_i{2} [@U] <TensorType(int64, scalar)> ''   
 | | |     |   | |W [@M] <CudaNdarrayType(float32, 3D)>
 | | |     |   |TensorConstant{1} [@V] <TensorType(int8, scalar)>
 | | |     |GpuDnnConvDesc{border_mode='valid', subsample=(1, 1), conv_mode='conv'} [@W] <CDataType{cudnnConvolutionDescriptor_t}> ''   
 | | |     | |MakeVector{dtype='int64'} [@X] <TensorType(int64, vector)> ''   
 | | |     | | |Shape_i{0} [@O] <TensorType(int64, scalar)> ''   
 | | |     | | |Shape_i{1} [@Y] <TensorType(int64, scalar)> ''   
 | | |     | | | |inputs [@J] <TensorType(float32, 3D)>
 | | |     | | |TensorConstant{1} [@Q] <TensorType(int64, scalar)>
 | | |     | | |Shape_i{2} [@S] <TensorType(int64, scalar)> ''   
 | | |     | |MakeVector{dtype='int64'} [@Z] <TensorType(int64, vector)> ''   
 | | |     |   |Shape_i{0} [@P] <TensorType(int64, scalar)> ''   
 | | |     |   |Shape_i{1} [@BA] <TensorType(int64, scalar)> ''   
 | | |     |   | |W [@M] <CudaNdarrayType(float32, 3D)>
 | | |     |   |TensorConstant{1} [@Q] <TensorType(int64, scalar)>
 | | |     |   |Shape_i{2} [@U] <TensorType(int64, scalar)> ''   
 | | |     |Constant{1.0} [@BB] <float32>
 | | |     |Constant{0.0} [@BC] <float32>
 | | |Constant{0} [@BD] <int64>
 | |GpuDimShuffle{x,0,x} [@BE] <CudaNdarrayType(float32, (True, False, True))> ''   
 |   |b [@BF] <CudaNdarrayType(float32, vector)>
 |GpuElemwise{Composite{Cast{float32}(LT(i0, i1))}}[(0, 0)] [@BG] <CudaNdarrayType(float32, 3D)> ''   
   |GPU_mrg_uniform{CudaNdarrayType(float32, 3D),inplace}.1 [@BH] <CudaNdarrayType(float32, 3D)> ''   
   | |<CudaNdarrayType(float32, vector)> [@BI] <CudaNdarrayType(float32, vector)>
   | |Elemwise{Cast{int32}} [@BJ] <TensorType(int32, vector)> ''   
   |   |MakeVector{dtype='int64'} [@BK] <TensorType(int64, vector)> ''   
   |     |Shape_i{0} [@O] <TensorType(int64, scalar)> ''   
   |     |TensorConstant{32} [@BL] <TensorType(int64, scalar)>
   |     |TensorConstant{256} [@BM] <TensorType(int64, scalar)>
   |CudaNdarrayConstant{[[[ 0.5]]]} [@BN] <CudaNdarrayType(float32, (True, True, True))>

Storage map footprint:
 - GpuElemwise{Add}[(0, 0)].0, Shape: (260, 32, 496), ElemSize: 4 Byte(s), TotalSize: 16506880 Byte(s)
 - GpuElemwise{Composite{Cast{float32}(LT(i0, i1))}}[(0, 0)].0, Shape: (260, 32, 256), ElemSize: 4 Byte(s), TotalSize: 8519680 Byte(s)
 - <CudaNdarrayType(float32, matrix)>, Shared Input, Shape: (8192, 50), ElemSize: 4 Byte(s), TotalSize: 1638400 Byte(s)
 - W, Shared Input, Shape: (8192, 50), ElemSize: 4 Byte(s), TotalSize: 1638400 Byte(s)
 - GpuFromHost.0, Shape: (260, 1, 500), ElemSize: 4 Byte(s), TotalSize: 520000 Byte(s)
 - inputs, Input, Shape: (260, 1, 500), ElemSize: 4 Byte(s), TotalSize: 520000 Byte(s)
 - <CudaNdarrayType(float32, vector)>, Shared Input, Shape: (92160,), ElemSize: 4 Byte(s), TotalSize: 368640 Byte(s)
 - <CudaNdarrayType(float32, vector)>, Shared Input, Shape: (92160,), ElemSize: 4 Byte(s), TotalSize: 368640 Byte(s)
 - GPU_mrg_uniform{CudaNdarrayType(float32, 3D),inplace}.0, Shape: (92160,), ElemSize: 4 Byte(s), TotalSize: 368640 Byte(s)
 - targets, Input, Shape: (500,), ElemSize: 4 Byte(s), TotalSize: 2000 Byte(s)
 - W, Shared Input, Shape: (32, 1, 5), ElemSize: 4 Byte(s), TotalSize: 640 Byte(s)
 - <CudaNdarrayType(float32, 3D)>, Shared Input, Shape: (32, 1, 5), ElemSize: 4 Byte(s), TotalSize: 640 Byte(s)
 - b, Shared Input, Shape: (50,), ElemSize: 4 Byte(s), TotalSize: 200 Byte(s)
 - W, Shared Input, Shape: (50, 1), ElemSize: 4 Byte(s), TotalSize: 200 Byte(s)
 - <CudaNdarrayType(float32, vector)>, Shared Input, Shape: (50,), ElemSize: 4 Byte(s), TotalSize: 200 Byte(s)
 - <CudaNdarrayType(float32, matrix)>, Shared Input, Shape: (50, 1), ElemSize: 4 Byte(s), TotalSize: 200 Byte(s)
 - <CudaNdarrayType(float32, vector)>, Shared Input, Shape: (32,), ElemSize: 4 Byte(s), TotalSize: 128 Byte(s)
 - b, Shared Input, Shape: (32,), ElemSize: 4 Byte(s), TotalSize: 128 Byte(s)
 - MakeVector{dtype='int64'}.0, Shape: (4,), ElemSize: 8 Byte(s), TotalSize: 32 Byte(s)
 - MakeVector{dtype='int64'}.0, Shape: (3,), ElemSize: 8 Byte(s), TotalSize: 24 Byte(s)
 - TensorConstant{1}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Subtensor{int64}.0, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Subtensor{int64}.0, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Shape_i{0}.0, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Shape_i{1}.0, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Shape_i{2}.0, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Constant{-1}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Constant{1}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - Elemwise{Composite{(inv(i0) / i1)}}.0, Shape: (1, 1), ElemSize: 8 Byte(s), TotalSize: 8 Byte(s)
 - Constant{0}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - TensorConstant{256}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - TensorConstant{32}, Shape: (), ElemSize: 8 Byte(s), TotalSize: 8.0 Byte(s)
 - TensorConstant{(1, 1) of 2.0}, Shape: (1, 1), ElemSize: 8 Byte(s), TotalSize: 8 Byte(s)
 - Constant{0.0}, Shape: (), ElemSize: 4 Byte(s), TotalSize: 4.0 Byte(s)
 - CudaNdarrayConstant{[ 0.89999998]}, Shape: (1,), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - b, Shared Input, Shape: (1,), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - <CudaNdarrayType(float32, vector)>, Shared Input, Shape: (1,), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - CudaNdarrayConstant{[[[ 0.01]]]}, Shape: (1, 1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - CudaNdarrayConstant{[[[[ 0.]]]]}, Shape: (1, 1, 1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - Constant{1.0}, Shape: (), ElemSize: 4 Byte(s), TotalSize: 4.0 Byte(s)
 - TensorConstant{0.00999999977648}, Shape: (), ElemSize: 4 Byte(s), TotalSize: 4.0 Byte(s)
 - CudaNdarrayConstant{[[[ 0.5]]]}, Shape: (1, 1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - CudaNdarrayConstant{[[ 0.89999998]]}, Shape: (1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - CudaNdarrayConstant{[ 0.01]}, Shape: (1,), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - CudaNdarrayConstant{[[ 0.5]]}, Shape: (1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - CudaNdarrayConstant{[[[ 0.89999998]]]}, Shape: (1, 1, 1), ElemSize: 4 Byte(s), TotalSize: 4 Byte(s)
 - TensorConstant{1}, Shape: (), ElemSize: 1 Byte(s), TotalSize: 1.0 Byte(s)
 - TensorConstant{0}, Shape: (), ElemSize: 1 Byte(s), TotalSize: 1.0 Byte(s)
 TotalSize: 30085190.0 Byte(s) 0.028 GB
 TotalSize inputs: 4538526.0 Byte(s) 0.004 BG

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.

In [None]:
from __future__ import print_function

import sys
import os
import time

import numpy as np
import theano
import theano.tensor as T

import lasagne




def build_cnn(input_var=None):
    # As a third model, we'll create a CNN of two convolution + pooling stages
    # and a fully-connected hidden layer in front of the output layer.

    # Input layer, as usual:
    network = lasagne.layers.InputLayer(shape=(None, 1, 260, 1),
                                        input_var=input_var)
    # This time we do not apply input dropout, as it tends to work less well
    # for convolutional layers.

    # Convolutional layer with 32 kernels of size 5x5. Strided and padded
    # convolutions are supported as well; see the docstring.
    network = lasagne.layers.Conv2DLayer(
            network, num_filters=32, filter_size=(5, 1),
            nonlinearity=lasagne.nonlinearities.rectify,
            W=lasagne.init.GlorotUniform())
    # Expert note: Lasagne provides alternative convolutional layers that
    # override Theano's choice of which implementation to use; for details
    # please see http://lasagne.readthedocs.org/en/latest/user/tutorial.html.

    # Max-pooling layer of factor 2 in both dimensions:
    network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 1))

    # Another convolution with 32 5x5 kernels, and another 2x2 pooling:
    network = lasagne.layers.Conv2DLayer(
            network, num_filters=32, filter_size=(5, 1),
            nonlinearity=lasagne.nonlinearities.rectify)
    network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 1))

    # A fully-connected layer of 256 units with 50% dropout on its inputs:
    network = lasagne.layers.DenseLayer(
            lasagne.layers.dropout(network, p=.5),
            num_units=256,
            nonlinearity=lasagne.nonlinearities.rectify)

    # And, finally, the 10-unit output layer with 50% dropout on its inputs:
    network = lasagne.layers.DenseLayer(
            lasagne.layers.dropout(network, p=.5),
            num_units=10,
            nonlinearity=lasagne.nonlinearities.softmax)

    return network


def main(model='mlp', num_epochs=500):
    # Load the dataset
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()
    print(y_train.shape)

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    if model == 'mlp':
        network = build_mlp(input_var)
    elif model.startswith('custom_mlp:'):
        depth, width, drop_in, drop_hid = model.split(':', 1)[1].split(',')
        network = build_custom_mlp(input_var, int(depth), int(width),
                                   float(drop_in), float(drop_hid))
    elif model == 'cnn':
        network = build_cnn(input_var)
    else:
        print("Unrecognized model type %r." % model)
        return

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
            loss, params, learning_rate=0.01, momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                            target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, 500, shuffle=True):
            inputs, targets = batch
            print(inputs.shape)
            print(targets)

            train_err += train_fn(inputs, targets)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))

    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatches(X_test, y_test, 500, shuffle=False):
        inputs, targets = batch
        err, acc = val_fn(inputs, targets)
        test_err += err
        test_acc += acc
        test_batches += 1
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print("  test accuracy:\t\t{:.2f} %".format(
        test_acc / test_batches * 100))

    # Optionally, you could now dump the network weights to a file like this:
    # np.savez('model.npz', *lasagne.layers.get_all_param_values(network))
    #
    # And load them again later on like this:
    # with np.load('model.npz') as f:
    #     param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    # lasagne.layers.set_all_param_values(network, param_values)


main()


In [11]:
from lasagne.layers import DenseLayer
from lasagne.layers import InputLayer
from lasagne.nonlinearities import softmax
from nolearn.lasagne import NeuralNet
import numpy as np
from sklearn.datasets import make_classification
from sklearn.datasets import make_regression
X_train, y_train, X_val, y_val, X_test, y_test = load_mega_hcad()

def classif(X, y):
    l = InputLayer(shape=(None, X.shape[1]))
    l = DenseLayer(l, num_units=len(np.unique(y)), nonlinearity=softmax)
    net = NeuralNet(l, update_learning_rate=0.01, input_shape=(None, X.shape[1]))
    net.fit(X, y)
    print(net.score(X, y))
classif(np.array(X_train), np.array(y_train))

('y train', (600000,))


TypeError: 'DenseLayer' object does not support indexing

### housing density - correlation with damage?  Apparently not.

In [None]:
density = fast_show_ratio_plot(pandas.DataFrame.from_dict({'x': -META ['pointx'],
                                             'y': META ['pointy']}), np.ones(len(META)), normalize_buckets = False)

y_shuffle = np.copy(np.array(Y_DATA['y200_mean']))
np.random.shuffle(y_shuffle)
damage = fast_show_ratio_plot(pandas.DataFrame.from_dict({'x': -META ['pointx'],
                                             'y': META ['pointy']}), np.array(Y_DATA['y200_mean']))
damage = damage[density != 0]

density = density[density != 0]
print density.flatten()
damage.flatten()
print pearsonr(density.flatten(), damage.flatten())