# Create the full feature vector dataframe 
## Optimised to use broadcast joins and to avoid data spills to disk in the executors

### First, we set up the Spark context

In [1]:
# Bring the Spark entry points into scope in this cell: it runs before the
# notebook's main import cell, so on a fresh kernel the names may not exist.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# Set the new configuration.
# NOTE(review): the Spark configuration docs state that spark.driver.memory
# must not be set through SparkConf in client mode because the driver JVM has
# already started - confirm how this kernel launches its driver.
conf = SparkConf().setAll([
    ('spark.executor.memory', '6g'),
    ('spark.driver.memory', '10g'),
    ('spark.driver.maxResultSize', 0),
    ('spark.shuffle.service.enabled', True),
    ('spark.dynamicAllocation.enabled', True),
    # ('spark.executor.instances', 50),
    # ('spark.dynamicAllocation.maxExecutors', 90),
    ('spark.dynamicAllocation.executorIdleTimeout', 600),
    ('spark.sql.autoBroadcastJoinThreshold', 52428800),  # 50 MiB in bytes
    ('spark.executor.cores', 4),
    ('spark.default.parallelism', 90),
    ('spark.executor.memoryOverhead', '4g'),
    ('spark.driver.memoryOverhead', '4g'),
    ('spark.scheduler.mode', 'FAIR'),
    ('spark.kryoserializer.buffer.max', '512m'),
    ('spark.app.name', 'FullVectorTesting - JupyterHub version'),
])

# Stop the old context, if the kernel provided one. Guarding the call lets the
# cell run on a kernel that does not pre-create `sc`.
if 'sc' in globals():
    sc.stop()

# And restart the context with the new configuration
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [2]:
import matplotlib
matplotlib.use('Agg')
%matplotlib inline

import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [3]:
import os
import os.path as osp
#import commands
import time
import random

import numpy as np

import numpy as np
from pyspark import SparkConf,SparkContext, StorageLevel
from pyspark.sql import Row, SQLContext, SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors


from datetime import datetime
import logging

# Log file name carries the start time of the run (hour_minute_day_month_year).
LogFile = datetime.now().strftime('LoadD1_Pictures_%H_%M_%d_%m_%Y.log')

logger = logging.getLogger('myapp')

# logging.getLogger returns the same logger object for the same name, so
# re-running this cell would stack another FileHandler and every message would
# be written once per run. Detach and close file handlers left by earlier runs.
for _old_handler in [h for h in logger.handlers if isinstance(h, logging.FileHandler)]:
    logger.removeHandler(_old_handler)
    _old_handler.close()

hdlr = logging.FileHandler(LogFile)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.INFO)

In [4]:
import gc # manual garbage collection: call gc.collect() to stop leaks after collect()
import pandas as pd

In [5]:
#from keras.layers import *
#from keras.models import Model, load_model
#from keras.optimizers import Adam, Nadam, SGD
#from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.utils import to_categorical
#from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [6]:
# Start-of-run timestamps: wall-clock seconds and process CPU seconds.
# time.clock() was deprecated in Python 3.3 and removed in 3.8;
# time.process_time() is the documented replacement for CPU-time measurement.
pgm_start = time.time()
pgm_startCpu = time.process_time()

In [7]:
# Target class ids, stored as int32.
classes = np.array(
    [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95, 99],
    dtype='int32',
)

# Display names ('class_<id>') and per-class weights, both derived from the id
# list. Classes 15 and 64 carry weight 2; every other class carries weight 1.
class_names = ['class_{}'.format(class_id) for class_id in classes.tolist()]
class_weight = {
    class_id: (2 if class_id in (15, 64) else 1)
    for class_id in classes.tolist()
}

# LSST passbands (nm), in the order u, g, r, i, z, y
passbands = np.array([357, 477, 621, 754, 871, 1004], dtype='float32')

In [8]:
# Run settings. Only sequence_len is referenced in the visible part of the
# notebook (for reshaping the collected arrays); the rest are kept as declared.

# Data volume / shape
limit = 10 ** 6
sequence_len = 256
augment_count = 35

# Batching
batch_size, batch_size2 = 1000, 5000

# Model / training options
optimizer = 'nadam'
num_models = 1
max_epochs = 1000
valid_size = 0.1
use_specz = False

In [9]:
# Re-create the SQL entry point on the current SparkContext.
# NOTE(review): the Spark set-up cell already assigns sqlContext the same way,
# so this looks redundant - confirm before removing.
sqlContext = SQLContext(sc)

In [10]:
# Make `plasticc` the current database so later queries can use bare table names.
sqlContext.sql("use plasticc")

DataFrame[]

In [11]:
## Load the augmented vector training set and keep it cached.
# persist() is called without arguments, so the default storage level applies;
# it returns the same DataFrame, which is why it can be re-bound in place.
vectorTable = "training_set_augmented_vectors"
trainingVectorsDF = sqlContext.sql("select * from {}".format(vectorTable))
trainingVectorsDF = trainingVectorsDF.persist()

## get some data on the dataframe

In [12]:
from pyspark.serializers import PickleSerializer, AutoBatchedSerializer
def _to_java_object_rdd(rdd):
    """Convert a Python RDD into a JavaRDD of Java objects.

    The RDD is first re-serialized with an auto-batched pickle serializer, so
    the conversion works whether or not the input was serialized in batches;
    the JVM-side SerDe helper (Pyrolite) then unpickles each element.

    Relies on private PySpark attributes (``_reserialize``, ``_jrdd``, ``_jvm``).
    """
    pickled_rdd = rdd._reserialize(AutoBatchedSerializer(PickleSerializer()))
    serde = pickled_rdd.ctx._jvm.org.apache.spark.mllib.api.python.SerDe
    return serde.pythonToJava(pickled_rdd._jrdd, True)


In [13]:
# Ask Spark's JVM-side SizeEstimator for a size and show it in units of 10**6 bytes.
# NOTE(review): the estimator receives the JavaRDD handle; whether the figure
# reflects the full data volume should be confirmed.
JavaObj = _to_java_object_rdd(trainingVectorsDF.rdd)
nbytes = sc._jvm.org.apache.spark.util.SizeEstimator.estimate(JavaObj)
nbytes / 1e6

83.763792

In [14]:
# Print the column names and (nested) types of the loaded table.
trainingVectorsDF.printSchema()

root
 |-- object_id: integer (nullable = true)
 |-- meta: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- target: integer (nullable = true)
 |-- specz: double (nullable = true)
 |-- band: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: double (containsNull = true)
 |-- hist: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- interval: array (nullable = true)
 |    |    |    |-- element: array (containsNull = true)
 |    |    |    |    |-- element: double (containsNull = true)
 |    |    |-- deltaMjd: array (nullable = true)
 |    |    |    |-- element: array (containsNull = true)
 |    |    |    |    |-- element: double (containsNull = true)
 |    |    |-- rval: array (nullable = true)
 |    |    |    |-- element: array (containsNull = true)
 |    |    |    |    |-- element: double (containsNull = true)
 |    |    |-- flux: array (nullable = true)
 |    |    |    |-- element:

In [15]:
# Print the physical plan; an InMemoryTableScan at the top shows the persisted
# copy of the table is being read.
trainingVectorsDF.explain()

== Physical Plan ==
InMemoryTableScan [object_id#0, meta#1, target#2, specz#3, band#4, hist#5]
   +- InMemoryRelation [object_id#0, meta#1, target#2, specz#3, band#4, hist#5], true, 10000, StorageLevel(disk, memory, 1 replicas)
         +- *(1) FileScan parquet plasticc.training_set_augmented_vectors[object_id#0,meta#1,target#2,specz#3,band#4,hist#5] Batched: false, Format: Parquet, Location: InMemoryFileIndex[hdfs://athena-1.nimbus.pawsey.org.au:8020/user/hive/warehouse/plasticc.db/train..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<object_id:int,meta:array<double>,target:int,specz:double,band:array<array<double>>,hist:ar...


## Set up the training, test and validation splits
We'll do this on the metadata table as we use this table for the master joins

In [16]:
# 80 / 10 / 10 split into training, validation and test dataframes.
# A fixed seed is passed so the split uses the same random sequence each run.
seed = 42  # seed=0L
weights = [0.8, 0.1, 0.1]
train_df, validation_df, test_df = trainingVectorsDF.randomSplit(weights, seed=seed)

In [17]:
# Collect the object ids of the training split to the driver.
# The schema declares object_id as an integer, so keep an integer dtype:
# float32 cannot represent integers above 2**24 exactly, which would silently
# round large ids.
# NOTE(review): assumes no null object_id values (the column is nullable) -
# a null would now raise instead of becoming NaN.
idArr = np.array(train_df.select('object_id').collect(), dtype='int32')

In [18]:
# Record the number of collected rows and flatten the id column to 1-D.
# ndarray.reshape returns a new array and leaves the original untouched, so the
# result has to be assigned back for the flattening to take effect.
# NOTE(review): X['id'] therefore has shape (r,) instead of (r, 1) - confirm
# that downstream consumers expect the flattened form.
idArr = idArr.reshape(-1, 1)  # normalise first so re-running this cell still works
r, c = idArr.shape
idArr = idArr.reshape(r,)

# Number of values per row used when reshaping the 'meta' column.
# assumes every 'meta' array holds exactly 10 values - TODO confirm
meta_len = 10

In [19]:
def _collect_column(df, column, shape, dtype='float32'):
    """Collect one column of ``df`` to the driver as a NumPy array.

    Args:
        df: Spark DataFrame to read from.
        column: column (or nested field) name passed to ``select``.
        shape: target shape handed to ``ndarray.reshape``.
        dtype: NumPy dtype of the resulting array.

    Returns:
        The collected values converted to ``dtype`` and reshaped to ``shape``.
        ``reshape`` raises ValueError if the element count does not match.
    """
    return np.array(df.select(column).collect(), dtype=dtype).reshape(shape)


# Every per-observation field is reshaped to (rows, sequence_len).
_seq_shape = (r, sequence_len)

metaArr = _collect_column(train_df, 'meta', (r, meta_len))
bandArr = _collect_column(train_df, 'band', _seq_shape, dtype='int32')

# Eight channels per time step; channels that are never filled stay at zero.
# this will work brilliantly as get_keras_data sets three columns to zeros anyway
histArray = np.zeros((r, sequence_len, 8), dtype='float32')

# hist.interval, hist.detected and hist.received_wavelength are deliberately
# not collected (they were commented out in the original cell).
deltaMjd = _collect_column(train_df, 'hist.deltaMjd', _seq_shape)
rval = _collect_column(train_df, 'hist.rval', _seq_shape)
fluxTest = _collect_column(train_df, 'hist.flux', _seq_shape)
flux_err_test = _collect_column(train_df, 'hist.flux_err', _seq_shape)
source_wavelength = _collect_column(train_df, 'hist.source_wavelength', _seq_shape)


#### As per the baseline program, we remove the absolute time, detected and received_wavelength data

In [20]:

# Fill the per-time-step channels of histArray (last axis).
# Channels 0 (mjdInt), 3 (detected) and 7 (received_wavelength) are not
# assigned here and keep their initial values.
histArray[..., 1] = fluxTest
histArray[..., 2] = flux_err_test
histArray[..., 4] = deltaMjd
histArray[..., 5] = rval
histArray[..., 6] = source_wavelength

In [21]:
# Bundle the collected arrays into the final feature dictionary, keyed by input name.
X = dict(
    id=idArr,
    meta=metaArr,
    band=bandArr,
    hist=histArray,
)

In [22]:
# One-hot encode the training targets with one column per entry in `classes`.
# NOTE(review): this presumably relies on 'target' already holding indices in
# 0..len(classes)-1 rather than the raw class ids (6..99) - verify against the
# table that produced training_set_augmented_vectors.
Y = to_categorical(np.array(train_df.select('target').collect(), dtype='int32'), num_classes=len(classes))

In [23]:
# Elapsed wall-clock and process CPU seconds since the start-of-run timestamps.
# time.clock() was deprecated in Python 3.3 and removed in 3.8;
# time.process_time() is the documented replacement for CPU-time measurement.
pgm_elapsed = time.time() - pgm_start
pgm_elapsedCpu = time.process_time() - pgm_startCpu

In [24]:
# Report elapsed wall-clock seconds, then elapsed CPU seconds, one per line.
print(pgm_elapsed, pgm_elapsedCpu, sep='\n')

254.39227175712585
124.59657700000001


# Split testing below. You can delete this section once we have worked it out

https://stackoverflow.com/questions/37077432/how-to-estimate-dataframe-real-size-in-pyspark