# Create the full test vector dataframe using Spark temp tables

## First, we set up the Spark context

In [1]:
# Build the Spark configuration for this job and restart the context with it.
#
# The pyspark names are imported here so this cell runs on a fresh kernel
# without depending on the import cell further down.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# Set the new configuration.
# NOTE(review): spark.driver.memory is set here after the kernel's driver JVM
# may already be running - confirm it takes effect in this deployment.
conf = SparkConf().setAll([
    ('spark.executor.memory', '4g'),
    ('spark.driver.memory', '4g'),
    ('spark.driver.maxResultSize', 0),
    ('spark.shuffle.service.enabled', True),
    ('spark.dynamicAllocation.enabled', True),
    # Disabled options, kept for reference:
    # ('spark.executor.instances', 50),
    # ('spark.dynamicAllocation.maxExecutors', 90),
    ('spark.dynamicAllocation.executorIdleTimeout', 600),
    ('spark.executor.cores', 4),
    ('spark.default.parallelism', 90),
    ('spark.executor.memoryOverhead', '4g'),
    ('spark.driver.memoryOverhead', '4g'),
    ('spark.scheduler.mode', 'FAIR'),
    ('spark.kryoserializer.buffer.max', '512m'),
    ('spark.app.name', 'Creating training set vectors - JupyterHub version'),
])

# Stop the old context. getOrCreate() returns the currently active context
# (the kernel-provided `sc` when there is one), so this no longer raises a
# NameError when the kernel did not pre-define `sc`.
SparkContext.getOrCreate().stop()

# And restart the context with the new configuration
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [2]:
import matplotlib
matplotlib.use('Agg')
%matplotlib inline

import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [3]:
import os
import os.path as osp
#import commands
import time
import random

import numpy as np

import numpy as np
from pyspark import SparkConf,SparkContext, StorageLevel
from pyspark.sql import Row, SQLContext, SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors


from datetime import datetime
import logging

# Log file name carries the start time: hour, minute, day, month, year.
LogFile=datetime.now().strftime('Create_vectors_%H_%M_%d_%m_%Y.log')

logger = logging.getLogger('myapp')

# logging.getLogger returns the same logger object on every call, so
# re-running this cell used to stack one more FileHandler each time
# (duplicated log lines, leaked file handles). Drop and close any handler
# left over from a previous run before attaching the new one.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

hdlr = logging.FileHandler(LogFile)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.INFO)

In [4]:
import gc # manual garbag collection to stop leaks on Collect() gc.collect()

In [5]:
# Wall-clock and CPU-time reference points for timing the whole notebook run.
pgm_start=time.time()
# time.clock() was removed in Python 3.8; process_time() is the supported
# way to read the CPU time consumed by this process.
pgm_startCpu=time.process_time()

In [6]:
# --- Sequence / padding settings ---
# Number of rows every object is padded to (see "Create the padded training set").
sequence_len = 256
limit = 1000000

# --- Model / training settings ---
# These are not referenced by the cells visible in this notebook section;
# presumably consumed by the downstream training code - TODO confirm.
augment_count = 35
batch_size, batch_size2 = 1000, 5000
optimizer = 'nadam'
num_models = 1
use_specz = False
valid_size = 0.1
max_epochs = 1000

In [7]:
# Bind sqlContext to a SQLContext wrapping the current SparkContext `sc`.
# (The first cell already does this after restarting the context; this
# re-creates it.)
sqlContext = SQLContext(sc)

In [8]:
# Make `plasticc` the current database so the unqualified table names used
# in the SQL below resolve against it.
sqlContext.sql("use plasticc")

DataFrame[]

#### Refer to this great article, SQL at Scale with Apache Spark SQL and DataFrames

https://towardsdatascience.com/sql-at-scale-with-apache-spark-sql-and-dataframes-concepts-architecture-and-examples-c567853a702f

#### Benchmark snippets that may be useful
https://community.cloudera.com/t5/Community-Articles/Spark-RDDs-vs-DataFrames-vs-SparkSQL/ta-p/246547

## Create the padded training set
We do this to create a standard set of features for all objects; in this case, the value provided by sequence_len (which in this case is 256).

We're also going to use Spark registered tables to do this.

In [9]:
# Source table names: the observation rows and the per-object metadata.
training_set, training_metadata = "training_set", "training_set_metadata"

This is the SQL equivalent of the keras PAD_SEQUENCES function. 

We create a baseline table consisting of object_ids and a 0 value for every feature, as well as a row number. This is accomplished by a cartesian join (cross join in Hive) between the cnt nested table and the objects nested table, resulting in the baseline nested table, which has 256 records for each object with all features set to zero.

Next, we create a train_set nested table containing all the training set information, with rows numbered by the mjd value in descending order. This pads the data from the last value to the first, as per PAD_SEQUENCES.

We then create the padded training set as a left join from the baseline nested table to the train_set nested table, padded to a consistent 256 values for each feature.

In [10]:
# SQL equivalent of the Keras pad_sequences step.
#
#   cnt       - the row numbers 1..sequence_len, generated by numbering rows of
#               the metadata table (so that table must hold at least
#               sequence_len rows)
#   objects   - one row per object_id with zero-valued pad columns
#   baseline  - objects x cnt: sequence_len all-zero rows for every object
#   train_set - the observations, numbered per object by mjd descending
#   paddedRev - baseline left-joined to train_set on (object_id, rownum);
#               where no observation matches, the zero pad values are used
#
# Fix: the `flux` column previously selected train_set.mjd, so every real flux
# value was overwritten with its MJD. It now selects train_set.flux.
# The pad length comes from sequence_len instead of a hard-coded 256.
paddedSQL="""
with
cnt
as
(
    select rownum from
    (
        select row_number() over (ORDER BY object_id) as rownum
        from {metadata}
    ) a
    where rownum <= {seq_len}
),
objects as (select object_id, 0 padMJD, 0 padPassband,0 padFlux, 0 padFlux_err,0 padDetected from {observations} group by object_id),
baseline as (select * from objects CROSS JOIN cnt ), -- cartesian product with {seq_len} values to use as the baseline
train_set as (select *, row_number() over (partition by object_id order by mjd desc) as rownum from {observations}),
paddedRev as (
    select baseline.object_id, --train_set.mjd, baseline.padMJD,
    case when train_set.mjd is null then baseline.padMJD else train_set.mjd end mjd,
    case when train_set.passband is null then baseline.padPassband else train_set.passband end passband,
    case when train_set.flux is null then baseline.padFlux else train_set.flux end flux,
    case when train_set.flux_err is null then baseline.padFlux_err else train_set.flux_err end flux_err,
    case when train_set.detected is null then baseline.padDetected else train_set.detected end detected
    from baseline left outer join train_set on baseline.object_id = train_set.object_id and baseline.rownum=train_set.rownum
    order by baseline.object_id, mjd desc
)
select object_id, mjd, passband, flux, flux_err, detected from paddedRev order by object_id, mjd
""".format(metadata=training_metadata,
           observations=training_set,
           seq_len=int(sequence_len))

Now we create the padded training data dataframe and create a Spark temporary table

In [11]:
# Run the padding query and expose the result to later SQL as the temporary
# view PADDED_TRAINING_SET.
paddedTrainingSet_DF = sqlContext.sql(paddedSQL)
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable and registers the view in the same way.
paddedTrainingSet_DF.createOrReplaceTempView("PADDED_TRAINING_SET")

Get the training set metadata

In [12]:
# Load every column of the metadata table and expose it to later SQL as the
# temporary view TRAINING_SET_METADATA.
metadataSQL="""select * from {}""".format(training_metadata)
metadata_DF = sqlContext.sql(metadataSQL)
# createOrReplaceTempView replaces the deprecated registerTempTable.
metadata_DF.createOrReplaceTempView("TRAINING_SET_METADATA")

### So, now we have the padded training set and the training set metadata

#### Create the training set padded feature vectors dataframe and instantiate it as a hive table.

Points to note.

- The following SQL could all be incorporated into one statement, but in the interests of clarity we have broken it down into its relevant component parts.
- Calculating the MJD intervals utilises SQL WINDOW functionality. This needs to be used carefully, because window functionality will cause Spark to perform a hash sort, which is a very expensive operation and on larger datasets can cause a spill to disk, which is to be avoided if at all possible.
- Two separate tables need to be utilised for these sorts, because Hive does not support a WINDOW statement on the same field (mjd) with different ORDER BY clauses in the OVER (PARTITION BY ... ORDER BY ...) WINDOW. Hence, we have the CTE1 and CTE2 tables for this.

#### next, we create the modifiers for the flux values and the flux value to be added to the metadata table

This implements the code from lines 282 - 287 in the original program, but as a set operation (not iteratively).

        flux_max = np.max(flux)
        flux_min = np.min(flux)
        flux_pow = math.log2(flux_max - flux_min)
        sample['hist'][:,1] /= math.pow(2, flux_pow)
        sample['hist'][:,2] /= math.pow(2, flux_pow)
        sample['meta'][9] = flux_pow / 10


In [13]:
# Per-object flux scaling values, computed as one grouped query:
#   flux_pow     = log2(max(flux) - min(flux))
#   HistModifier = 2 ^ flux_pow   (divisor applied to flux and flux_err later)
#   metaVal      = flux_pow / 10  (value added to the object metadata)
#
# The source table now comes from the `training_set` variable, like the other
# queries in this notebook, instead of a hard-coded table name.
# NOTE(review): an object whose flux never varies gives max - min = 0; confirm
# how log2(0) should be handled for such objects.
modifiersSQL="""
select
object_id,
log2(max(flux)- min(flux)) flux_pow,
pow(2,log2(max(flux)- min(flux)) ) as HistModifier,
log2(max(flux)- min(flux))/10 as metaVal
from {}
group by object_id
""".format(training_set)
modifiersDF=sqlContext.sql(modifiersSQL)

# Create the in memory table
# (createOrReplaceTempView replaces the deprecated registerTempTable).
modifiersDF.createOrReplaceTempView("MODIFIERS")


How these modifiers are applied: we join the modifiers table to the padded training set (lines 15-17 of the next cell) and run the calculation in the select statement (lines 11 and 12).

In [14]:
# Get the basic data from the padded training_set we created earlier, "PADDED_TRAINING_SET"
#
# For each object (window w: partition by object_id, ordered by mjd ascending)
# the query selects:
#   mjdInt         - mjd minus the first mjd in the window
#   deltaMjd       - mjd minus the previous row's mjd (0 on the first row)
#   flux, flux_err - divided by HistModifier from the MODIFIERS view
#   rownum         - position of the row in ascending-mjd order
# NOTE(review): padded rows carry mjd = 0, so they sort first and supply
# first_value(mjd); confirm that measuring mjdInt / deltaMjd from 0 is the
# intended behaviour for objects that needed padding.
CTE1_sql="""
        select ts.object_id, mjd,
        mjd - first_value(mjd) over w as mjdInt,
        case when lag(mjd) OVER w is null then
            0
        else
            mjd - lag(mjd) over w 
        end as deltaMjd,
        passband,
        flux / HistModifier as flux,
        flux_err /HistModifier as flux_err,
        detected,
        row_number() OVER w as rownum
        from {} ts
            INNER JOIN {} mods
                ON ts.object_id = mods.object_id
        WINDOW w AS (PARTITION BY ts.object_id ORDER BY mjd)
""".format("PADDED_TRAINING_SET", "MODIFIERS")

# Building the DataFrame only defines the query; nothing is executed yet.
CTE1_df=sqlContext.sql(CTE1_sql)

In [15]:
# Create an in memory table
# (createOrReplaceTempView replaces the deprecated registerTempTable).
CTE1_df.createOrReplaceTempView("CTE1")

In [16]:
# rval: the object's last (maximum) mjd minus this row's mjd.
#
# Fix: rownum is now numbered in ASCENDING mjd order. The later join pairs
# CTE1 and CTE2 on (object_id, rownum) and CTE1 numbers its rows ascending;
# numbering this table descending attached each row's rval to a different
# observation (the earliest row received the rval of the latest one).
# Rows that share an mjd also share an rval, so ties cannot mis-pair values.
CTE2_sql="""
        select object_id,
        max(mjd) OVER x - mjd as rval,
        row_number() OVER y as rownum
        from {}
        WINDOW x AS (PARTITION BY object_id),
               y AS (PARTITION BY object_id ORDER BY mjd)
""".format("PADDED_TRAINING_SET")

CTE2_df=sqlContext.sql(CTE2_sql)

In [17]:
# Create an in memory table
# (createOrReplaceTempView replaces the deprecated registerTempTable).
CTE2_df.createOrReplaceTempView("CTE2")

#### So in here we include the calculated fields for the metadata as well.

We do need to add modifier values based on minimum and maximum flux values for each observed object - this is the metaVal field from the MODIFIERS table

In [18]:
# Create the metadata we need for the feature vectors
#
# Joins the metadata view to MODIFIERS on object_id and adds:
#   photoz_positive - 1 when hostgal_photoz > 0, otherwise 0
#   mapped_target   - the target class codes remapped to indices 0..14
#                     (any unlisted code falls into index 14)
#   metaVal         - the per-object value from MODIFIERS
meta_sql="""
        select meta.object_id, gal_l, gal_b, ddf, hostgal_specz, hostgal_photoz, hostgal_photoz_err, mwebv,target,
        case when hostgal_photoz > 0 
            then 1  -- CAST(1 AS BOOLEAN)
            else 0 --CAST(0 AS BOOLEAN)
            end as photoz_positive,
        --6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95, 99
        case 
            when target= 6 then 0
            when target= 15 then 1
            when target= 16 then 2
            when target= 42 then 3
            when target= 52 then 4
            when target= 53 then 5
            when target= 62 then 6
            when target= 64 then 7
            when target= 65 then 8
            when target= 67 then 9
            when target= 88 then 10
            when target= 90 then 11
            when target= 92 then 12
            when target= 95 then 13
            when target= 99 then 14
            else 14
            end mapped_target,
            metaVal

        from {} meta
            INNER JOIN {} mods
                ON meta.object_id = mods.object_id

""".format("TRAINING_SET_METADATA", "MODIFIERS")

meta_df=sqlContext.sql(meta_sql)
# createOrReplaceTempView replaces the deprecated registerTempTable.
meta_df.createOrReplaceTempView("meta")

### Create the intermediate table

This table sets up the arrays for the object metadata, spaced out to ten elements, and creates the key-value pairs for the mjd, passband, flux, etc. arrays that will be created in the next step.


In [19]:
# Intermediate per-observation record.
#
# Joins CTE1 to CTE2 on (object_id, rownum) and to meta on object_id, giving
# one row per padded observation with:
#   target - the mapped class index
#   meta   - a ten-element array: four zeros, then ddf, hostgal_specz,
#            hostgal_photoz, mwebv, photoz_positive, metaVal
#   specz  - hostgal_specz as a double
#   kv     - a map of the per-observation values, plus a wavelength looked up
#            from the passband (received) and that wavelength divided by
#            (hostgal_photoz + 1) (source)
struct_sql="""
        select CTE1.object_id,mapped_target as target,
        array(0,0,0,0,ddf,hostgal_specz, hostgal_photoz,mwebv,photoz_positive,metaVal) as meta,
        double(hostgal_specz) as specz,
        MAP(
            'interval', mjdInt,
            'deltaMjd', deltaMjd,
            'passband',passband,
            'rval', rval,
            'flux',flux,
            'flux_err',flux_err,
            'detected',detected,
            'received_wavelength', case 
                                    when CTE1.passband = 0 then 357/1000
                                    when CTE1.passband = 1 then 477/1000
                                    when CTE1.passband = 2 then 621/1000
                                    when CTE1.passband = 3 then 754/1000
                                    when CTE1.passband = 4 then 871/1000
                                    else  1004/1000
                                    end,
            'source_wavelength', case 
                                    when CTE1.passband = 0 then 357 / (meta.hostgal_photoz + 1)/1000
                                    when CTE1.passband = 1 then 477 / (meta.hostgal_photoz + 1)/1000
                                    when CTE1.passband = 2 then 621 / (meta.hostgal_photoz + 1)/1000
                                    when CTE1.passband = 3 then 754 / (meta.hostgal_photoz + 1)/1000
                                    when CTE1.passband = 4 then 871 / (meta.hostgal_photoz + 1)/1000
                                    else 1004 / (meta.hostgal_photoz + 1)/1000
                                    end
        
        ) AS kv
        from CTE1 
            inner join CTE2
                on CTE1.object_id=CTE2.object_id
                and CTE1. rownum=CTE2.rownum
            inner join meta
                on CTE1.object_id = meta.object_id
"""

struct_df=sqlContext.sql(struct_sql)
# createOrReplaceTempView replaces the deprecated registerTempTable.
struct_df.createOrReplaceTempView("struct")

### And finally, we create the full training set of feature vectors, padded to 256 elements.

Next cell illustrates the original structured record with hist as an array of arrays.

And the next cell illustrates the simplified structure, for ease of creating assembled feature vectors for Elephas.

This one is an experiment for the Keras Elephas pipeline, using the custom model from the plasticc_rnn code.

In [21]:
# Collapse the per-observation rows of `struct` into one row per object
# (grouped by object_id, meta, target, specz):
#   band - list of the passband values, cast to int
#   hist - a single-element array holding a struct whose fields are the
#          per-feature lists collected from the kv map
# NOTE(review): collect_list gives no ordering guarantee after a shuffle -
# confirm the lists come out in mjd order, or carry a sort key through
# `struct` and sort explicitly.
getVectorsSql="""
select object_id,meta,target,specz,
collect_list(int(a.kv['passband']))as band,
ARRAY(NAMED_STRUCT(
    'interval',             collect_list(float(a.kv['interval'])) ,
    'deltaMjd',             collect_list(float(a.kv['deltaMjd'])) ,
    'rval',                 collect_list(float(a.kv['rval'])) ,
    'flux',                 collect_list(float(a.kv['flux'])) ,
    'flux_err',             collect_list(float(a.kv['flux_err'])) ,
    'detected',             collect_list(int(a.kv['detected'])) ,
    'source_wavelength',    collect_list(float(a.kv['source_wavelength'])) ,
    'received_wavelength',  collect_list(float(a.kv['received_wavelength'])) 
    )
) as hist
from struct a
group by object_id, meta,target,specz
"""

# Caching / persisting is left disabled (see the commented-out calls).
vectors_df=sqlContext.sql(getVectorsSql)#.cache()   #.persist(StorageLevel.MEMORY_ONLY_SER_2)

#### Display the schema for the feature vectors

In [22]:
# Print the column names and nested types of the feature-vector DataFrame.
vectors_df.printSchema()

root
 |-- object_id: integer (nullable = true)
 |-- meta: array (nullable = false)
 |    |-- element: double (containsNull = true)
 |-- target: integer (nullable = false)
 |-- specz: double (nullable = true)
 |-- band: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- hist: array (nullable = false)
 |    |-- element: struct (containsNull = false)
 |    |    |-- interval: array (nullable = true)
 |    |    |    |-- element: float (containsNull = true)
 |    |    |-- deltaMjd: array (nullable = true)
 |    |    |    |-- element: float (containsNull = true)
 |    |    |-- rval: array (nullable = true)
 |    |    |    |-- element: float (containsNull = true)
 |    |    |-- flux: array (nullable = true)
 |    |    |    |-- element: float (containsNull = true)
 |    |    |-- flux_err: array (nullable = true)
 |    |    |    |-- element: float (containsNull = true)
 |    |    |-- detected: array (nullable = true)
 |    |    |    |-- element: integer (containsNull =

In [23]:
# Print the physical plan Spark will use to build vectors_df.
vectors_df.explain()

== Physical Plan ==
ObjectHashAggregate(keys=[object_id#25, meta#129, target#128, specz#130], functions=[collect_list(cast(kv#131[passband] as int), 0, 0), collect_list(cast(kv#131[interval] as float), 0, 0), collect_list(cast(kv#131[deltaMjd] as float), 0, 0), collect_list(cast(kv#131[rval] as float), 0, 0), collect_list(cast(kv#131[flux] as float), 0, 0), collect_list(cast(kv#131[flux_err] as float), 0, 0), collect_list(cast(kv#131[detected] as int), 0, 0), collect_list(cast(kv#131[source_wavelength] as float), 0, 0), collect_list(cast(kv#131[received_wavelength] as float), 0, 0)])
+- ObjectHashAggregate(keys=[object_id#25, meta#129, target#128, specz#130], functions=[partial_collect_list(cast(kv#131[passband] as int), 0, 0), partial_collect_list(cast(kv#131[interval] as float), 0, 0), partial_collect_list(cast(kv#131[deltaMjd] as float), 0, 0), partial_collect_list(cast(kv#131[rval] as float), 0, 0), partial_collect_list(cast(kv#131[flux] as float), 0, 0), partial_collect_list(cast(

## And finally, we create the feature vector table in hive

In [25]:
# Destination table for the feature vectors.
TABLE='training_set_vectors' #- original full hist ARRAY - STRUCT - ARRAY
# Alternative destinations, currently disabled:
#TABLE='training_set_flat_vectors' - this is the flattened model for the elephas sequential test
#TABLE='training_set_elephas'

# Storage format and save mode for the write.
FORMAT='parquet'
MODE='overwrite'

# Write vectors_df out as a table using the mode and format above.
(
    vectors_df.write
    .mode(MODE)
    .format(FORMAT)
    .saveAsTable(TABLE)
)
