# Create the training set basic vector dataframe - unpadded, with calculations

## First, we set up the Spark context

In [1]:
# Set the new configuration.
# These names are imported here because this cell runs before the notebook's
# main import cell; without them a fresh kernel raises NameError.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# NOTE(review): per the Spark configuration docs, spark.driver.memory set through
# SparkConf has no effect in client mode because the driver JVM is already
# running - confirm how this notebook's driver is launched.
conf = SparkConf().setAll([
    ('spark.executor.memory', '4g'),
    ('spark.driver.memory', '4g'),
    ('spark.driver.maxResultSize', 0),
    ('spark.shuffle.service.enabled', True),
    ('spark.dynamicAllocation.enabled', True),
    # ('spark.executor.instances', 50),
    # ('spark.dynamicAllocation.maxExecutors', 90),
    ('spark.dynamicAllocation.executorIdleTimeout', 600),
    ('spark.executor.cores', 4),
    ('spark.default.parallelism', 90),
    ('spark.executor.memoryOverhead', '4g'),
    ('spark.driver.memoryOverhead', '4g'),
    ('spark.scheduler.mode', 'FAIR'),
    ('spark.kryoserializer.buffer.max', '512m'),
    ('spark.app.name', 'Creating training set vectors - JupyterHub version'),
])

# Stop the old context, if the kernel pre-created one as `sc`.
try:
    sc.stop()
except NameError:
    # No pre-existing context in this kernel; nothing to stop.
    pass

# And restart the context with the new configuration
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [2]:
import matplotlib
matplotlib.use('Agg')
%matplotlib inline

import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [3]:
import os
import os.path as osp
#import commands
import time
import random

import numpy as np

import numpy as np
from pyspark import SparkConf,SparkContext, StorageLevel
from pyspark.sql import Row, SQLContext, SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors


from datetime import datetime
import logging

# Log file name embeds the start time as hour_minute_day_month_year.
LogFile = datetime.now().strftime('Create_vectors_%H_%M_%d_%m_%Y.log')

logger = logging.getLogger('myapp')

# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise attach one more FileHandler each time and every
# message would be written repeatedly. Detach and close earlier handlers first.
for _old_handler in list(logger.handlers):
    logger.removeHandler(_old_handler)
    _old_handler.close()

hdlr = logging.FileHandler(LogFile)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.INFO)

In [4]:
import gc # manual garbage collection: call gc.collect() after a Spark collect() to limit memory leaks

In [5]:
# Record the wall-clock start time of the run.
pgm_start = time.time()
# Record the CPU start time. time.clock() was removed in Python 3.8, so use
# time.process_time() and fall back only on interpreters that lack it.
pgm_startCpu = time.process_time() if hasattr(time, 'process_time') else time.clock()

In [6]:
# Run parameters. None of these are referenced by the cells visible in this
# notebook section; presumably they are kept for the downstream training
# notebooks - TODO confirm.

# Data shaping
sequence_len = 256
limit = 1000000
augment_count = 35
use_specz = False

# Batching
batch_size, batch_size2 = 1000, 5000

# Model fitting
optimizer = 'nadam'
num_models = 1
max_epochs = 1000
valid_size = 0.1

In [7]:
# Re-create the SQLContext on the current SparkContext (the first cell already builds one the same way).
sqlContext = SQLContext(sc)

In [8]:
# Switch the session to the `plasticc` database; the queries below name their tables without a database prefix.
sqlContext.sql("use plasticc")

DataFrame[]

### Next, we create the full raw training set of feature vectors, unpadded, with calculations

The next cell builds the original structured record, with `hist` as an array of structs whose fields are arrays.

In [9]:
# Build the per-object feature-vector DataFrame from training_set and training_set_metadata.
#
# rawData CTE (one row per training_set observation):
#   * target  - the listed class codes (6, 15, ..., 99) are remapped to 0..14; any other value also maps to 14.
#   * meta    - 10-element array: four literal zeros, ddf, hostgal_specz, hostgal_photoz, mwebv,
#               a 0/1 flag for hostgal_photoz > 0, and metaVal from the `mods` subquery.
#   * specz   - hostgal_specz cast to double; not selected by the outer query.
#   * kv      - map of per-observation values:
#                 - 'mjd', 'detected' and 'received_wavelength' are the literal 0;
#                 - 'flux' and 'flux_err' are divided by mods.HistModifier;
#                 - 'fwd_int' and 'bwd_int' are both (mjd - first_value(mjd) over W) / (hostgal_photoz + 1);
#                 - 'source_wavelength' is a per-passband constant divided by (hostgal_photoz + 1) and by 1000.
#   * mods    - per-object aggregates over flux: flux_pow = log2(max - min) (not used further),
#               HistModifier = pow(2, log2(max - min)), metaVal = log2(max - min) / 10.
#   * W       - window partitioned by object_id and ordered by mjd.
#
# Outer query: groups by (object_id, target, meta) and gathers each kv entry with collect_list;
# `band` is the list of passbands and `hist` is an array holding one struct of those lists.
#
# NOTE(review): 'fwd_int' and 'bwd_int' use the identical expression - confirm that the backward
#   interval is not meant to be measured from the other end of the series.
# NOTE(review): the outer query has no ORDER BY and 'mjd' is zeroed, so the element order of the
#   collected lists depends on the order rows reach the aggregate - confirm downstream code does
#   not assume time order.
# NOTE(review): presumably collect_list drops NULLs; if flux values can be NULL (e.g. when
#   max(flux) = min(flux) for an object) the lists could end up with different lengths - verify.
vectors_df=sqlContext.sql("""
with rawData as
(
    select ts.object_id,
        case 
                when target= 6 then 0
                when target= 15 then 1
                when target= 16 then 2
                when target= 42 then 3
                when target= 52 then 4
                when target= 53 then 5
                when target= 62 then 6
                when target= 64 then 7
                when target= 65 then 8
                when target= 67 then 9
                when target= 88 then 10
                when target= 90 then 11
                when target= 92 then 12
                when target= 95 then 13
                when target= 99 then 14
                else 14
                end target,
       array(0,0,0,0,ddf,hostgal_specz, hostgal_photoz,mwebv,case when hostgal_photoz > 0 then 1 else 0 end ,metaVal) as meta,
       double(hostgal_specz) as specz,
       MAP(
            'mjd', 0,
            'passband',passband,
            'flux',flux / mods.HistModifier,
            'flux_err',flux_err / mods.HistModifier,
            'fwd_int', (mjd - first_value(mjd) over W) / (tsm.hostgal_photoz + 1),
            'bwd_int', (mjd - first_value(mjd) over W) / (tsm.hostgal_photoz + 1),
            'detected',0,
            'source_wavelength', case 
                                    when ts.passband = 0 then 357 / (tsm.hostgal_photoz + 1)/1000
                                    when ts.passband = 1 then 477 / (tsm.hostgal_photoz + 1)/1000
                                    when ts.passband = 2 then 621 / (tsm.hostgal_photoz + 1)/1000
                                    when ts.passband = 3 then 754 / (tsm.hostgal_photoz + 1)/1000
                                    when ts.passband = 4 then 871 / (tsm.hostgal_photoz + 1)/1000
                                    else 1004 / (tsm.hostgal_photoz + 1)/1000
                                    end,
            'received_wavelength', 0
    
       ) AS kv
    from training_set ts 
        inner join training_set_metadata tsm 
            on ts.object_id = tsm.object_id 
        inner join (
            select
            object_id,
            log2(max(flux)- min(flux)) flux_pow,
            pow(2,log2(max(flux)- min(flux)) ) as HistModifier,
            log2(max(flux)- min(flux))/10 as metaVal
            from training_set
            group by object_id
    
        ) mods
        on ts.object_id = mods.object_id
    WINDOW W AS (PARTITION BY ts.object_id ORDER BY mjd)
) 
select object_id, target,meta,
collect_list(int(a.kv['passband']))as band,
ARRAY(NAMED_STRUCT(
    'mjd', collect_list(float(a.kv['mjd'])) ,
    'flux', collect_list(float(a.kv['flux']))  ,
    'flux_err', collect_list(float(a.kv['flux_err']))  ,
    'detected', collect_list(int(a.kv['detected']))  ,
    'fwd_int', collect_list(float(a.kv['fwd_int']))  ,
    'bwd_int', collect_list(float(a.kv['bwd_int']))  ,
    'source_wavelength', collect_list(float(a.kv['source_wavelength']))  ,
    'received_wavelength', collect_list(float(a.kv['received_wavelength']))  
)) as hist
from rawData a
group by object_id, target,meta
""")

#### Display the schema for the feature vectors

In [10]:
# Print the column names, types and nullability of vectors_df as a tree.
vectors_df.printSchema()

root
 |-- object_id: integer (nullable = true)
 |-- target: integer (nullable = false)
 |-- meta: array (nullable = false)
 |    |-- element: double (containsNull = true)
 |-- band: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- hist: array (nullable = false)
 |    |-- element: struct (containsNull = false)
 |    |    |-- mjd: array (nullable = true)
 |    |    |    |-- element: float (containsNull = true)
 |    |    |-- flux: array (nullable = true)
 |    |    |    |-- element: float (containsNull = true)
 |    |    |-- flux_err: array (nullable = true)
 |    |    |    |-- element: float (containsNull = true)
 |    |    |-- detected: array (nullable = true)
 |    |    |    |-- element: integer (containsNull = true)
 |    |    |-- fwd_int: array (nullable = true)
 |    |    |    |-- element: float (containsNull = true)
 |    |    |-- bwd_int: array (nullable = true)
 |    |    |    |-- element: float (containsNull = true)
 |    |    |-- source_wavelength:

In [11]:
# Print the physical plan Spark will use to compute vectors_df.
vectors_df.explain()

== Physical Plan ==
ObjectHashAggregate(keys=[object_id#9, target#5, meta#6], functions=[collect_list(cast(kv#8[passband] as int), 0, 0), collect_list(cast(kv#8[mjd] as float), 0, 0), collect_list(cast(kv#8[flux] as float), 0, 0), collect_list(cast(kv#8[flux_err] as float), 0, 0), collect_list(cast(kv#8[detected] as int), 0, 0), collect_list(cast(kv#8[fwd_int] as float), 0, 0), collect_list(cast(kv#8[bwd_int] as float), 0, 0), collect_list(cast(kv#8[source_wavelength] as float), 0, 0), collect_list(cast(kv#8[received_wavelength] as float), 0, 0)])
+- ObjectHashAggregate(keys=[object_id#9, target#5, meta#6], functions=[partial_collect_list(cast(kv#8[passband] as int), 0, 0), partial_collect_list(cast(kv#8[mjd] as float), 0, 0), partial_collect_list(cast(kv#8[flux] as float), 0, 0), partial_collect_list(cast(kv#8[flux_err] as float), 0, 0), partial_collect_list(cast(kv#8[detected] as int), 0, 0), partial_collect_list(cast(kv#8[fwd_int] as float), 0, 0), partial_collect_list(cast(kv#8[bwd

## And finally, we create the feature vector table in Hive

In [12]:
# Destination settings for persisting the feature vectors.
# Other table names noted by the author:
#   'training_set_flat_vectors' - this is the flattened model for the elephas sequential test
#   'training_set_vectors'      - original full hist ARRAY - STRUCT - ARRAY
TABLE = 'training_set_raw_vectors_unpadded_with_calcs'
FORMAT = 'parquet'
MODE = 'overwrite'

# Write vectors_df out as a saved table, using the format and save mode chosen above.
vectors_df.write.saveAsTable(TABLE, format=FORMAT, mode=MODE)
