# Create the training set basic vector dataframe - unpadded, no calculations

## First, we set up the Spark context

In [1]:
# Rebuild the Spark context with a notebook-specific configuration.
#
# The PySpark names are imported here (not only in the later import cell) so
# that this cell also runs on a fresh kernel that has not pre-imported them.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# Set the new configuration
conf = SparkConf().setAll([
    ('spark.executor.memory', '4g'),
    # NOTE(review): Spark's configuration docs state that in client mode
    # spark.driver.memory cannot be set through SparkConf from inside the
    # application, because the driver JVM is already running -- confirm this
    # value actually takes effect in this deployment.
    ('spark.driver.memory', '4g'),
    ('spark.driver.maxResultSize', 0),
    ('spark.shuffle.service.enabled', True),
    ('spark.dynamicAllocation.enabled', True),
    # Disabled alternatives, kept for reference:
    #('spark.executor.instances', 50),
    #('spark.dynamicAllocation.maxExecutors', 90),
    ('spark.dynamicAllocation.executorIdleTimeout', 600),
    ('spark.executor.cores', 4),
    ('spark.default.parallelism', 90),
    ('spark.executor.memoryOverhead', '4g'),
    ('spark.driver.memoryOverhead', '4g'),
    ('spark.scheduler.mode', 'FAIR'),
    ('spark.kryoserializer.buffer.max', '512m'),
    ('spark.app.name', 'Creating training set vectors - JupyterHub version'),
])

# Stop the old context.
# `sc` is not created anywhere in this notebook; presumably the kernel injects
# it at startup -- TODO confirm.
sc.stop()

# And restart the context with the new configuration
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [2]:
import matplotlib
matplotlib.use('Agg')
%matplotlib inline

import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [3]:
import os
import os.path as osp
#import commands
import time
import random

import numpy as np

import numpy as np
from pyspark import SparkConf,SparkContext, StorageLevel
from pyspark.sql import Row, SQLContext, SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors


from datetime import datetime
import logging

# Log file for this run; the timestamp is rendered as hour_minute_day_month_year.
LogFile = datetime.now().strftime('Create_vectors_%H_%M_%d_%m_%Y.log')

logger = logging.getLogger('myapp')

# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise stack one more FileHandler per run and each message
# would be written once per stacked handler. Detach and close the file handlers
# left behind by earlier runs before attaching the new one.
for stale_handler in list(logger.handlers):
    if isinstance(stale_handler, logging.FileHandler):
        logger.removeHandler(stale_handler)
        stale_handler.close()

hdlr = logging.FileHandler(LogFile)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.INFO)

In [4]:
import gc # manual garbage collection (gc.collect()) to stop leaks after collect() calls

In [5]:
# Record the wall-clock and CPU start times of the run.
pgm_start = time.time()

# time.clock() was removed in Python 3.8. Prefer time.process_time() and fall
# back to time.clock only on interpreters that do not provide it.
_cpu_clock = getattr(time, 'process_time', None) or time.clock
pgm_startCpu = _cpu_clock()

In [6]:
# Run-level constants.
# NOTE(review): none of these names is referenced by the cells visible in this
# section; presumably later cells consume them -- TODO confirm. The grouping
# below follows the names only.

# Training-related values
optimizer = 'nadam'
num_models = 1
max_epochs = 1000
valid_size = 0.1
use_specz = False

# Batch and augmentation values
augment_count = 35
batch_size, batch_size2 = 1000, 5000

# Size limits
sequence_len = 256
limit = 1000000

In [7]:
# Builds a SQLContext on `sc`; the first cell already creates one the same way,
# so this assignment simply replaces it with an equivalent object.
sqlContext = SQLContext(sc)

In [8]:
# Make `plasticc` the current database for the SQL statements that follow.
sqlContext.sql("use plasticc")

DataFrame[]

### And finally, we create the full raw training set of feature vectors, unpadded, no calculations

Next cell illustrates the original structured record with hist as an array of arrays.

#### Note that there is no cast or wrapping of `collect_list` in an array, i.e. no `array(collect_list(...))`
This changes the shape of the retrieved array in the programs.

# NOTE!!
Be sure that the datatypes are explicitly defined in the collect_list statement!

Otherwise it (apparently) looks like Hive will implicitly cast to a string (? not sure), but the end result is that if you don't, the table's data footprint ends up almost twice as large as it should be.

In [10]:
# Query that builds one row per object.
#
# Inner query: joins training_set to training_set_metadata on object_id, packs
# five metadata columns into the `meta` array, exposes hostgal_specz as the
# double `specz`, and packs the observation columns into the `kv` map.
# Outer query: groups by object_id/target/meta/specz and gathers the
# observations with collect_list -- `band` holds the passbands cast to int and
# `hist` is a one-element ARRAY wrapping a NAMED_STRUCT of four float lists
# (mjd, flux, flux_err, detected).
#
# NOTE(review): there is no ORDER BY, and Spark documents collect_list as
# order-non-deterministic after a shuffle -- confirm downstream code does not
# rely on the observations being in time order.
# NOTE(review): the schema and plan outputs saved further down in this notebook
# show a `passband` field inside `hist` and double-typed elements, which this
# query does not produce -- those outputs look stale relative to this cell.
getVectorsSql="""
select object_id,target,meta,specz,
collect_list( int( a.kv['passband'])) as band,
ARRAY(NAMED_STRUCT(
    'mjd',                  collect_list(float(a.kv['mjd'])) ,
    'flux',                 collect_list(float(a.kv['flux'])) ,
    'flux_err',             collect_list(float(a.kv['flux_err'])) ,
    'detected',             collect_list(float(a.kv['detected'])) 
    )
) as hist
from
(
select ts.object_id,target,array(ddf,hostgal_specz, hostgal_photoz, hostgal_photoz_err, mwebv) as meta,double(hostgal_specz) as specz,
   MAP(
        'mjd', mjd,
        'passband',passband,
        'flux',flux,
        'flux_err',flux_err,
        'detected',detected
   ) AS kv
from training_set ts inner join training_set_metadata tsm on ts.object_id = tsm.object_id
) a
group by object_id, target,meta, specz
"""

# Define the DataFrame and mark it for caching; Spark evaluates lazily, so
# nothing is computed until an action runs on it.
vectors_df=sqlContext.sql(getVectorsSql).cache()   #.persist(StorageLevel.MEMORY_ONLY_SER_2)

#### Display the schema for the feature vectors

In [11]:
# Print the column tree (names, types, nullability) of vectors_df.
vectors_df.printSchema()

root
 |-- object_id: integer (nullable = true)
 |-- target: integer (nullable = true)
 |-- meta: array (nullable = false)
 |    |-- element: double (containsNull = true)
 |-- specz: double (nullable = true)
 |-- band: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- hist: array (nullable = false)
 |    |-- element: struct (containsNull = false)
 |    |    |-- mjd: array (nullable = true)
 |    |    |    |-- element: double (containsNull = true)
 |    |    |-- passband: array (nullable = true)
 |    |    |    |-- element: double (containsNull = true)
 |    |    |-- flux: array (nullable = true)
 |    |    |    |-- element: double (containsNull = true)
 |    |    |-- flux_err: array (nullable = true)
 |    |    |    |-- element: double (containsNull = true)
 |    |    |-- detected: array (nullable = true)
 |    |    |    |-- element: double (containsNull = true)



In [12]:
# Print the physical plan Spark has built for vectors_df.
vectors_df.explain()

== Physical Plan ==
InMemoryTableScan [object_id#5, target#22, meta#0, specz#1, band#3, hist#4]
   +- InMemoryRelation [object_id#5, target#22, meta#0, specz#1, band#3, hist#4], true, 10000, StorageLevel(disk, memory, deserialized, 1 replicas)
         +- ObjectHashAggregate(keys=[object_id#5, target#22, meta#0, specz#1], functions=[collect_list(kv#2[passband], 0, 0), collect_list(kv#2[mjd], 0, 0), collect_list(kv#2[passband], 0, 0), collect_list(kv#2[flux], 0, 0), collect_list(kv#2[flux_err], 0, 0), collect_list(kv#2[detected], 0, 0)])
            +- Exchange hashpartitioning(object_id#5, target#22, meta#0, specz#1, 200)
               +- ObjectHashAggregate(keys=[object_id#5, target#22, meta#0, specz#1], functions=[partial_collect_list(kv#2[passband], 0, 0), partial_collect_list(kv#2[mjd], 0, 0), partial_collect_list(kv#2[passband], 0, 0), partial_collect_list(kv#2[flux], 0, 0), partial_collect_list(kv#2[flux_err], 0, 0), partial_collect_list(kv#2[detected], 0, 0)])
                 

## And finally, we create the feature vector table in hive

In [13]:
# Destination table and write settings for the feature vectors.
# Earlier table names, kept for reference:
#TABLE='training_set_flat_vectors' - this is the flattened model for the elephas sequential test
#TABLE='training_set_vectors' - original full hist ARRAY - STRUCT - ARRAY
TABLE = 'training_raw_vectors_unpadded_no_calcs'
FORMAT = 'parquet'
MODE = 'append'

# NOTE(review): in 'append' mode, running this cell again adds the same rows to
# an already existing table a second time -- confirm that is intended.
(
    vectors_df.write
    .format(FORMAT)
    .mode(MODE)
    .saveAsTable(TABLE)
)
