# Create the full test vector dataframe using Spark temp tables

## First! we set up the Spark Context

In [1]:
# Set the new configuration
conf = SparkConf().setAll([('spark.executor.memory', '4g'),\
                           ('spark.driver.memory', '4g'),\
                           ('spark.driver.maxResultSize', 0), \
                           ('spark.shuffle.service.enabled', True), \
                           ('spark.dynamicAllocation.enabled', True), \
                           #('spark.executor.instances', 50), \
                           ('spark.dynamicAllocation.executorIdleTimeout', 600), \
                           ('spark.executor.cores', 4),\
                           ('spark.default.parallelism', 90),\
                           ('spark.executor.memoryOverhead', '4g'),\
                           ('spark.driver.memoryOverhead', '4g'),\
                           ('spark.scheduler.mode', 'FAIR'),\
                           ('spark.kryoserializer.buffer.max', '512m'),\
                           ('spark.app.name','Creating training set vectors - JupyterHub version')])# Show the current options




#                           ('spark.dynamicAllocation.maxExecutors', 90), \


# Stop the old context
sc.stop()

# And restart the context with the new configuration
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [2]:
import matplotlib
matplotlib.use('Agg')
%matplotlib inline

import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [3]:
import os
import os.path as osp
#import commands
import time
import random

import numpy as np

import numpy as np
from pyspark import SparkConf,SparkContext, StorageLevel
from pyspark.sql import Row, SQLContext, SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors


from datetime import datetime
LogFile=datetime.now().strftime('Create_vectors_%H_%M_%d_%m_%Y.log')

import logging
logger = logging.getLogger('myapp')
hdlr = logging.FileHandler(LogFile)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.INFO)

In [4]:
import gc # manual garbag collection to stop leaks on Collect() gc.collect()

In [5]:
pgm_start=time.time()
pgm_startCpu=time.clock()

In [6]:
augment_count = 35
batch_size = 1000
batch_size2 = 5000
optimizer = 'nadam'
num_models = 1
use_specz = False
valid_size = 0.1
max_epochs = 1000

limit = 1000000
sequence_len = 256

In [7]:
sqlContext = SQLContext(sc)

In [8]:
sqlContext.sql("use plasticc")

DataFrame[]

#### Refer to this great article, SQL at Scale with Apache Spark SQL and DataFrames

https://towardsdatascience.com/sql-at-scale-with-apache-spark-sql-and-dataframes-concepts-architecture-and-examples-c567853a702f

#### Benchmark snippets that may be usefull
https://community.cloudera.com/t5/Community-Articles/Spark-RDDs-vs-DataFrames-vs-SparkSQL/ta-p/246547

# Create the test vector set

In [9]:
testVectorsDF=sqlContext.sql("""
with rawData as
(
    select ts.object_id,
        0 target,
       array(0,0,0,0,ddf,hostgal_specz, hostgal_photoz,mwebv,case when hostgal_photoz > 0 then 1 else 0 end ,metaVal) as meta,
       double(hostgal_specz) as specz,
       MAP(
            'mjd', 0,
            'passband',passband,
            'flux',flux / mods.HistModifier,
            'flux_err',flux_err / mods.HistModifier,
            'fwd_int', (mjd - first_value(mjd) over W) / (tsm.hostgal_photoz + 1),
            'bwd_int', (mjd - first_value(mjd) over W) / (tsm.hostgal_photoz + 1),
            'detected',0,
            'source_wavelength', case 
                                    when ts.passband = 0 then 357 / (tsm.hostgal_photoz + 1)/1000
                                    when ts.passband = 1 then 477 / (tsm.hostgal_photoz + 1)/1000
                                    when ts.passband = 2 then 621 / (tsm.hostgal_photoz + 1)/1000
                                    when ts.passband = 3 then 754 / (tsm.hostgal_photoz + 1)/1000
                                    when ts.passband = 4 then 871 / (tsm.hostgal_photoz + 1)/1000
                                    else 1004 / (tsm.hostgal_photoz + 1)/1000
                                    end,
            'received_wavelength', 0
    
       ) AS kv
    from test_set_compressed ts 
        inner join test_set_metadata tsm 
            on ts.object_id = tsm.object_id 
        inner join (
            select
            object_id,
            log2(max(flux)- min(flux)) flux_pow,
            pow(2,log2(max(flux)- min(flux)) ) as HistModifier,
            log2(max(flux)- min(flux))/10 as metaVal
            from test_set_compressed
            group by object_id
    
        ) mods
        on ts.object_id = mods.object_id
    WINDOW W AS (PARTITION BY ts.object_id ORDER BY mjd)
) 
select object_id, target,meta,
collect_list(int(a.kv['passband']))as band,
collect_list(float(a.kv['mjd'])) as mjd,
collect_list(float(a.kv['flux'])) as flux,
collect_list(float(a.kv['flux_err'])) as flux_err,
collect_list(int(a.kv['detected'])) as detected,
collect_list(float(a.kv['fwd_int'])) as fwd_int,
collect_list(float(a.kv['bwd_int'])) as bwd_int,
collect_list(float(a.kv['source_wavelength'])) as source_wavelength,
collect_list(float(a.kv['received_wavelength'])) as received_wavelength
from rawData a
group by object_id, target,meta
""")

In [10]:
MODE='overwrite'
FORMAT='parquet'
#TABLE='training_set_flat_vectors' - this is the flattened model for the elephas sequential test
TABLE='elephas_test_set' #- original full hist ARRAY - STRUCT - ARRAY
#TABLE='training_set_elephas'

testVectorsDF.write.mode(MODE).format(FORMAT).saveAsTable(TABLE)

## Create the base table

In [40]:
trainingVectorsDF=sqlContext.sql("""
with rawData as
(
    select ts.object_id,
        case 
                when target= 6 then 0
                when target= 15 then 1
                when target= 16 then 2
                when target= 42 then 3
                when target= 52 then 4
                when target= 53 then 5
                when target= 62 then 6
                when target= 64 then 7
                when target= 65 then 8
                when target= 67 then 9
                when target= 88 then 10
                when target= 90 then 11
                when target= 92 then 12
                when target= 95 then 13
                when target= 99 then 14
                else 14
                end target,
       array(0,0,0,0,ddf,hostgal_specz, hostgal_photoz,mwebv,case when hostgal_photoz > 0 then 1 else 0 end ,metaVal) as meta,
       double(hostgal_specz) as specz,
       MAP(
            'mjd', 0,
            'passband',passband,
            'flux',flux / mods.HistModifier,
            'flux_err',flux_err / mods.HistModifier,
            'fwd_int', (mjd - first_value(mjd) over W) / (tsm.hostgal_photoz + 1),
            'bwd_int', (mjd - first_value(mjd) over W) / (tsm.hostgal_photoz + 1),
            'detected',0,
            'source_wavelength', case 
                                    when ts.passband = 0 then 357 / (tsm.hostgal_photoz + 1)/1000
                                    when ts.passband = 1 then 477 / (tsm.hostgal_photoz + 1)/1000
                                    when ts.passband = 2 then 621 / (tsm.hostgal_photoz + 1)/1000
                                    when ts.passband = 3 then 754 / (tsm.hostgal_photoz + 1)/1000
                                    when ts.passband = 4 then 871 / (tsm.hostgal_photoz + 1)/1000
                                    else 1004 / (tsm.hostgal_photoz + 1)/1000
                                    end,
            'received_wavelength', 0
    
       ) AS kv
    from training_set ts 
        inner join training_set_metadata tsm 
            on ts.object_id = tsm.object_id 
        inner join (
            select
            object_id,
            log2(max(flux)- min(flux)) flux_pow,
            pow(2,log2(max(flux)- min(flux)) ) as HistModifier,
            log2(max(flux)- min(flux))/10 as metaVal
            from training_set
            group by object_id
    
        ) mods
        on ts.object_id = mods.object_id
    WINDOW W AS (PARTITION BY ts.object_id ORDER BY mjd)
) 
select object_id, target,meta,
collect_list(int(a.kv['passband']))as band,
collect_list(float(a.kv['mjd'])) as mjd,
collect_list(float(a.kv['flux'])) as flux,
collect_list(float(a.kv['flux_err'])) as flux_err,
collect_list(int(a.kv['detected'])) as detected,
collect_list(float(a.kv['fwd_int'])) as fwd_int,
collect_list(float(a.kv['bwd_int'])) as bwd_int,
collect_list(float(a.kv['source_wavelength'])) as source_wavelength,
collect_list(float(a.kv['received_wavelength'])) as received_wavelength
from rawData a
group by object_id, target,meta
""")

In [41]:
MODE='overwrite'
FORMAT='parquet'
#TABLE='training_set_flat_vectors' - this is the flattened model for the elephas sequential test
TABLE='elephas_training_set' #- original full hist ARRAY - STRUCT - ARRAY
#TABLE='training_set_elephas'

trainingVectorsDF.write.mode(MODE).format(FORMAT).saveAsTable(TABLE)

## Get the highest object_id

In [102]:
for i in range(10):
    print(i)
    #objAdjuster=sqlContext.sql("select max(object_id) adjustment from elephas_training_set").collect()[0][0]
    print(objAdjuster)
    sql= """
    with NEW_Metadata as (
    select object_id,
    ra,decl,gal_l,gal_b,ddf,hostgal_specz,
    rand()*((hostgal_photoz+hostgal_photoz_err)-((hostgal_photoz-hostgal_photoz_err)/1.5))+((hostgal_photoz-hostgal_photoz_err)/1.5) hostgal_photoz,
    hostgal_photoz_err, distmod,mwebv,target
    from training_set_metadata
    )
    select object_id, ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
    from NEW_Metadata    
    """
    
    newMetadataDF=sqlContext.sql(sql)
    newMetadataDF.registerTempTable("AUGMENTED_METADATA")
    
    sql="""
    with New_Training_set as(
        select ts.object_id, mjd, passband,flux,
        rand()*((flux+flux_err)-((flux-flux_err)/1.5))+((flux-flux_err)/1.5)  newFlux,
        flux_err,detected,
        (1+hostgal_photoz)/( 1+ (rand()*((hostgal_photoz+hostgal_photoz_err)-((hostgal_photoz-hostgal_photoz_err)/1.5))+((hostgal_photoz-hostgal_photoz_err)/1.5))) dt
        from training_set ts
            inner join training_set_metadata tsm
                on ts.object_id = tsm.object_id
    )
    select object_id,
    mjd*dt as mjd,  passband, newFlux as flux, flux_err, detected
    from New_Training_set    
    """
    
    newTrainingSetDF=sqlContext.sql(sql)
    newTrainingSetDF.registerTempTable("AUGMENTED_TRAINING_SET")
    
    sql="""
    with rawData as
    (
        select ts.object_id,
            case 
                    when target= 6 then 0
                    when target= 15 then 1
                    when target= 16 then 2
                    when target= 42 then 3
                    when target= 52 then 4
                    when target= 53 then 5
                    when target= 62 then 6
                    when target= 64 then 7
                    when target= 65 then 8
                    when target= 67 then 9
                    when target= 88 then 10
                    when target= 90 then 11
                    when target= 92 then 12
                    when target= 95 then 13
                    when target= 99 then 14
                    else 14
                    end target,
           array(0,0,0,0,ddf,hostgal_specz, hostgal_photoz,mwebv,case when hostgal_photoz > 0 then 1 else 0 end ,metaVal) as meta,
           double(hostgal_specz) as specz,
           MAP(
                'mjd', 0,
                'passband',passband,
                'flux',flux / mods.HistModifier,
                'flux_err',flux_err / mods.HistModifier,
                'fwd_int', (mjd - first_value(mjd) over W) / (tsm.hostgal_photoz + 1),
                'bwd_int', (mjd - first_value(mjd) over W) / (tsm.hostgal_photoz + 1),
                'detected',0,
                'source_wavelength', case 
                                        when ts.passband = 0 then 357 / (tsm.hostgal_photoz + 1)/1000
                                        when ts.passband = 1 then 477 / (tsm.hostgal_photoz + 1)/1000
                                        when ts.passband = 2 then 621 / (tsm.hostgal_photoz + 1)/1000
                                        when ts.passband = 3 then 754 / (tsm.hostgal_photoz + 1)/1000
                                        when ts.passband = 4 then 871 / (tsm.hostgal_photoz + 1)/1000
                                        else 1004 / (tsm.hostgal_photoz + 1)/1000
                                        end,
                'received_wavelength', 0

           ) AS kv
        from {} ts 
            inner join {} tsm 
                on ts.object_id = tsm.object_id 
            inner join (
                select
                object_id,
                log2(max(flux)- min(flux)) flux_pow,
                pow(2,log2(max(flux)- min(flux)) ) as HistModifier,
                log2(max(flux)- min(flux))/10 as metaVal
                from training_set
                group by object_id

            ) mods
            on ts.object_id = mods.object_id
        WINDOW W AS (PARTITION BY ts.object_id ORDER BY mjd)
    ) 
    select object_id, target,meta,
    collect_list(int(a.kv['passband']))as band,
    collect_list(float(a.kv['mjd'])) as mjd,
    collect_list(float(a.kv['flux'])) as flux,
    collect_list(float(a.kv['flux_err'])) as flux_err,
    collect_list(int(a.kv['detected'])) as detected,
    collect_list(float(a.kv['fwd_int'])) as fwd_int,
    collect_list(float(a.kv['bwd_int'])) as bwd_int,
    collect_list(float(a.kv['source_wavelength'])) as source_wavelength,
    collect_list(float(a.kv['received_wavelength'])) as received_wavelength
    from rawData a
    group by object_id, target,meta    
    """.format("AUGMENTED_TRAINING_SET", "AUGMENTED_METADATA")
    
    trainingVectorsDF=sqlContext.sql(sql)
    trainingVectorsDF.count()
    MODE='append'
    FORMAT='parquet'
    TABLE='elephas_training_set'
    trainingVectorsDF.write.mode(MODE).format(FORMAT).saveAsTable(TABLE)
    
    print("finished!")

0
130779836
finished!
1
130779836
finished!
2
130779836
finished!
3
130779836
finished!
4
130779836
finished!
5
130779836
finished!
6
130779836
finished!
7
130779836
finished!
8
130779836
finished!
9
130779836
finished!


In [103]:
testDF=sqlContext.sql("select * from elephas_training_set")
testDF.count()

196200

# Everything below is just testing

In [62]:
objAdjuster=sqlContext.sql("select max(object_id) adjustment from elephas_training_set").collect()[0][0]

## First, we create the new metadata table



In [73]:
sql="""
with NEW_Metadata as (
select object_id,
ra,decl,gal_l,gal_b,ddf,hostgal_specz,
rand()*((hostgal_photoz+hostgal_photoz_err)-((hostgal_photoz-hostgal_photoz_err)/1.5))+((hostgal_photoz-hostgal_photoz_err)/1.5) hostgal_photoz,
hostgal_photoz_err, distmod,mwebv,target
from training_set_metadata
)
select object_id, ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
from NEW_Metadata"""

In [74]:
newMetadataDF=sqlContext.sql(sql)
newMetadataDF.registerTempTable("AUGMENTED_METADATA")

In [75]:
newMetadataDF.count()

7848

## Now create the augmented training set

In [76]:
sql="""
with New_Training_set as(
    select ts.object_id, mjd, passband,flux,
    rand()*((flux+flux_err)-((flux-flux_err)/1.5))+((flux-flux_err)/1.5)  newFlux,
    flux_err,detected,
    (1+hostgal_photoz)/( 1+ (rand()*((hostgal_photoz+hostgal_photoz_err)-((hostgal_photoz-hostgal_photoz_err)/1.5))+((hostgal_photoz-hostgal_photoz_err)/1.5))) dt
    from training_set ts
        inner join training_set_metadata tsm
            on ts.object_id = tsm.object_id
)
select object_id,
mjd*dt as mjd,  passband, newFlux as flux, flux_err, detected
from New_Training_set
"""

In [66]:
sql

'\nwith New_Training_set as(\n    select ts.object_id, mjd, passband,flux,\n    rand()*((flux+flux_err)-((flux-flux_err)/1.5))+((flux-flux_err)/1.5)  newFlux,\n    flux_err,detected,\n    (1+hostgal_photoz)/( 1+ (rand()*((hostgal_photoz+hostgal_photoz_err)-((hostgal_photoz-hostgal_photoz_err)/1.5))+((hostgal_photoz-hostgal_photoz_err)/1.5))) dt\n    from training_set ts\n        inner join training_set_metadata tsm\n            on ts.object_id = tsm.object_id\n)\nselect object_id+130779836+1 object_id,\nmjd*dt as mjd,  passband, newFlux as flux, flux_err, detected\nfrom New_Training_set\n'

In [77]:
newTrainingSetDF=sqlContext.sql(sql)
newTrainingSetDF.registerTempTable("AUGMENTED_TRAINING_SET")

In [78]:
newTrainingSetDF.count()

1421705

In [93]:
sql="""
with rawData as
(
    select ts.object_id,
        case 
                when target= 6 then 0
                when target= 15 then 1
                when target= 16 then 2
                when target= 42 then 3
                when target= 52 then 4
                when target= 53 then 5
                when target= 62 then 6
                when target= 64 then 7
                when target= 65 then 8
                when target= 67 then 9
                when target= 88 then 10
                when target= 90 then 11
                when target= 92 then 12
                when target= 95 then 13
                when target= 99 then 14
                else 14
                end target,
       array(0,0,0,0,ddf,hostgal_specz, hostgal_photoz,mwebv,case when hostgal_photoz > 0 then 1 else 0 end ,metaVal) as meta,
       double(hostgal_specz) as specz,
       MAP(
            'mjd', 0,
            'passband',passband,
            'flux',flux / mods.HistModifier,
            'flux_err',flux_err / mods.HistModifier,
            'fwd_int', (mjd - first_value(mjd) over W) / (tsm.hostgal_photoz + 1),
            'bwd_int', (mjd - first_value(mjd) over W) / (tsm.hostgal_photoz + 1),
            'detected',0,
            'source_wavelength', case 
                                    when ts.passband = 0 then 357 / (tsm.hostgal_photoz + 1)/1000
                                    when ts.passband = 1 then 477 / (tsm.hostgal_photoz + 1)/1000
                                    when ts.passband = 2 then 621 / (tsm.hostgal_photoz + 1)/1000
                                    when ts.passband = 3 then 754 / (tsm.hostgal_photoz + 1)/1000
                                    when ts.passband = 4 then 871 / (tsm.hostgal_photoz + 1)/1000
                                    else 1004 / (tsm.hostgal_photoz + 1)/1000
                                    end,
            'received_wavelength', 0
    
       ) AS kv
    from {} ts 
        inner join {} tsm 
            on ts.object_id = tsm.object_id 
        inner join (
            select
            object_id,
            log2(max(flux)- min(flux)) flux_pow,
            pow(2,log2(max(flux)- min(flux)) ) as HistModifier,
            log2(max(flux)- min(flux))/10 as metaVal
            from training_set
            group by object_id
    
        ) mods
        on ts.object_id = mods.object_id
    WINDOW W AS (PARTITION BY ts.object_id ORDER BY mjd)
) 
select object_id, target,meta,
collect_list(int(a.kv['passband']))as band,
collect_list(float(a.kv['mjd'])) as mjd,
collect_list(float(a.kv['flux'])) as flux,
collect_list(float(a.kv['flux_err'])) as flux_err,
collect_list(int(a.kv['detected'])) as detected,
collect_list(float(a.kv['fwd_int'])) as fwd_int,
collect_list(float(a.kv['bwd_int'])) as bwd_int,
collect_list(float(a.kv['source_wavelength'])) as source_wavelength,
collect_list(float(a.kv['received_wavelength'])) as received_wavelength
from rawData a
group by object_id, target,meta
""".format("AUGMENTED_TRAINING_SET", "AUGMENTED_METADATA")


In [94]:
trainingVectorsDF=sqlContext.sql(sql)

In [95]:
trainingVectorsDF.count()

7848

In [70]:
trainingVectorsDF.printSchema()

root
 |-- object_id: integer (nullable = true)
 |-- target: integer (nullable = false)
 |-- meta: array (nullable = false)
 |    |-- element: double (containsNull = true)
 |-- band: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- mjd: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- flux: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- flux_err: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- detected: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- fwd_int: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- bwd_int: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- source_wavelength: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- received_wavelength: array (nullable = true)
 |    |-- element: float (containsNull = true)



In [71]:
trainingVectorsDF.count()

0

In [96]:
MODE='append'
FORMAT='parquet'
#TABLE='training_set_flat_vectors' - this is the flattened model for the elephas sequential test
TABLE='elephas_training_set' #- original full hist ARRAY - STRUCT - ARRAY
#TABLE='training_set_elephas'

trainingVectorsDF.write.mode(MODE).format(FORMAT).saveAsTable(TABLE)