In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import math
import copy
import random
import time
import sys

from pyspark import SparkConf,SparkContext
from pyspark.sql import Row, SQLContext, SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors

In [2]:
# Wrap the SparkContext `sc` in a SQLContext so the cells below can run Spark SQL.
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably it is
# injected by the PySpark kernel/shell -- confirm before running elsewhere.
sqlContext = SQLContext(sc)
# Switch the current database to `plasticc`; the queries below use unqualified table names.
sqlContext.sql("use plasticc")

DataFrame[]

# Create the padded training_set
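Every object's light curve is brought to a fixed length of 256 observations: objects with fewer observations are padded with all-zero rows, and objects with more keep only their 256 most recent observations.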

In [3]:
# Source table names substituted into the SQL templates of the following cells:
# the per-observation table first, then the per-object metadata table.
training_set, training_metadata = "training_set", "training_set_metadata"

In [4]:
# Build a fixed-length view of the training light curves (256 rows per object).
#
#   cnt       - the integers 1..256, produced by numbering rows of the metadata
#               table (so that table must contain at least 256 rows)
#   objects   - one all-zero padding row per object_id found in the training set
#   baseline  - objects x cnt: 256 placeholder slots for every object
#   train_set - real observations numbered per object, most recent mjd first
#   paddedRev - each slot filled with the observation that has the same rownum,
#               or with the zero padding when no such observation exists;
#               observations numbered above 256 have no slot and are dropped
#
# Fix: the flux column used to be filled from train_set.mjd (copy/paste slip),
# so every real observation carried its timestamp instead of its flux.
# coalesce() replaces the repeated "case when x is null then pad else x end"
# pattern, which is equivalent and leaves no room for that kind of mismatch.
paddedSQL="""
with
cnt
as
(
    select rownum from
    (
        select row_number() over (ORDER BY object_id) as rownum
        from {}
    ) a
    where rownum <=256
),
objects as (select object_id, 0 padMJD, 0 padPassband,0 padFlux, 0 padFlux_err,0 padDetected from {} group by object_id),
baseline as (select * from objects CROSS JOIN cnt ), -- cartesian product with 256 values to use as the baseline
train_set as (select *, row_number() over (partition by object_id order by mjd desc) as rownum from {}),
paddedRev as (
    select baseline.object_id,
    coalesce(train_set.mjd, baseline.padMJD) mjd,
    coalesce(train_set.passband, baseline.padPassband) passband,
    coalesce(train_set.flux, baseline.padFlux) flux,
    coalesce(train_set.flux_err, baseline.padFlux_err) flux_err,
    coalesce(train_set.detected, baseline.padDetected) detected
    from baseline left outer join train_set on baseline.object_id = train_set.object_id and baseline.rownum=train_set.rownum
    order by baseline.object_id, mjd desc
)
select object_id, mjd, passband, flux, flux_err, detected from paddedRev order by object_id, mjd
""".format(training_metadata,training_set,training_set)
paddedTrainingSet_DF = sqlContext.sql(paddedSQL)
# Expose the padded light curves to later SQL cells under a stable name.
paddedTrainingSet_DF.registerTempTable("PADDED_TRAINING_SET")

In [5]:
# Build an augmented copy of the metadata table and expose it as AUGMENTED_METADATA.
# hostgal_photoz is replaced by rand() * (hi - lo) + lo, where
#   lo = (hostgal_photoz - hostgal_photoz_err) / 1.5
#   hi =  hostgal_photoz + hostgal_photoz_err
# All other selected columns are passed through unchanged.
# NOTE(review): rand() has no explicit seed. Whether re-evaluating this temp view
# yields fresh draws depends on how Spark assigns the seed -- verify that repeated
# augmentation passes really produce different values.
metadataSQL="""
select object_id,
ra,decl,gal_l,gal_b,ddf,hostgal_specz,
rand()*((hostgal_photoz+hostgal_photoz_err)-((hostgal_photoz-hostgal_photoz_err)/1.5))+((hostgal_photoz-hostgal_photoz_err)/1.5) hostgal_photoz,
hostgal_photoz_err, distmod,mwebv,target
from {}
""".format(training_metadata)
metadata_DF = sqlContext.sql(metadataSQL)
metadata_DF.registerTempTable("AUGMENTED_METADATA")

In [6]:
# Build the augmented light curves from PADDED_TRAINING_SET and expose them as
# AUGMENTED_TRAINING.
#   newFlux - rand() * (hi - lo) + lo with lo = (flux - flux_err) / 1.5 and
#             hi = flux + flux_err; it replaces flux in the output
#   dt      - (1 + hostgal_photoz) / (1 + z'), where z' is drawn the same way from
#             hostgal_photoz / hostgal_photoz_err; mjd is multiplied by dt
# The metadata table name now comes from `training_metadata` (same value as the
# previously hard-coded name) so that all cells follow the same configuration.
# NOTE(review): dt is computed in the row-level join, so rand() appears to be
# drawn per observation rather than once per object -- confirm that is intended.
# NOTE(review): this cell reuses the name `paddedSQL` and therefore overwrites the
# query text of the padding cell; kept as-is in case later cells rely on it.
paddedSQL="""
with New_Training_set as(
    select ts.object_id, ts.mjd, ts.passband,ts.flux,
    rand()*((ts.flux+ts.flux_err)-((ts.flux-ts.flux_err)/1.5))+((ts.flux-ts.flux_err)/1.5)  newFlux,
    ts.flux_err,ts.detected,
    (1+tsm.hostgal_photoz)/( 1+ (rand()*((tsm.hostgal_photoz+tsm.hostgal_photoz_err)-((tsm.hostgal_photoz-tsm.hostgal_photoz_err)/1.5))+((tsm.hostgal_photoz-tsm.hostgal_photoz_err)/1.5))) dt
    from {} ts
        inner join {} tsm
            on ts.object_id = tsm.object_id
)
select new_training_set.object_id,
new_training_set.mjd*new_training_set.dt as mjd,  new_training_set.passband, new_training_set.newflux as flux,
new_training_set.flux_err, new_training_set.detected
from New_Training_set
""".format("PADDED_TRAINING_SET", training_metadata)

augmented_DF = sqlContext.sql(paddedSQL)
augmented_DF.registerTempTable("AUGMENTED_TRAINING")


In [7]:
def AugmentTheTableSparkAppend(table='training_set_flat_augmented_vectors',
                               mode='append',
                               file_format='parquet'):
    """Flatten the augmented light curves into one row per object and save them.

    Reads the temp views AUGMENTED_TRAINING and AUGMENTED_METADATA through the
    module-level ``sqlContext`` and writes the result with ``saveAsTable``.

    Query outline:
      * CTE1   - per object, ordered by ascending mjd: time since the first
                 observation (mjdInt), gap to the previous observation
                 (deltaMjd, 0 for the first row) and a row number.
      * CTE2   - per object, ordered by descending mjd: latest mjd minus this
                 mjd (rval) and a row number.
      * meta   - metadata plus a 0/1 ``photoz_positive`` flag and the target
                 class remapped to 0..14 (unknown classes map to 14).
      * struct - one key/value map per observation, joined with the metadata.
      * final  - group by object and collect every map entry into a list.

    Fix: the map stores the value under the key 'received_frequency', but the
    final select looked up 'received_wavelength', a key that is never created,
    so the column never received any data. The lookup now uses the existing key;
    the output column keeps its name ``received_wavelength`` so the schema of
    the table being appended to does not change.

    NOTE(review): CTE1 numbers rows oldest-first and CTE2 newest-first, and the
    two are joined on rownum, so the n-th earliest observation is paired with
    the rval of the n-th latest one -- confirm this reversal is intended.
    NOTE(review): collect_list is used without an explicit ordering; confirm
    that downstream code does not rely on the lists being sorted by time.

    Args:
        table: name of the table to write to.
        mode: DataFrameWriter save mode.
        file_format: storage format passed to DataFrameWriter.format.

    Returns:
        None. The only effect is the table write.
    """
    augmentInsertSQL="""
    with 
        CTE1 as ( 
            select object_id, mjd,
            mjd - first_value(mjd) over w as mjdInt,
            case when lag(mjd) OVER w is null then
                0
            else
                mjd - lag(mjd) over w 
            end as deltaMjd,
            passband,
            flux,
            flux_err,
            detected,
            row_number() OVER w as rownum
            from {}
            WINDOW w AS (PARTITION BY object_id ORDER BY mjd)
        ),
            CTE2 as (
            select object_id, --mjd, lag(mjd) OVER x as lag,
            --first_value(mjd) OVER x as fval,
            first_value(mjd) OVER x - mjd as rval,
            row_number() OVER x as rownum
            from {}
            WINDOW x AS (PARTITION BY object_id ORDER BY mjd DESC)
        ),
        meta as (
            select object_id, gal_l, gal_b, ddf, hostgal_specz, hostgal_photoz, hostgal_photoz_err, mwebv,target,
            case when hostgal_photoz > 0 
                then 1  -- CAST(1 AS BOOLEAN)
                else 0 --CAST(0 AS BOOLEAN)
                end as photoz_positive,
            --6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95, 99
            case 
                when target= 6 then 0
                when target= 15 then 1
                when target= 16 then 2
                when target= 42 then 3
                when target= 52 then 4
                when target= 53 then 5
                when target= 62 then 6
                when target= 64 then 7
                when target= 65 then 8
                when target= 67 then 9
                when target= 88 then 10
                when target= 90 then 11
                when target= 92 then 12
                when target= 95 then 13
                when target= 99 then 14
                else 14
                end mapped_target

            from {}
        ),
        struct as
        (
            select CTE1.object_id,meta.mapped_target as target,
            array(0,0,0,0,ddf,hostgal_specz, hostgal_photoz,mwebv,photoz_positive,0) as meta,
            double(hostgal_specz) as specz,
            MAP(
                'interval', mjdInt,
                'deltaMjd', deltaMjd,
                'passband',passband,
                'rval', rval,
                'flux',flux,
                'flux_err',flux_err,
                'detected',detected,
                'received_frequency', case 
                                        when CTE1.passband = 0 then 300000 /357
                                        when CTE1.passband = 1 then 300000 /477
                                        when CTE1.passband = 2 then 300000 /621
                                        when CTE1.passband = 3 then 300000 /754
                                        when CTE1.passband = 4 then 300000 /871
                                        else  300000 / 1004
                                        end,
                'source_wavelength', case 
                                        when CTE1.passband = 0 then 357 / (meta.hostgal_photoz + 1)
                                        when CTE1.passband = 1 then 477 / (meta.hostgal_photoz + 1)
                                        when CTE1.passband = 2 then 621 / (meta.hostgal_photoz + 1)
                                        when CTE1.passband = 3 then 754 / (meta.hostgal_photoz + 1)
                                        when CTE1.passband = 4 then 871 / (meta.hostgal_photoz + 1)
                                        else 1004 / (meta.hostgal_photoz + 1)
                                        end

            ) AS kv
            from CTE1 
                inner join CTE2
                    on CTE1.object_id=CTE2.object_id
                    and CTE1. rownum=CTE2.rownum
                inner join meta
                    on CTE1.object_id = meta.object_id
        )    
    select object_id,meta,target,specz,
    collect_list(a.kv['passband'])  as band,
    collect_list(a.kv['interval']) as interval,
    collect_list(a.kv['deltaMjd']) as deltaMjd,
    collect_list(a.kv['rval']) as rval,
    collect_list(a.kv['flux']) as flux,
    collect_list(a.kv['flux_err']) as flux_err,
    collect_list(a.kv['detected']) as detected,
    collect_list(a.kv['source_wavelength']) as source_wavelength,
    -- key must match the one created in MAP(...) above; column name kept for schema compatibility
    collect_list(a.kv['received_frequency']) as received_wavelength

    from struct a
    group by object_id, meta, target,specz
    """.format("AUGMENTED_TRAINING","AUGMENTED_TRAINING","AUGMENTED_METADATA")
    augmented_df=sqlContext.sql(augmentInsertSQL)

    # Defaults reproduce the original hard-coded behaviour:
    # append parquet data to training_set_flat_augmented_vectors.
    augmented_df.write.mode(mode).format(file_format).saveAsTable(table)

In [9]:
# Number of augmentation passes; each pass appends one more batch to the output table.
AUGMENT_PASSES = 12

start=time.time()
# time.clock() was removed in Python 3.8. process_time() is the supported way to
# read the CPU time of this Python process (work done by Spark is not included).
startCpu=time.process_time()

# First off, load the initial set of training vectors that seeds the augmented table.
#trainingVectorsDF=sqlContext.sql("select * from training_set_vectors")- original full hist ARRAY - STRUCT - ARRAY
trainingVectorsDF=sqlContext.sql("select * from training_set_flat_vectors")

#MODE='overwrite'
#FORMAT='parquet'
#TABLE='training_set_augmented_vectors'  - original full hist ARRAY - STRUCT - ARRAY
#TABLE='training_set_flat_augmented_vectors'


# Uncomment the MODE/FORMAT/TABLE lines above and the next two lines when creating
# the table from scratch; leave them commented out when only augmenting.
#trainingVectorsDF.write.mode(MODE).format(FORMAT).saveAsTable(TABLE)
#print("Initial insertion complete...")

# And now, we augment!
# NOTE(review): confirm that each pass really produces different random draws;
# the rand() calls live in temp views defined once in earlier cells.
for i in range(AUGMENT_PASSES):
    print(i)
    AugmentTheTableSparkAppend()
    print("done!")

# `end` / `endCpu` hold durations (seconds), not timestamps.
end=time.time()-start
endCpu=time.process_time()-startCpu
print("--- Full augment - Elapsed - %s seconds - Cpu seconds %s ---" % (end, endCpu))


0
done!
1
done!
2
done!
3
done!
4
done!
5
done!
6
done!
7
done!
8
done!
9
done!
10
done!
11
done!
--- Full augment - Elapsed - 1165.3623864650726 seconds - Cpu seconds 0.5361880000000001 ---
