In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import math
import copy
import random
import time
import sys

from pyspark import SparkConf,SparkContext
from pyspark.sql import Row, SQLContext, SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors

In [2]:
from keras.layers import *
from keras.models import Model, load_model
from keras.optimizers import Adam, Nadam, SGD
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [3]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix

In [4]:
import tensorflow as tf

In [5]:
sqlContext = SQLContext(sc)

In [6]:
sqlContext.sql("use plasticc")

DataFrame[]

In [7]:
augment_count = 25
batch_size = 1000
batch_size2 = 5000
optimizer = 'nadam'
num_models = 1
use_specz = False
valid_size = 0.1
max_epochs = 1000

limit = 1000000
sequence_len = 256

In [8]:
classes = np.array([6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95, 99], dtype='int32')
class_names = ['class_6','class_15','class_16','class_42','class_52','class_53','class_62','class_64','class_65','class_67','class_88','class_90','class_92','class_95','class_99']
class_weight = {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1, 99: 1}

# LSST passbands (nm)  u    g    r    i    z    y      
passbands = np.array([357, 477, 621, 754, 871, 1004], dtype='float32')

In [9]:
def append_data(list_x, list_y = None):
    X = {}
    for k in list_x[0].keys():

        list = [x[k] for x in list_x]
        X[k] = np.concatenate(list)

    if list_y is None:
        return X
    else:
        return X, np.concatenate(list_y)

In [25]:
def get_wtable(df):
    #x=np.array(raw_vectorsDF.select('target').collect())
    
    all_y = np.array(df.select('target').collect(), dtype = 'int32')

    y_count = np.unique(all_y, return_counts=True)[1]

    wtable = np.ones(len(classes))

    for i in range(0, y_count.shape[0]):
        wtable[i] = y_count[i] / all_y.shape[0]

    return wtable

In [11]:
def get_keras_data(itemslist):

    keys = itemslist[0].keys()
    print('creating X')
    X = {
            'id': np.array([i['id'] for i in itemslist], dtype='int32'),
            'meta': np.array([i['meta'] for i in itemslist]),
            'band': pad_sequences([i['band'] for i in itemslist], maxlen=sequence_len, dtype='int32'),
            'hist': pad_sequences([i['hist'] for i in itemslist], maxlen=sequence_len, dtype='float32'),
        }
    print('creating Y')
    Y = to_categorical([i['target'] for i in itemslist], num_classes=len(classes))

    X['hist'][:,:,0] = 0 # remove abs time
#    X['hist'][:,:,1] = 0 # remove flux
#    X['hist'][:,:,2] = 0 # remove flux err
    X['hist'][:,:,3] = 0 # remove detected flag
#    X['hist'][:,:,4] = 0 # remove fwd intervals
#    X['hist'][:,:,5] = 0 # remove bwd intervals
#    X['hist'][:,:,6] = 0 # remove source wavelength
    X['hist'][:,:,7] = 0 # remove received wavelength

    return X, Y

In [12]:
def set_intervals(sample):

    hist = sample['hist']
    band = sample['band']

    hist[:,4] = np.ediff1d(hist[:,0], to_begin = [0])
    hist[:,5] = np.ediff1d(hist[:,0], to_end = [0])

In [13]:
def copy_sample(s, augmentate=True):
    c = copy.deepcopy(s)

    if not augmentate:
        return c

    band = []
    hist = []

    drop_rate = 0.3

    # drop some records
    for k in range(s['band'].shape[0]):
        if random.uniform(0, 1) >= drop_rate:
            band.append(s['band'][k])
            hist.append(s['hist'][k])

    c['hist'] = np.array(hist, dtype='float32')
    c['band'] = np.array(band, dtype='int32')

    set_intervals(c)
            
    new_z = random.normalvariate(c['meta'][5], c['meta'][6] / 1.5) # hostgal_photoz and hostgal_photoz_err
    new_z = max(new_z, 0)
    new_z = min(new_z, 5)

    dt = (1 + c['meta'][5]) / (1 + new_z) # hostgal_photoz
    c['meta'][5] = new_z

    # augmentation for flux
    c['hist'][:,1] = np.random.normal(c['hist'][:,1], c['hist'][:,2] / 1.5) # flux and flux_err

    # multiply time intervals and wavelength to apply augmentation for red shift
    c['hist'][:,0] *= dt
    c['hist'][:,4] *= dt
    c['hist'][:,5] *= dt
    c['hist'][:,6] *= dt

    return c

In [14]:
def normalize_counts(samples, wtable, augmentate):
    maxpr = np.max(wtable)
    counts = maxpr / wtable

    res = []
    index = 0
    for s in samples:

        index += 1
        print('Normalizing {0}/{1}   '.format(index, len(samples)), end='\r')

        res.append(s)
        count = int(3 * counts[s['target']]) - 1

        for i in range(0, count):
            res.append(copy_sample(s, augmentate))

    print()

    return res

In [15]:
def augmentate(samples, gl_count, exgl_count):

    res = []
    index = 0
    for s in samples:

        index += 1
        
        if index % 1000 == 0:
            print('Augmenting {0}/{1}   '.format(index, len(samples)), end='\r')

        count = gl_count if (s['meta'][8] == 0) else exgl_count

        for i in range(0, count):
            res.append(copy_sample(s))

    print()
    return res

In [29]:
def get_data(raw_vectors_df, extragalactic=None, use_specz=False):

    samples = []
    list_objects = map(lambda row: row.asDict(), raw_vectorsDF.collect())
    object_vectors = {object['object_id']: object for object in list_objects}


    for key in object_vectors.keys():
        print(key)
        i=object_vectors.get(key)

        id=i.get('object_id')

        sample = {}
        sample['id'] = int(id)

        # 'object_id', 'target', 'meta', 'specz', 'band', 'hist'

        #sample['target'] = int(i.get('target'))
        sample['target'] = np.where(classes == int(i.get('target')))[0][0] # positional index of the classes array

        meta=np.array(i.get('meta'), dtype='float32')

        sample['meta'] = np.zeros(10, dtype = 'float32')

            #sample['meta'][4] = meta['ddf']					from meta column array meta[0]
            #sample['meta'][5] = meta['hostgal_photoz']			from meta column array meta[2]
            #sample['meta'][6] = meta['hostgal_photoz_err']		from meta column array meta[3]
            #sample['meta'][7] = meta['mwebv']					from meta column array meta[4]
            #sample['meta'][8] = float(meta['hostgal_photoz']) > 0  returns True or false

            #sample['specz'] = float(meta['hostgal_specz'])		from meta column array meta[1]


        sample['meta'][4] = meta[0]
        sample['meta'][5] = meta[2]
        sample['meta'][6] = meta[3]
        sample['meta'][7] = meta[4]
        sample['meta'][8] = float(meta[2]) > 0

        sample['specz'] = float(meta[1])    

        if use_specz:
            sample['meta'][5] = float(meta['hostgal_specz'])
            sample['meta'][6] = 0.0

        z = float(sample['meta'][5])

        j=i.get('hist')
        #mjd=np.array(j[0][0], dtype='float32')
        mjd=np.array(i.get('mjd'), dtype='float32')
        r,c=mjd.shape
        mjd.reshape(c,)
        
        band=np.array(  i.get('passband'), dtype='int32').reshape(c,) # passband
        flux=np.array( i.get('flux'), dtype='float32').reshape(c,) # flux
        flux_err=np.array( i.get('flux_err'), dtype='float32').reshape(c,) # flux_err
        detected=np.array( i.get('detected'), dtype='int32').reshape(c,) # Detected

        
        
        #band=np.array(  j[0][1], dtype='int32').reshape(c,) # passband
        #flux=np.array( j[0][2], dtype='float32').reshape(c,) # flux
        #flux_err=np.array( j[0][3], dtype='float32').reshape(c,) # flux_err
        #detected=np.array( j[0][4], dtype='int32').reshape(c,) # Detected

        mjd -= mjd[0]
        mjd /= 100 # Earth time shift in day*100
        mjd /= (z + 1) # Object time shift in day*100


        received_wavelength = passbands[band] # Earth wavelength in nm
        received_freq = 300000 / received_wavelength # Earth frequency in THz
        source_wavelength = received_wavelength / (z + 1) # Object wavelength in nm


        sample['band'] = band + 1

        sample['hist'] = np.zeros((flux.shape[0], 8), dtype='float32')
        sample['hist'][:,0] = mjd
        sample['hist'][:,1] = flux
        sample['hist'][:,2] = flux_err
        sample['hist'][:,3] = detected

        sample['hist'][:,6] = (source_wavelength/1000)
        sample['hist'][:,7] = (received_wavelength/1000)

        set_intervals(sample)

        flux_max = np.max(flux)
        flux_min = np.min(flux)
        flux_pow = math.log2(flux_max - flux_min)
        sample['hist'][:,1] /= math.pow(2, flux_pow)
        sample['hist'][:,2] /= math.pow(2, flux_pow)
        sample['meta'][9] = flux_pow / 10

        samples.append(sample)

        if len(samples) % 1000 == 0:
            print('Converting data {0}'.format(len(samples)), end='\r')

        if len(samples) >= limit:
            break



    print()
    return samples

In [17]:
def train_model(i, samples_train, samples_valid):
    start_augment=time.time()
    start_augmentCpu=time.clock()
    
    #samples_train += augmentate(samples_train, augment_count, augment_count)
    
    elapsed_augment=time.time() - start_augment
    elapsed_augmentCpu=time.clock() - start_augmentCpu

    patience = 1000000 // len(samples_train) + 5

    start_trainingVectors=time.time()
    start_trainingVectorsCpu=time.clock()

    train_x, train_y = get_keras_data(samples_train)

    elapsed_training_Vectors=time.time() - start_trainingVectors
    elapsed_training_VectorsCpu=time.clock() - start_trainingVectorsCpu

    print(len(samples_train))
    
    del samples_train
    
    start_validationVectors=time.time()
    start_validationVectorsCpu=time.clock()

    valid_x, valid_y = get_keras_data(samples_valid)
    del samples_valid
    
    elapsed_validation_Vectors=time.time() - start_validationVectors
    elapsed_validation_VectorsCpu=time.clock() - start_validationVectorsCpu
    
    return  elapsed_augment,elapsed_augmentCpu,\
            elapsed_training_Vectors,elapsed_training_VectorsCpu,\
            elapsed_validation_Vectors,elapsed_validation_VectorsCpu, \
            train_x, train_y

    
    ## THIS IS AS FAR AS WE NEED TO GO FOR THIS TEST


In [33]:
sqlContext.sql("REFRESH TABLE full_training_pivot")

DataFrame[]

In [34]:
print('Loading train data from hive...')

start_train=time.time()
start_trainCpu=time.clock()

raw_vectorsSQL="""
select 
    ts.object_id,
    md.target,
    array(md.ddf, md.hostgal_specz, md.hostgal_photoz, md.hostgal_photoz_err, md.mwebv, md.distmod) as meta,
    ts.mjd,
    ts.passband,
    ts.flux,
    ts.flux_err,
    ts.detected
from full_training_pivot ts
    inner join training_set_metadata md
        on ts.object_id = md.object_id
"""
raw_vectorsDF=sqlContext.sql(raw_vectorsSQL)

elapsed_train=time.time()-start_train
elapsed_trainCpu=time.clock()-start_trainCpu
wtable = get_wtable(raw_vectorsDF)

Loading train data from hive...


In [35]:
raw_vectorsDF.printSchema()

root
 |-- object_id: integer (nullable = true)
 |-- target: integer (nullable = true)
 |-- meta: array (nullable = false)
 |    |-- element: double (containsNull = true)
 |-- mjd: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: float (containsNull = true)
 |-- passband: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: integer (containsNull = true)
 |-- flux: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: float (containsNull = true)
 |-- flux_err: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: float (containsNull = true)
 |-- detected: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: integer (containsNull = true)



In [None]:
print(elapsed_train)
print(elapsed_trainCpu)

In [36]:
start_samples=time.time()
start_samplesC=time.clock()
    
samples =  get_data(raw_vectorsDF, \
                    extragalactic=None, use_specz=use_specz)

elapsed_samples=time.time() - start_samples
elapsed_samplesC=time.clock() - start_samplesC

6387941
102096317
253105
33860513
122628241
24751465
18749173
55976189
179877
109197025
96975445
10716233
19213
302689
117461
300441
197585
10115621
104381461
8629629
38407729
158573
7779381
97011617
16598333
320141
89980429
42689
116451501
85113493
76337325
19020589
115157
113914289
73489449
65562285
94514485
15727305
103851441
38832257
94954841
28853121
7409
298585
190577
109026809
50669045
92253169
61099873
270085
11160105
108055321
113074005
200161
82517333
19402817
151973
30545
77503441
42361501
41259789
128405
289945
27941
33424129
19496909
44149901
97957
68550949
271893
104397
68843297
77559633
235141
48839513
78729325
92798777
115213621
185785
315765
43345841
91337
23097889
9815437
40938993
44494697
106520041
24460057
88444541
58469213
47958757
35901933
103145
118968385
19557017
54883621
126386017
1594469
112156585
275009
64485
12147053
68656761
21330509
177697
31605
213157
119220981
42638097
104838165
207745
3883673
85790549
334749
323153
2204069
89733353
312453
67726657
16504

108422825
17183361
61749689
25351681
196221
98533
2110685
204989
66594085
37865
84357869
88404289
236205
129023853
65230573
262081
105164597
11025469
220685
52265845
116166781
289657
82160413
23623153
115053
55141
75090517
33037345
205449
22767201
114948753
75836645
72943445
57561
128369913
95810605
127374613
1904697
63637101
129845065
57205
19153333
110720057
117393
2317409
102605941
98038305
102692257
9215549
38380125
67256001
111245629
16907701
312565
57563181
110769373
30579629
88657813
106218977
88848741
101631565
101023493
117528645
340017
126347345
100097
229669
128389017
71627745
290589
96240657
74987849
280917
36938257
108771817
319093
24989
53673297
306349
110285813
126791777
25010781
74643673
108542869
257209
115771765
194509
72489
104396329
27222589
28978765
112060173
66586389
60983185
81448561
44578293
18457361
2352593
14698897
25529
11773
232681
76536433
115937
72337
42821845
188609
50217109
56654981
67112741
98118937
225529
43337
89911813
21635777
75757037
11688781
47356

7698
122596778
57013110
298798
112215886
27418086
19451710
128089118
92564906
7483838
293898
269814
333434
119182558
109242918
23067542
5237534
18460946
282638
76986398
118289958
39708342
305958
11415902
147214
198434
44637346
149130
54475110
99862
126374794
62604446
123618670
111660438
20148786
36689698
34750342
86934478
84758
8684410
3754758
81671326
43479814
283066
6927138
85144130
125242
81274374
318146
104498
56002762
266762
84773554
230922
106223882
52383458
16780730
124218782
93344982
130739978
70430
63051618
110099058
35463750
155110
125696158
108733170
119031102
90188238
40717486
80554090
14650534
115638
8667146
67211738
329326
129205942
115209058
237146
70541274
43933894
86490974
50429346
77632410
123737006
21169750
82157690
106903418
63777094
83704874
64854
83229482
129490
53808158
324114
335238
298802
77298174
61612630
279734
116920810
63468554
91705234
105107266
44277346
41764022
151694
95566
29838870
105767334
97725738
8834794
254618
111327134
58361162
116872334
25760694


186462
115969914
112415474
177970
84871886
71066710
96226194
123045682
233950
97906234
45461118
52916834
30978418
18703334
7532482
108160238
179834
296390
30431518
45939002
2056114
84010854
90321306
75093886
57043450
3166910
7566
113904322
4364990
260498
3419558
155778
73501938
109233382
101374
5174610
5648366
83634
86682986
67600070
108659894
116418066
87808950
266014
82447262
47348026
9688590
183210
5343694
88443230
291834
56539682
102772502
184530
58006418
111353970
9373678
128240086
69490
91994218
39488530
116818
129828542
77864390
28016462
38754
13850906
104468298
47726502
3829378
72364882
90562518
20447182
5222442
38568274
103162
122233366
290954
62230846
273326
40647942
61044710
118446342
44885482
730
25094602
50941150
61086594
14379202
116570
119506614
41464750
28282774
115647298
51232774
8186606
42366062
129937154
61220354
1598
8998750
81613006
51578826
3902206
126411394
14674
99638726
61994170
70702402
53647938
203758
136110
10331074
109786118
222506
118545866
206766
37208382

188831
23165475
17064403
22173531
8247543
35593703
172231
61069263
256899
246511
40246443
83024891
193431
73815467
123360355
8198259
86853599
96628879
104808447
101633495
129135543
72049251
103817167
206547
96338147
67962803
26839367
105653011
191847
80358295
76639
110444195
22813375
323155
118211
108716871
17865751
3745099
21539355
117018439
59183655
69767
35618243
9978123
18648923
88413067
114191
25003
35114427
43151
62517295
83071395
61700731
226855
109431067
42258259
52440447
95147
7190427
101679595
110448747
301907
99227595
41056547
215339
115915799
49195651
337743
115557395
91772215
23318423
177475
79754147
225919
113349371
56308175
239227
108487
4171
45516859
117655679
90399
22844503
128967
56593559
29011171
118291075
67616019
129175387
102551107
118285479
26808295
105130907
59498091
75375051
51411523
30558039
84755147
119215
83969535
254147
106363739
76929031
2298767
215159
102748747
116410559
313359
83084707
55724631
4830587
27840303
79743
113829239
95449339
88041263
75319011


67108107
71323295
121504515
311219
76950011
90105327
121765263
86705407
42585719
40162347
67279967
17596103
71620283
129296519
88287927
27047619
34976571
126441371
126866587
1952799
83195879
252115
103854959
20234539
33086563
49626003
69070587
75093083
30191
12138847
111601371
81078691
130678775
56651915
19841339
11810023
118955467
91335
113973531
119117463
55439147
108374759
125238759
121274615
107439
11284955
259447
115319335
14279
16477535
89455
102823
105206255
25110491
9203
127570499
125234535
142099
106471435
72796451
52740863
66999
51496907
83423091
67089327
48734543
97120903
12695
102417599
123367315
78199831
38075415
91291
32141863
98507931
101806291
128391675
123849855
41874247
43392755
38092995
65788059
48725835
128992591
126262179
117856179
27807303
7044535
289303
58323
65333415
13868291
1408779
95127
80863915
41356823
113044327
122129863
136407
26004183
53105135
28334351
15494091
239835
294059
105502451
299747
90222007
15541983
222095
111575039
45115
251079
199479
41776135

74618084
71068
9383516
46609480
110548280
239080
23848
86316940
117577580
253072
102560576
130188020
9936
94028932
112838400
123394632
91988
91420500
13698236
33827660
84716
263344
85780680
332684
38272236
128699652
290676
109863612
91767660
21425008
42999352
101363984
39331296
4311960
72391044
64576696
299436
68276
92113420
81449512
65427508
273804
20272112
59517412
157120
208704
31965152
10796
81833228
62384
80913604
341032
22720952
70055532
65978932
288128
89996660
106540496
75792
14619652
93834540
15723500
20722020
128167460
112109284
126564408
87945160
109178080
15741784
19319284
129335528
50120780
167488
11574036
76454672
88651372
19029976
67956884
63618392
128083908
130108924
100127808
38775504
37776
173708
46755264
79374000
95336452
118970396
44988848
52320160
67486956
83278224
259884
117104
119388820
122756636
35973028
20200052
77340
37465736
212064
97288684
74196652
265536
56021724
120661580
91022592
28007244
330300
123434604
91836804
95680720
202260
114633360
109318420
60374

In [None]:
from pyspark.sql.functions import lit, col
id=615

In [37]:
samples

[{'id': 6387941,
  'target': 11,
  'meta': array([0.       , 0.       , 0.       , 0.       , 0.       , 0.5737   ,
         0.014    , 0.031    , 1.       , 0.6446143], dtype=float32),
  'specz': 0.5788000226020813,
  'band': array([6, 4, 3, 3, 5, 4, 5, 2, 5, 5, 6, 6, 6, 4, 1, 1, 5, 4, 5, 6, 3, 3,
         5, 5, 3, 4, 4, 1, 4, 6, 5, 5, 6, 5, 1, 2, 6, 6, 6, 6, 5, 1, 5, 3,
         2, 4, 4, 4, 3, 5, 5, 3, 3, 3, 3, 6, 1, 2, 5, 2, 6, 5, 1, 4, 3, 4,
         4, 6, 5, 4, 1, 3, 6, 5, 4, 6, 5, 6, 3, 5, 5, 6, 2, 2, 5, 5, 1, 1,
         4, 6, 3, 4, 5, 5, 5, 2, 3, 3, 6, 4, 6, 6, 6, 4, 3, 1, 3, 6, 4, 6,
         6, 4, 2, 3, 1, 3, 5, 4, 4, 3, 2, 6, 1, 5, 5, 6, 5, 5, 6, 6, 5, 1],
        dtype=int32),
  'hist': array([[ 0.        ,  0.01689852,  0.28811815, ...,  0.        ,
           0.63798696,  1.004     ],
         [ 0.        ,  0.13304652,  0.07837378, ...,  0.        ,
           0.47912565,  0.754     ],
         [ 0.        ,  0.03698028,  0.0370082 , ...,  0.        ,
           0.394611

In [None]:
np.array( (raw_vectorsDF.select('hist.passband').filter(col("object_id") == lit(id)).collect()) )

In [None]:
#train_mjd_data=raw_vectorsDF.select('hist.mjd').filter(col("object_id") == lit(id)).toPandas()
start=time.time()
band=np.array(raw_vectorsDF.select('hist.passband').filter(col("object_id") == lit(id)).collect() , dtype='int32')
print("elapsed {}".format(str(time.time() - start)) )

In [None]:
start=time.time()
band=raw_vectorsDF.select('hist.passband').filter(col("object_id") == lit(id)).toPandas()
band=np.array(band)
print(band.dtype)
print("elapsed {}".format(str(time.time() - start)) )

In [None]:
band

In [None]:
#train_mjd_data=raw_vectorsDF.select('hist.mjd').filter(col("object_id") == lit(id)).toPandas()
start=time.time()
band=np.array(raw_vectorsDF.select('hist.passband').filter(col("object_id") == lit(id)).collect().toPandas() , dtype='int32') #.reshape(c,)
print("elapsed {}".format(str(time.time() - start)) )

In [None]:
x=raw_vectorsDF.select('hist.mjd').toPandas()

In [None]:
mjd.shape


In [None]:
print(elapsed_samples)
print(elapsed_samplesC)

In [None]:
for i in range(1, num_models+1):

    samples_train, samples_valid = train_test_split(samples, test_size=valid_size, random_state=42*i)
    len(samples_train)
    
    start_train=time.time()
    elapsed_augment,elapsed_augmentCpu,\
            elapsed_training_Vectors,elapsed_training_VectorsCpu,\
            elapsed_validation_Vectors,elapsed_validation_VectorsCpu, \
            train_x, train_y = \
            train_model(i, samples_train, samples_valid)
    elapsed_train=time.time()-start_train
    print(elapsed_train)
    #break

In [None]:
samples_train, samples_valid = train_test_split(samples, test_size=valid_size, random_state=42*1)

In [None]:
train_x, train_y = get_keras_data(samples_train)

In [None]:
X=train_x
Y=train_y

In [None]:
shape=X['hist'][0].shape
shape

In [None]:
X['meta'][0].shape

In [None]:
hist_input = Input(shape=X['hist'][0].shape, name='hist')
meta_input = Input(shape=X['meta'][0].shape, name='meta')
band_input = Input(shape=X['band'][0].shape, name='band')

In [None]:
band_emb = Embedding(8, 8)(band_input)

In [None]:
hist_input

In [None]:
band_emb

In [None]:
hist = concatenate([hist_input, band_emb])
hist = TimeDistributed(Dense(40, activation='relu'))(hist)


In [None]:
hist

# Stuff below is final vector analysis

In [None]:
raw_vectorsDF.count()

In [None]:
list_persons

#### Converting dataframes to dictionaries
https://stackoverflow.com/questions/41206255/convert-pyspark-sql-dataframe-dataframe-type-dataframe-to-dictionary

In [None]:
list_objects = map(lambda row: row.asDict(), raw_vectorsDF.collect())

In [None]:
object_vectors = {object['object_id']: object for object in list_objects}

In [None]:
samples=[]

In [None]:
use_specz=False

for key in object_vectors.keys():
    print(key)
    i=object_vectors.get(key)
    
    id=i.get('object_id')

    sample = {}
    sample['id'] = int(id)
    
    # 'object_id', 'target', 'meta', 'specz', 'band', 'hist'
    
    sample['target'] = int(i.get('target'))
    
    meta=np.array(i.get('meta'), dtype='float32')
  
    sample['meta'] = np.zeros(10, dtype = 'float32')

    sample['meta'][4] = meta[0]
    sample['meta'][5] = meta[2]
    sample['meta'][6] = meta[3]
    sample['meta'][7] = meta[4]
    sample['meta'][8] = float(meta[2]) > 0

    sample['specz'] = float(meta[1])    
    
    if use_specz:
        sample['meta'][5] = float(meta['hostgal_specz'])
        sample['meta'][6] = 0.0

    z = float(sample['meta'][5])

    j=i.get('hist')
    mjd=np.array(j[0][0], dtype='float32')
    r,c=mjd.shape
    mjd.reshape(c,)
    band=np.array(  j[0][1], dtype='int32').reshape(c,) # passband
    flux=np.array( j[0][2], dtype='float32').reshape(c,) # flux
    flux_err=np.array( j[0][3], dtype='float32').reshape(c,) # flux_err
    detected=np.array( j[0][4], dtype='int32').reshape(c,) # Detected

    mjd -= mjd[0]
    mjd /= 100 # Earth time shift in day*100
    mjd /= (z + 1) # Object time shift in day*100


    received_wavelength = passbands[band] # Earth wavelength in nm
    received_freq = 300000 / received_wavelength # Earth frequency in THz
    source_wavelength = received_wavelength / (z + 1) # Object wavelength in nm


    sample['band'] = band + 1

    sample['hist'] = np.zeros((flux.shape[0], 8), dtype='float32')
    sample['hist'][:,0] = mjd
    sample['hist'][:,1] = flux
    sample['hist'][:,2] = flux_err
    sample['hist'][:,3] = detected

    sample['hist'][:,6] = (source_wavelength/1000)
    sample['hist'][:,7] = (received_wavelength/1000)

    set_intervals(sample)
    
    flux_max = np.max(flux)
    flux_min = np.min(flux)
    flux_pow = math.log2(flux_max - flux_min)
    sample['hist'][:,1] /= math.pow(2, flux_pow)
    sample['hist'][:,2] /= math.pow(2, flux_pow)
    sample['meta'][9] = flux_pow / 10
    
    samples.append(sample)

    if len(samples) % 1000 == 0:
        print('Converting data {0}'.format(len(samples)), end='\r')

    if len(samples) >= limit:
        break
    
  

In [None]:
target

In [None]:
 np.where(classes == int(target))[0][0] # positional index of the classes array

In [None]:
i=object_vectors.get(615)

In [None]:
a[[0,1,3], :][:, [0,2]]  # Selects the columns you want as well

In [None]:
meta=meta=np.array(i.get('meta'), dtype='float32')
meta

In [None]:
meta[4]

In [None]:
j[0][0]

In [None]:
i=dict_persons.get(6266)

In [None]:
dict_persons.values()

In [None]:
i

In [None]:
[*i]

In [None]:
for key in i.keys():
  print(key)

In [None]:
i.get('object_id')

In [None]:
np.array(i.get('band')).shape

In [None]:
j=i.get('hist')

In [None]:
j

In [None]:
for row, sublist in enumerate(j):
    for column, item in enumerate(sublist):
        if item:
            print((row, column))
            


In [None]:
j[0][0] -- mjd

In [None]:
j[0][1] # passband

In [None]:
j[0][2] # flux

In [None]:
j[0][3] # flux_err

In [None]:
j[0][4] # Detected

In [None]:
for row, sublist in enumerate(k):
    for column, item in enumerate(sublist):
        if item:
            print((row, column))
            

In [None]:
k[0][1]

In [None]:

print(train_x['id'].shape)
print(train_x['meta'].shape)
print(train_x['band'].shape)
print(train_x['hist'].shape)

OK, so for hist - each row, we have 8 arrays palanced out to 256 entries - pad stuff! and the arrays are 0 mjd 1 flux 2 flux_err 3 etc 4 etc 5 etc 6 7

so 

- train_x['hist'][0,:,0] is mjd
- train_x['hist'][0,:,1] is flux
- train_x['hist'][0,:,2] is flux_err
- train_x['hist'][0,:,3] is detected
- train_x['hist'][0,:,4] is mjd_deltas
- train_x['hist'][0,:,5] is mjd_reverse_deltas
- train_x['hist'][0,:,6] is source_wavelength
- train_x['hist'][0,:,7] is received_wavelength

Just remember that in get_keras_data, mjd, detected and received_wavelength values are removed and set to zero.

In [None]:
train_x['id'][0]

In [None]:
train_x['hist'][0,:,4]

In [None]:
mjd=[1,2,4,8,16,32,64]

mjd1=np.ediff1d(mjd, to_begin = [0]) - mjd deltas between elements
mjd2=np.ediff1d(mjd, to_end = [0]) - reverse mjd from end

#hist[:,4] = np.ediff1d(hist[:,0], to_begin = [0])
#hist[:,5] = np.ediff1d(hist[:,0], to_end = [0])

In [None]:
fullSetSQL="""
select * from training_set_all_padded
"""

In [None]:
fullDF =sqlContext.sql(fullSetSQL)

In [None]:
fullDF.printSchema()

In [None]:
fullDF.count()

In [None]:
X = {
        'id': np.array(fullDF.select('object_id').collect(), dtype='int32'),
        'meta': np.array(fullDF.select('meta').collect(), dtype='float32'),
        'band': np.array(fullDF.select('band').collect() , dtype='int32').reshape(7848,256)
#        'hist': pad_sequences([i['hist'] for i in itemslist], maxlen=sequence_len, dtype='float32'),
    }
#print('creating Y')
#Y = to_categorical([i['target'] for i in itemslist], num_classes=len(classes))

In [None]:
X['id'].shape

In [None]:
histTest=np.array(fullDF.select('hist.interval').collect(), dtype='float32').reshape(7848,256)

In [None]:
histTest.shape

In [None]:
fluxTest=np.array(fullDF.select('hist.flux').collect(), dtype='float32').reshape(7848,256)
flux_err_test=np.array(fullDF.select('hist.flux_err').collect(), dtype='float32').reshape(7848,256)

In [None]:
flux_err_test.shape

In [None]:
histArray=np.zeros((7848,256,8), dtype='float32') 
# this will work brilliantly as get_keras_data sets three columns to zeros anyway

In [None]:
histArray[:,:,0]=histTest
histArray[:,:,1]=fluxTest
histArray[:,:,1]=flux_err_test

In [None]:
histArray.shape

In [None]:
targetArray=fluxTest=np.array(fullDF.select('target').collect(), dtype='int32')

In [None]:
targetArray.shape

In [None]:
classes

In [None]:
len(classes)

In [None]:
np.where(classes == 6)

In [None]:
Y = to_categorical(3, num_classes=len(classes))

In [None]:
Y

In [None]:
fullDF.select('meta').collect()

# Experimental - testing mods for get data below

In [None]:
extragalactic=None

In [None]:
samples = []
# You have to perform an aggregation on the Spark dataframe and collect the results before you can iterate 
# This i not necessary with pandas
groups = train_mdf_data.groupby('object_id')


In [None]:
for g in groups:
    id=g[0]
    
    sample = {}
    sample['id'] = int(id)
    
    meta = train_meta.loc[train_meta['object_id'] == id]
    
    # NEW We need to get the arrays for each pivoted dataframe
    #mjd = train_mjd_data.loc[train_mjd_data['object_id'] == id]
    #passband=train_passband_data.loc[train_passband_data['object_id'] == id]
    #flux=train_flux_data.loc[train_flux_data['object_id'] == id]
    #flux_err=train_flux_err_data.loc[train_flux_err_data['object_id'] == id]
    #detected=train_detected_data.loc[train_detected_data['object_id'] == id]
    
    if extragalactic == True and float(meta['hostgal_photoz']) == 0:
        continue

    if extragalactic == False and float(meta['hostgal_photoz']) > 0:
        continue    
    
    if 'target' in meta:
        sample['target'] = np.where(classes == int(meta['target']))[0][0]
    else:
        sample['target'] = len(classes) - 1   

    sample['meta'] = np.zeros(10, dtype = 'float32')

    sample['meta'][4] = meta['ddf']
    sample['meta'][5] = meta['hostgal_photoz']
    sample['meta'][6] = meta['hostgal_photoz_err']
    sample['meta'][7] = meta['mwebv']
    sample['meta'][8] = float(meta['hostgal_photoz']) > 0

    sample['specz'] = float(meta['hostgal_specz'])

    
    if use_specz:
        sample['meta'][5] = float(meta['hostgal_specz'])
        sample['meta'][6] = 0.0

    z = float(sample['meta'][5])
    
    # we need to drop the object_id from the pivot records. We can use any of the pivot dataframes,
    # because they all have the same shape, coming from a Hive table definition. We'll use the MJD
    # pivot dataframe to set up the indexes we want. How we do this - in stages
    # 1. Create a data frame for the pivot dataframes, on for each object_id
    # 2. Use dropna to remove NAN column values
    # 3. Cast that dataframe to a numpy array and get the shape
    # 4. Use the mjd array as a base, create an index list of the columns we want - ie we're dropping the object_id
    # 5. Use the index ro truncate the object_id column from the rest of the arrays
    # 6. finally, we need to reshape the arrays from [1;cols] to [cols,]   
    
    mjd = np.array(train_mjd_data.loc[train_mjd_data['object_id'] == id].dropna(axis='columns'), dtype='float32')
    r,c=mjd.shape

    idx_OUT_columns = [0]
    idx_IN_columns = [i for i in range(np.shape(mjd)[1]) if i not in idx_OUT_columns]

    mjd = mjdArray[:,idx_IN_columns].reshape(c-1,)
    band = np.array(train_passband_data.loc[train_passband_data['object_id'] == id].dropna(axis='columns') , dtype='int32')[:,idx_IN_columns].reshape(c-1,)
    flux = np.array(train_flux_data.loc[train_flux_data['object_id'] == id].dropna(axis='columns') , dtype='float32')[:,idx_IN_columns].reshape(c-1,)
    flux_err = np.array(train_flux_err_data.loc[train_flux_err_data['object_id'] == id].dropna(axis='columns') , dtype='float32')[:,idx_IN_columns].reshape(c-1,)
    detected = np.array(train_detected_data.loc[train_detected_data['object_id'] == id].dropna(axis='columns') , dtype='float32')[:,idx_IN_columns].reshape(c-1,)
    #mjd      = np.array(g[1]['mjd'],      dtype='float32')
    #band     = np.array(g[1]['passband'], dtype='int32')
    #flux     = np.array(g[1]['flux'],     dtype='float32')
    #flux_err = np.array(g[1]['flux_err'], dtype='float32')
    #detected = np.array(g[1]['detected'], dtype='float32')  

    
    mjd -= mjd[0]
    mjd /= 100 # Earth time shift in day*100
    mjd /= (z + 1) # Object time shift in day*100

    
    received_wavelength = passbands[band] # Earth wavelength in nm
    received_freq = 300000 / received_wavelength # Earth frequency in THz
    source_wavelength = received_wavelength / (z + 1) # Object wavelength in nm

    
    sample['band'] = band + 1

    sample['hist'] = np.zeros((flux.shape[0], 8), dtype='float32')
    sample['hist'][:,0] = mjd
    sample['hist'][:,1] = flux
    sample['hist'][:,2] = flux_err
    sample['hist'][:,3] = detected

    sample['hist'][:,6] = (source_wavelength/1000)
    sample['hist'][:,7] = (received_wavelength/1000)
    
    set_intervals(sample)


    flux_max = np.max(flux)
    flux_min = np.min(flux)
    flux_pow = math.log2(flux_max - flux_min)
    sample['hist'][:,1] /= math.pow(2, flux_pow)
    sample['hist'][:,2] /= math.pow(2, flux_pow)
    sample['meta'][9] = flux_pow / 10
    
    samples.append(sample)

    if len(samples) % 1000 == 0:
        print('Converting data {0}'.format(len(samples)), end='\r')

    if len(samples) >= limit:
        break


In [None]:
sample

In [None]:
id=713
print(id)
sample = {}
sample['id'] = int(id)

meta = train_meta.loc[train_meta['object_id'] == id]

# NEW We need to get the arrays for each pivoted dataframe
#mjd = train_mjd_data.loc[train_mjd_data['object_id'] == id]
#passband=train_passband_data.loc[train_passband_data['object_id'] == id]
#flux=train_flux_data.loc[train_flux_data['object_id'] == id]
#flux_err=train_flux_err_data.loc[train_flux_err_data['object_id'] == id]
#detected=train_detected_data.loc[train_detected_data['object_id'] == id]

if extragalactic == True and float(meta['hostgal_photoz']) == 0:
    print('Hi there')

if extragalactic == False and float(meta['hostgal_photoz']) > 0:
    print('Hi there again')  

if 'target' in meta:
    sample['target'] = np.where(classes == int(meta['target']))[0][0]
else:
    sample['target'] = len(classes) - 1   

sample['meta'] = np.zeros(10, dtype = 'float32')

sample['meta'][4] = meta['ddf']
sample['meta'][5] = meta['hostgal_photoz']
sample['meta'][6] = meta['hostgal_photoz_err']
sample['meta'][7] = meta['mwebv']
sample['meta'][8] = float(meta['hostgal_photoz']) > 0

sample['specz'] = float(meta['hostgal_specz'])


if use_specz:
    sample['meta'][5] = float(meta['hostgal_specz'])
    sample['meta'][6] = 0.0

z = float(sample['meta'][5])

# we need to drop the object_id from the pivot records. We can use any of the pivot dataframes,
# because they all have the same shape, coming from a Hive table definition. We'll use the MJD
# pivot dataframe to set up the indexes we want. How we do this - in stages
# 1. Create a data frame for the pivot dataframes, on for each object_id
# 2. Use dropna to remove NAN column values
# 3. Cast that dataframe to a numpy array and get the shape
# 4. Use the mjd array as a base, create an index list of the columns we want - ie we're dropping the object_id
# 5. Use the index ro truncate the object_id column from the rest of the arrays
# 6. finally, we need to reshape the arrays from [1;cols] to [cols,],
mjd = np.array(train_mjd_data.loc[train_mjd_data['object_id'] == id].dropna(axis='columns'), dtype='float32')
r,c=mjd.shape

idx_OUT_columns = [0]
idx_IN_columns = [i for i in range(np.shape(mjd)[1]) if i not in idx_OUT_columns]

mjd = mjdArray[:,idx_IN_columns].reshape(c-1,)
band = np.array(train_passband_data.loc[train_passband_data['object_id'] == id].dropna(axis='columns') , dtype='int32')[:,idx_IN_columns].reshape(c-1,)
flux = np.array(train_flux_data.loc[train_flux_data['object_id'] == id].dropna(axis='columns') , dtype='float32')[:,idx_IN_columns].reshape(c-1,)
flux_err = np.array(train_flux_err_data.loc[train_flux_err_data['object_id'] == id].dropna(axis='columns') , dtype='float32')[:,idx_IN_columns].reshape(c-1,)
detected = np.array(train_detected_data.loc[train_detected_data['object_id'] == id].dropna(axis='columns') , dtype='float32')[:,idx_IN_columns].reshape(c-1,)

#mjd      = np.array(g[1]['mjd'],      dtype='float32')
#band     = np.array(g[1]['passband'], dtype='int32')
#flux     = np.array(g[1]['flux'],     dtype='float32')
#flux_err = np.array(g[1]['flux_err'], dtype='float32')
#detected = np.array(g[1]['detected'], dtype='float32')  

# Now we need to reshape to columns
#mjd=mjd.reshape(352,)

mjd -= mjd[0]
mjd /= 100 # Earth time shift in day*100
mjd /= (z + 1) # Object time shift in day*100




In [None]:

received_wavelength = passbands[band] # Earth wavelength in nm
received_freq = 300000 / received_wavelength # Earth frequency in THz
source_wavelength = received_wavelength / (z + 1) # Object wavelength in nm

In [None]:
hiThere = train_mjd_data.loc[train_mjd_data['object_id'] == id]

In [None]:
train_mjd_data.

In [None]:
hiThere

In [None]:
hiThere.dropna(axis='columns')

In [None]:
mjd = np.array(train_mjd_data.loc[train_mjd_data['object_id'] == id].dropna(axis='columns'), dtype='float32')

In [None]:
r,c=mjd.shape

In [None]:
r

In [None]:
c