In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import math
import copy
import random
import time
import sys

from pyspark import SparkConf,SparkContext
from pyspark.sql import Row, SQLContext, SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors

In [2]:
from keras.layers import *
from keras.models import Model, load_model
from keras.optimizers import Adam, Nadam, SGD
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [3]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix

In [4]:
import tensorflow as tf

In [5]:
sqlContext = SQLContext(sc)

In [6]:
sqlContext.sql("use plasticc")

DataFrame[]

In [7]:
augment_count = 25
batch_size = 1000
batch_size2 = 5000
optimizer = 'nadam'
num_models = 1
use_specz = False
valid_size = 0.1
max_epochs = 1000

limit = 1000000
sequence_len = 256

In [8]:
classes = np.array([6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95, 99], dtype='int32')
class_names = ['class_6','class_15','class_16','class_42','class_52','class_53','class_62','class_64','class_65','class_67','class_88','class_90','class_92','class_95','class_99']
class_weight = {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1, 99: 1}

# LSST passbands (nm)  u    g    r    i    z    y      
passbands = np.array([357, 477, 621, 754, 871, 1004], dtype='float32')

In [9]:
def append_data(list_x, list_y = None):
    X = {}
    for k in list_x[0].keys():

        list = [x[k] for x in list_x]
        X[k] = np.concatenate(list)

    if list_y is None:
        return X
    else:
        return X, np.concatenate(list_y)

In [10]:
def get_wtable(df):
    #x=np.array(raw_vectorsDF.select('target').collect())
    
    all_y = np.array(df.select('target').collect(), dtype = 'int32')

    y_count = np.unique(all_y, return_counts=True)[1]

    wtable = np.ones(len(classes))

    for i in range(0, y_count.shape[0]):
        wtable[i] = y_count[i] / all_y.shape[0]

    return wtable

In [11]:
def get_keras_data(itemslist):

    keys = itemslist[0].keys()
    print('creating X')
    X = {
            'id': np.array([i['id'] for i in itemslist], dtype='int32'),
            'meta': np.array([i['meta'] for i in itemslist]),
            'band': pad_sequences([i['band'] for i in itemslist], maxlen=sequence_len, dtype='int32'),
            'hist': pad_sequences([i['hist'] for i in itemslist], maxlen=sequence_len, dtype='float32'),
        }
    print('creating Y')
    Y = to_categorical([i['target'] for i in itemslist], num_classes=len(classes))

    X['hist'][:,:,0] = 0 # remove abs time
#    X['hist'][:,:,1] = 0 # remove flux
#    X['hist'][:,:,2] = 0 # remove flux err
    X['hist'][:,:,3] = 0 # remove detected flag
#    X['hist'][:,:,4] = 0 # remove fwd intervals
#    X['hist'][:,:,5] = 0 # remove bwd intervals
#    X['hist'][:,:,6] = 0 # remove source wavelength
    X['hist'][:,:,7] = 0 # remove received wavelength

    return X, Y

In [12]:
def set_intervals(sample):

    hist = sample['hist']
    band = sample['band']

    hist[:,4] = np.ediff1d(hist[:,0], to_begin = [0])
    hist[:,5] = np.ediff1d(hist[:,0], to_end = [0])

In [13]:
def copy_sample(s, augmentate=True):
    c = copy.deepcopy(s)

    if not augmentate:
        return c

    band = []
    hist = []

    drop_rate = 0.3

    # drop some records
    for k in range(s['band'].shape[0]):
        if random.uniform(0, 1) >= drop_rate:
            band.append(s['band'][k])
            hist.append(s['hist'][k])

    c['hist'] = np.array(hist, dtype='float32')
    c['band'] = np.array(band, dtype='int32')

    set_intervals(c)
            
    new_z = random.normalvariate(c['meta'][5], c['meta'][6] / 1.5) # hostgal_photoz and hostgal_photoz_err
    new_z = max(new_z, 0)
    new_z = min(new_z, 5)

    dt = (1 + c['meta'][5]) / (1 + new_z) # hostgal_photoz
    c['meta'][5] = new_z

    # augmentation for flux
    c['hist'][:,1] = np.random.normal(c['hist'][:,1], c['hist'][:,2] / 1.5) # flux and flux_err

    # multiply time intervals and wavelength to apply augmentation for red shift
    c['hist'][:,0] *= dt
    c['hist'][:,4] *= dt
    c['hist'][:,5] *= dt
    c['hist'][:,6] *= dt

    return c

In [14]:
def normalize_counts(samples, wtable, augmentate):
    maxpr = np.max(wtable)
    counts = maxpr / wtable

    res = []
    index = 0
    for s in samples:

        index += 1
        print('Normalizing {0}/{1}   '.format(index, len(samples)), end='\r')

        res.append(s)
        count = int(3 * counts[s['target']]) - 1

        for i in range(0, count):
            res.append(copy_sample(s, augmentate))

    print()

    return res

In [15]:
def augmentate(samples, gl_count, exgl_count):

    res = []
    index = 0
    for s in samples:

        index += 1
        
        if index % 1000 == 0:
            print('Augmenting {0}/{1}   '.format(index, len(samples)), end='\r')

        count = gl_count if (s['meta'][8] == 0) else exgl_count

        for i in range(0, count):
            res.append(copy_sample(s))

    print()
    return res

In [16]:
def get_data(raw_vectors_df, extragalactic=None, use_specz=False):

    samples = []
    list_objects = map(lambda row: row.asDict(), raw_vectorsDF.collect())
    object_vectors = {object['object_id']: object for object in list_objects}


    for key in object_vectors.keys():
        print(key)
        i=object_vectors.get(key)

        id=i.get('object_id')

        sample = {}
        sample['id'] = int(id)

        # 'object_id', 'target', 'meta', 'specz', 'band', 'hist'

        #sample['target'] = int(i.get('target'))
        sample['target'] = np.where(classes == int(i.get('target')))[0][0] # positional index of the classes array

        meta=np.array(i.get('meta'), dtype='float32')

        sample['meta'] = np.zeros(10, dtype = 'float32')

            #sample['meta'][4] = meta['ddf']					from meta column array meta[0]
            #sample['meta'][5] = meta['hostgal_photoz']			from meta column array meta[2]
            #sample['meta'][6] = meta['hostgal_photoz_err']		from meta column array meta[3]
            #sample['meta'][7] = meta['mwebv']					from meta column array meta[4]
            #sample['meta'][8] = float(meta['hostgal_photoz']) > 0  returns True or false

            #sample['specz'] = float(meta['hostgal_specz'])		from meta column array meta[1]


        sample['meta'][4] = meta[0]
        sample['meta'][5] = meta[2]
        sample['meta'][6] = meta[3]
        sample['meta'][7] = meta[4]
        sample['meta'][8] = float(meta[2]) > 0

        sample['specz'] = float(meta[1])    

        if use_specz:
            sample['meta'][5] = float(meta['hostgal_specz'])
            sample['meta'][6] = 0.0

        z = float(sample['meta'][5])

        
        ### NOTE: If the Hive table has been created with this statement as the collect
        # 'mjd',                  array( collect_list(a.kv['mjd']) ),
        # You need to reshape it because the shape will come back (1, columns)
        # whereas if you do this collect_list(a.kv['mjd'])
        # You don't need to do the reshape.
        # The table used in this example, training_raw_vectors_unpadded_no_calcs was craeted with the
        # second context. See the creation notebook, 
        # "FINAL - Create Training Set Raw Vectors - no calculations, unpadded.ipynb"
        # Compare to the code we use for the table training_set_raw_vectors
        
        
        j=i.get('hist')
        mjd=np.array(j[0][0], dtype='float32')
        
        #mjd.reshape(c,)
        
        band=np.array(  j[0][1], dtype='int32') #.reshape(c,) # passband
        flux=np.array( j[0][2], dtype='float32') #.reshape(c,) # flux
        flux_err=np.array( j[0][3], dtype='float32') #.reshape(c,) # flux_err
        detected=np.array( j[0][4], dtype='int32') #.reshape(c,) # Detected

        mjd -= mjd[0]
        mjd /= 100 # Earth time shift in day*100
        mjd /= (z + 1) # Object time shift in day*100


        received_wavelength = passbands[band] # Earth wavelength in nm
        received_freq = 300000 / received_wavelength # Earth frequency in THz
        source_wavelength = received_wavelength / (z + 1) # Object wavelength in nm


        sample['band'] = band + 1

        sample['hist'] = np.zeros((flux.shape[0], 8), dtype='float32')
        sample['hist'][:,0] = mjd
        sample['hist'][:,1] = flux
        sample['hist'][:,2] = flux_err
        sample['hist'][:,3] = detected

        sample['hist'][:,6] = (source_wavelength/1000)
        sample['hist'][:,7] = (received_wavelength/1000)

        set_intervals(sample)

        flux_max = np.max(flux)
        flux_min = np.min(flux)
        flux_pow = math.log2(flux_max - flux_min)
        sample['hist'][:,1] /= math.pow(2, flux_pow)
        sample['hist'][:,2] /= math.pow(2, flux_pow)
        sample['meta'][9] = flux_pow / 10

        samples.append(sample)

        if len(samples) % 1000 == 0:
            print('Converting data {0}'.format(len(samples)), end='\r')

        if len(samples) >= limit:
            break



    print()
    return samples

In [17]:
def train_model(i, samples_train, samples_valid):
    start_augment=time.time()
    start_augmentCpu=time.clock()
    
    #samples_train += augmentate(samples_train, augment_count, augment_count)
    
    elapsed_augment=time.time() - start_augment
    elapsed_augmentCpu=time.clock() - start_augmentCpu

    patience = 1000000 // len(samples_train) + 5

    start_trainingVectors=time.time()
    start_trainingVectorsCpu=time.clock()

    train_x, train_y = get_keras_data(samples_train)

    elapsed_training_Vectors=time.time() - start_trainingVectors
    elapsed_training_VectorsCpu=time.clock() - start_trainingVectorsCpu

    print(len(samples_train))
    
    del samples_train
    
    start_validationVectors=time.time()
    start_validationVectorsCpu=time.clock()

    valid_x, valid_y = get_keras_data(samples_valid)
    del samples_valid
    
    elapsed_validation_Vectors=time.time() - start_validationVectors
    elapsed_validation_VectorsCpu=time.clock() - start_validationVectorsCpu
    
    return  elapsed_augment,elapsed_augmentCpu,\
            elapsed_training_Vectors,elapsed_training_VectorsCpu,\
            elapsed_validation_Vectors,elapsed_validation_VectorsCpu, \
            train_x, train_y

    
    ## THIS IS AS FAR AS WE NEED TO GO FOR THIS TEST


In [18]:
print('Loading train data from hive...')

start_train=time.time()
start_trainCpu=time.clock()

raw_vectorsSQL="""
select * from training_raw_vectors_unpadded_no_calcs
"""

raw_vectorsDF=sqlContext.sql(raw_vectorsSQL)

elapsed_train=time.time()-start_train
elapsed_trainCpu=time.clock()-start_trainCpu
#wtable = get_wtable(raw_vectorsDF)

Loading train data from hive...


In [19]:
raw_vectorsDF.printSchema()

root
 |-- object_id: integer (nullable = true)
 |-- target: integer (nullable = true)
 |-- meta: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- specz: double (nullable = true)
 |-- band: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- hist: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- mjd: array (nullable = true)
 |    |    |    |-- element: float (containsNull = true)
 |    |    |-- flux: array (nullable = true)
 |    |    |    |-- element: float (containsNull = true)
 |    |    |-- flux_err: array (nullable = true)
 |    |    |    |-- element: float (containsNull = true)
 |    |    |-- detected: array (nullable = true)
 |    |    |    |-- element: float (containsNull = true)



In [20]:
##### DEBUG CELL!!!
samples =  get_data(raw_vectorsDF, \
                    extragalactic=None, use_specz=use_specz)

113335


IndexError: tuple index out of range

In [21]:
list_objects = map(lambda row: row.asDict(), raw_vectorsDF.collect())
object_vectors = {object['object_id']: object for object in list_objects}

In [22]:
for key in object_vectors.keys():
    print(key)

113335
175907
116147363
86418911
93834540
115079
48687
57058720
96338147
39523895
26205971
46967013
23602620
244159
79044967
94410053
68535149
82246622
36842485
59463
100551111
35005260
77823479
4534339
276457
18706
61069263
14988203
127378756
21704379
77306
41941000
51139333
125743
22720952
64889871
22694163
4548783
185839
86331608
4132
101652534
45790897
207745
115319335
85490
62131858
237313
314331
281110
44168220
41498244
158241
331321
7550910
58112745
51726360
110750341
130062639
88690264
92113420
35935438
78705
40838478
105613898
67256001
45225317
46962483
4931712
112097340
38075415
36706333
29774433
81192693
304176
6231563
242991
50669045
84251956
53655507
106115608
36551205
107668196
82695182
105285783
329089
10691057
303024
99041421
14982938
48473
99523947
283565
18604382
37414787
19887645
80301338
118455
123737006
45115
122582076
106075180
198049
31116320
23822
92347595
99715839
79155
175105
25793380
61444402
28210173
67131527
47769908
74512775
329627
54679871
56927863
102599

99886885
118886900
284171
115634018
87166131
92577
97538098
56334
2041005
322627
65873581
35114427
78407037
8383041
20146105
122596778
121274615
19154
10840631
88287927
44637346
107448991
92412349
15758096
155613
41673236
44413022
82834152
8688
152453
128451231
213934
26714405
107807203
128527504
63743124
125232937
94949090
36153
100560232
37661
44867066
127624949
54098685
142099
107731534
75901991
127779559
32503295
71627745
82160413
119357431
129459169
83081946
75030174
86608376
62187
329562
161988
60868379
302209
1950799
6460481
18151280
6125178
1215304
15475
108794725
67691418
109968805
17064403
101619848
83071395
3166910
117393
151462
86318859
96715108
64218010
73815467
66209070
91836004
18952
90098804
19582731
284802
45319
45576088
80067866
85439795
8328
74893050
37436066
101460016
91270103
175824
90750519
87842606
66999
24499337
114805967
294235
41526429
54469285
238605
97725738
302887
29599343
41179699
10495026
61014952
291304
26531
325297
177475
2362087
94924634
78344937
53143

80804933
49207610
253633
120055806
44309
78741329
3363402
40622049
187783
56173869
288920
47148
111079035
66664197
334851
5562789
17963899
215085
80968556
108305965
158507
15733808
34750342
151973
121051157
59948995
4389
53856704
94273668
185605
46455748
235478
69727089
123996080
117500992
61763
49783859
9172
110609484
108156870
121182
75562
54475110
130595291
105456912
71954
118681647
29000971
64006420
7390884
44836
229781
111799
99593374
17370
245974
142867
84716
93009853
119264179
83016336
128439785
199909
85892144
68210498
114139223
123234980
13537416
65444493
18211253
46481362
267911
108749436
83227942
90180482
109477227
3679620
92758794
14708119
121536777
87248837
36585313
63718
68996599
130489916
74943698
181003
98224913
53665047
89278328
24105587
280954
246096
97529236
34161124
16775731
80010339
57043450
13868291
50957767
330252
6849093
45020328
23294332
90739539
86813047
272367
99227595
87216787
260564
92436396
107712
7779381
35337692
71126
92577210
107784644
43151
29241258
83

123693032
257526
27339
111997875
77655020
9240224
95744114
16660550
34401716
25039
33930630
102990543
124280399
88378172
14553
62590415
71299170
11697012
17183361
110418764
118048579
88492912
87631226
107193
157477
324114
28391
91291
14958598
62428500
86068216
22665718
19215948
135097
59580
3883673
82359069
63675288
112354774
48246321
42993568
170133
103244674
58582922
19496909
268977
12147053
30201771
62622592
75499040
130263372
4190006
128953634
20202637
106384945
8848545
87712686
99390548
83423091
340434
129687910
125238759
6556610
80719023
91219
155110
117510131
268745
282659
232356
129637
6616638
23539
77933029
284997
290676
64956496
11332773
78811388
66325
39414345
3945645
122303146
135054
75218022
77825
62352511
35197
13067683
108101311
213128
10586
112049038
73367509
27865279
79921
301369
41570932
114670
143651
29587925
165406
77447252
38573605
91610803
51832424
237982
126441371
45833275
119182558
80672768
328488
56021724
121440
70988624
54045253
61556769
67875060
62230846
9978

6103303
74484351
42857334
112886
83647217
120717385
89872627
123360355
128149778
62341610
51255215
17559797
53498362
30398950
90153329
119598768
233523
111518007
115647298
98658462
244228
179685
21514821
110852184
121615670
6534811
57813649
98325702
57249593
46804
35268410
88228019
7602456
258452
28238969
51493116
16439973
54063245
72283619
175185
35947297
24150823
110769373
16979142
250327
67713721
95595375
41462721
11225240
1306267
21920642
75333688
56837447
117018439
45015918
109516
273938
27753416
47784490
180224
31001297
97887024
77298174
41150627
33370936
29865365
94935361
101631565
28710479
160048
294841
56287702
62404564
120526977
115792
101298
161411
116418066
44638358
328060
54000531
115535088
275128
86001230
62698558
103539300
3829378
124964703
10343540
253844
271394
36659563
90663647
105830156
85645570
57372172
77632410
1904697
43211
332985
12666732
46741316
95866530
70611740
115989107
57958087
110570640
265397
80852
55025653
86237249
117528645
44358143
78209099
39079234
34

261542
74752477
191334
103995649
66221774
69214098
63333778
26385586
49620169
77149421
126262179
200949
38730
46329774
267638
56594928
266405
165821
89563521
21263072
26109323
25576261
102134564
65925566
8056979
188405
196121
9883365
94954841
91691453
85870170
42916400
17346850
35513155
51833345
90799016
18115589
152079
38775504
47767011
21539355
9184
209987
97759814
12009081
98462441
106057072
20004752
71740669
22573927
74994691
92929
47829157
85713794
335507
77746327
97406
230026
42082198
58361162
80644071
4988488
89325014
30534719
86658359
115053
54659129
8667146
85209628
79743
100017743
66385558
121016694
25094602
9810940
50996359
92114915
83066602
75349007
95284396
23221800
113914289
213496
64674300
73031
121069321
66103495
64021710
1135692
8917917
5936751
76835764
76804558
153539
258888
43434496
77010
55182054
64226016
235402
2574012
3715909
42638097
66548
102302473
27941
103854959
204697
67834423
58143636
195975
101376210
112061925
301956
2204069
100928881
102043319
97957
494268

33240187
97546811
10710732
65877
30431518
46292232
119035658
73610
2123138
129832862
47061218
29227269
306349
136949
28843
129336792
69703704
274913
55141
37830666
38174
43615371
78291971
261127
110721898
38754
301907
14279
70351780
261863
75186008
138553
46880215
20234539
269454
9688590
61134104
111271820
200061
3366417
102613703
43009792
27519637
5763072
95508
79286628
2317409
78256436
86490974
94229
306051
33652637
120356
90188238
105164118
129759341
110255618
11540263
34166
118053364
109233382
49389
232681
309772
45599848
42366062
331957
122335762
118446342
92728380
331996
64301948
34746698
7160570
278971
39894363
1228479
110270
106218977
84724459
243552
70816
154762
129329222
114680258
55614977
99507753
220000
126927279
14810920
107981524
44787296
2922
10120851
47954318
73693128
88723725
7312772
67589353
117656667
119547853
37358191
68550949
78691188
280437
115381923
9730644
69629671
102864430
51496907
75249509
34344225
73196956
52439522
98720151
79374000
301257
112215886
228011
2

In [23]:
i=object_vectors.get(113335)

In [24]:
id=i.get('object_id')

In [26]:
j=i.get('hist')

In [28]:
b=i.get('band')

In [29]:
band=np.array(i.get('band') , dtype='int32')

In [32]:
detected=np.array( j[0][3], dtype='int32')

#### DEBUG CELLS ABOVE!!

In [None]:
print(elapsed_train)
print(elapsed_trainCpu)

In [38]:
start_samples=time.time()
start_samplesC=time.clock()
    
samples =  get_data(raw_vectorsDF, \
                    extragalactic=None, use_specz=use_specz)
#mjd, mjdShape =  get_data(raw_vectorsDF, \
#                    extragalactic=None, use_specz=use_specz)


elapsed_samples=time.time() - start_samples
elapsed_samplesC=time.clock() - start_samplesC

77306
90399
108229
108888
120927
145675
156386
161591
172231
199112
230026
236106
264774
267911
278959
331321
338456
2960700
9839433
10208103
11415902
17200070
17346850
18633595
18914374
24531840
28710479
34215867
35901998
44544601
47123160
47767011
47982331
57813649
58244212
59822050
63356130
65031744
79602103
84451315
86944825
87268787
87939927
90390292
93673446
104481118
106195942
107488869
117237294
118664463
125696158
126078896
129045287
129773611
745
4132
4173
25003
36362
55419
77010
99280
109937
216087
222095
224205
226847
231903
246788
304176
339800
9248706
11194263
13457941
27840303
29933389
30293921
31307707
31884982
40057040
42566591
45227109
53345639
53627887
54774214
55417986
55828424
56826212
64301948
66837274
70425954
73045012
85466349
89872627
92728380
94098306
99906386
105433334
110609484
118291075
119178558
120864200
121456044
123049303
124702871
126616065
129073978
52854
78233
79235
87685
115079
176303
183283
193610
200471
205459
211092
233042
233524
246012
257007
29

319727
336032
10788634
11295412
13791759
22709656
26606820ng data 1000
26928003
30912889
30927482
32924992
35253243
41832206
42485979
44864629
45015918
56634166
60447309
60769248
62604446
64576696
67240327
69998442
81274374
91327966
97110605
98093823
99366334
105739240
106262435
108156870
108749436
111722655
118757157
119879213
120808014
130595291
1598
1632
6762
17172
111281
203203
235022
237982
252193
261823
291834
337180
2548197
5042647
5137359
8764806
9727233
9754761
11126708
12439100
19672685
23443064
23734911
25295827
25574478
38085252
38742819
41149470
41322940
42705243
46741316
47608756
48459874
52200369
53309853
66594085
74350461
75840214
88492912
94155215
96242023
97011617
106073135
113204830
118689867
121683577
123024210
126463327
77906
115937
128518
133354
138010
152300
167310
209796
260498
282659
302849
310436
335238
4378486
23942408
24344790
29000971
33900038
36033639
42821845
46054262
51452842
53070942
53540611
58755427
61381457
63031382
72242636
78242149
79712309
7990969

26314776
35935438
42606035
44277346
51055844
56376610
59600991
64466592
70185284
71521763
83370784
85439795
86635308
86821844
88459694
90614124
93686217
96490400
104723380
105891238
112930469
129705825
45060
73031
74093
89999
170753
186702
190784
211096
243303
257906
269283
273938
314331
1288208
5345351
24915102
28667400
28928626
32908557
33846439
34276621
45992610
49075963
50669045
51411523
51467589
59361580
65230573
75236513
77359158
79979417
85414569
87082980
90539476
91642065
95162381
106223882
109496670
113331521
115713701
115912282
120482980
120621144
126814460
129335528
7409
29668
81464
142099
161521
202856
235141
237600
239080
339008
3829378
13619009
15702122
15741784
19029976
29626182
30252332
30559210
33424129
35593703
37944944
40049430
42844044
44748946
45020328
46026984
46129949
55976189
56901403
58660846
66041568
67466710
68339955
75917809
76021241
78741329
92577210
102024375
102506887
103428710
106057072
106678802
110107673
111446272
124169939
129030512
129463916
2103
334

236699
2698409
11007636
18203964
25760694
35730106
37082662
47844589
52437992
58527015
64802521
65066719
70603566
70870535
74673919
80261729
81018462
85144130
89506421
94116568
96664331
98363779
98671579
101712254
103539300
104099126
104503861
105767334
106750068
106915630
112273036
115744973
121051157
123394632
124218782
129277104
129546183
14398
86456
154631
168952
183210
237313
268605
271394
285683
314959
1904697
3810368
10359235
22284371
23070490
24894879
33023998
39049876
39147935
39390683
40644715
52815591
56173869
56369732
56593559
71154324
72568232
73069741
73402968
76572933
76971470
77632410
88318684
100653729
101675670
105456912
107116166
107557728
110769373
116679053
124174538
124468623
30191
62541
84716
102864
110958
131629
196240
216970
320018
332684
7930361
8577245
8664713
14691393
15339689
23389935
23602620
37414787
38396509
41254761
45131816
54400767
59625761
71126328
71323295
83229482
86487763
90284720
96761640
103995649
107760699
108423541
112215886
113674353
11563401

101133992
110034581
117528645
121536777
126411394
70977
75987
78677
80780
82740
183206
205293
225494
247109
279501
289591
318821
323417
5045585
6726607
7253358
12009081
21846282
25351681
34161124
37208382
43903609
44088638
45415880
46721813
53311426
53970407
54119522
65899705
83799073
84244582
94073704
97285127
99115883
100145299
107905568
130402542
46210
46804
59163
130220
177211
186207
187511
273326
285827
324114
1237057
5033391
5308777
18217408
24545311
27214925
28730623
32654684
40964628
40991834
41172946
44859184
57563181
69495169
71328417
75836092
83066602
83308179
83501773
86919217
89996660
90896992
104378116
114152737
119024362
121311699
125276761
127392312
55060
66904
95864
131181
158813
166195
186835
188405
204575
209171
224527
235847
236386
245078
290589
310435
1497514
1658030
3797431
17910967
17984784
19936335
20829613
30576273
30811767
45790897
61206482
61803685
64603058
72009300
78066936
85568314
90321306
127145862
29576
67730
81000
90645
189164
197595
213773
214935
22297

76457103
78240543
91022592
91861189
92564906
92983416
94419268
95127051
98038305
98807048
105344515
121454272
123413743
129259953
11773
29670
97406
127773
133773
177627
210296
264261
288128
298019
2580406
17120627
18485159
23170136
38407729
40308439
43616315
43645967
45833275
49195651
50025191
52817352
57988713
64226016
70002281
71565997
72770043
81137639
87945160
91074470
103740550
107560029
110271758
112780149
114301705
116388950
123872342
14539
16983
76304
137510
165507
169375
173708
259644
277515
294059
312453
19179463
28978765
30695149
30725091
30729131
35569540
41150627
49679459
63210303
63252617
63710100
65381947
71459665
79752787
81042522
81192693
81401873
81737816
98307800
100588254
103978798
111997875
113973531
118551096
120325550
109516
206547
222326
231097
252575
256581
304804
334262
2278738
12138847
21169750
22066317
27883625
29676720
30652072
34052183
43074821
45406802
46893132
51833345
60896285
60966542
66677833
84600120
86978494
87505900
88690264
94325716
95404147
10765

68771249
75585131
78542168
78781060
94949090
97288684
102093938
111771139
120354907
120524312
125309678
128966389
130678775
9543
15475
30172
45349
99050
139637
142867
251430
265587
292021
3945645
5515915
7381638
9829291
9974082
13850906
26266459
29557097
41443647
50957767
73815467
76986398
79538238
85878056
86001230
86975648
95317623
101762942
106513022
110808232
112163086
128123284
185785
187048
229669
239315
239328
258452
273339
1231534
2924299
4416529
5798346
5893451
9978123
12669181
17604953
34344225
37783434
37872285
40369050
40690037
41054019
43188456
48074116
53192166
59155581
64023517
76485976
79324471
82946595
87417982
91343957
93772396
93923383
108255616
121779085
124687561
3423
118211
151427
158731
179639
206766
239364
296570
316866
339282
11574036
18526162
46880215
47786389
48677406
53051743
55921392
57511950
57846483
62058650
68143849
69738746
70143431
70665843
70779589
85732072
99943177
100848667
106341515
106438996
109083977
112838400
119046320
47148
49219
115157
141686


64264583
73943272
78291257
79148435
79258779
82727053
86637920
90269218
91481683
94870591
96298291
97614844
100590632
114060180
128113763
130108924
14156
18645
60023
271819
311219
1019335
1516088
3363402
11136253
15795242
17198090
17224809
22909620
23623153
30328471
38652935
42322032
45599848
47004656
57249593
59725839
63546596
67175951
82592591
87833938
91676719
94831840
97906234
105425239
112354774
116908137
118190437
123500741
125665164
128761918
27124
66999
75223
87467
201454
209987
216799
229855
253395
321283
1797009
17568125
40638754
44739120
48407508
55982140
69253887
71203310
74432219
75619815
85210140
95182567
98357025
106648824
119031102
119598768
119641616
129459169
27339
71126
121301
145107
213590
305496
326736
3883673
10401079
10934602
11484152
14624104
20148786
20842318
21069505
30210862
37134458
58143636
71449444
83273122
84773554
90562518
91756633
91760741
92295640
108275496
109431067
118886900
122845088
16463
23822
115792
169133
185605
202790
283139
290322
330845
74975

In [34]:
mjdShape

(1, 151)

In [None]:
from pyspark.sql.functions import lit, col
id=615

In [39]:
samples

[{'id': 77306,
  'target': 11,
  'meta': array([0.        , 0.        , 0.        , 0.        , 1.        ,
         0.6876    , 0.0165    , 0.031     , 1.        , 0.58155507],
        dtype=float32),
  'specz': 0.71670001745224,
  'band': array([1, 1, 1, 1, 1, 3, 2, 4, 5, 6, 3, 2, 4, 5, 6, 3, 2, 4, 5, 6, 3, 2,
         4, 5, 6, 3, 2, 4, 5, 6, 1, 3, 2, 4, 5, 6, 3, 2, 4, 5, 6, 3, 2, 4,
         5, 6, 3, 2, 4, 5, 6, 3, 2, 4, 5, 6, 1, 3, 2, 4, 5, 6, 3, 2, 4, 5,
         6, 3, 2, 4, 5, 6, 3, 2, 4, 5, 6, 1, 1, 1, 3, 2, 4, 5, 6, 3, 2, 4,
         5, 6, 3, 2, 4, 5, 6, 1, 3, 2, 4, 5, 6, 3, 2, 4, 5, 6, 3, 2, 4, 5,
         6, 3, 2, 4, 5, 6, 1, 1, 1, 3, 2, 4, 5, 6, 3, 2, 4, 5, 6, 3, 2, 4,
         5, 6, 1, 1, 1, 3, 2, 4, 5, 6, 3, 2, 4, 5, 6, 3, 2, 4, 5, 6, 1, 1,
         1, 1, 1, 1, 1, 3, 2, 4, 5, 6, 3, 2, 4, 5, 6, 1, 3, 2, 4, 5, 6, 3,
         2, 4, 5, 6, 3, 2, 4, 5, 6, 3, 2, 4, 5, 6, 3, 2, 4, 5, 6, 3, 2, 4,
         5, 6, 3, 2, 4, 5, 6, 3, 2, 4, 5, 6, 3, 2, 4, 5, 6, 1, 1, 1, 3, 2,
         4,

In [None]:
np.array( (raw_vectorsDF.select('hist.passband').filter(col("object_id") == lit(id)).collect()) )

In [None]:
#train_mjd_data=raw_vectorsDF.select('hist.mjd').filter(col("object_id") == lit(id)).toPandas()
start=time.time()
band=np.array(raw_vectorsDF.select('hist.passband').filter(col("object_id") == lit(id)).collect() , dtype='int32')
print("elapsed {}".format(str(time.time() - start)) )

In [None]:
start=time.time()
band=raw_vectorsDF.select('hist.passband').filter(col("object_id") == lit(id)).toPandas()
band=np.array(band)
print(band.dtype)
print("elapsed {}".format(str(time.time() - start)) )

In [None]:
band

In [None]:
#train_mjd_data=raw_vectorsDF.select('hist.mjd').filter(col("object_id") == lit(id)).toPandas()
start=time.time()
band=np.array(raw_vectorsDF.select('hist.passband').filter(col("object_id") == lit(id)).collect().toPandas() , dtype='int32') #.reshape(c,)
print("elapsed {}".format(str(time.time() - start)) )

In [None]:
x=raw_vectorsDF.select('hist.mjd').toPandas()

In [None]:
mjd.shape


In [None]:
print(elapsed_samples)
print(elapsed_samplesC)

In [None]:
for i in range(1, num_models+1):

    samples_train, samples_valid = train_test_split(samples, test_size=valid_size, random_state=42*i)
    len(samples_train)
    
    start_train=time.time()
    elapsed_augment,elapsed_augmentCpu,\
            elapsed_training_Vectors,elapsed_training_VectorsCpu,\
            elapsed_validation_Vectors,elapsed_validation_VectorsCpu, \
            train_x, train_y = \
            train_model(i, samples_train, samples_valid)
    elapsed_train=time.time()-start_train
    print(elapsed_train)
    #break

In [None]:
samples_train, samples_valid = train_test_split(samples, test_size=valid_size, random_state=42*1)

In [None]:
train_x, train_y = get_keras_data(samples_train)

In [None]:
X=train_x
Y=train_y

In [None]:
shape=X['hist'][0].shape
shape

In [None]:
X['meta'][0].shape

In [None]:
hist_input = Input(shape=X['hist'][0].shape, name='hist')
meta_input = Input(shape=X['meta'][0].shape, name='meta')
band_input = Input(shape=X['band'][0].shape, name='band')

In [None]:
band_emb = Embedding(8, 8)(band_input)

In [None]:
hist_input

In [None]:
band_emb

In [None]:
hist = concatenate([hist_input, band_emb])
hist = TimeDistributed(Dense(40, activation='relu'))(hist)


In [None]:
hist

# Stuff below is final vector analysis

In [None]:
raw_vectorsDF.count()

In [None]:
list_persons

#### Converting dataframes to dictionaries
https://stackoverflow.com/questions/41206255/convert-pyspark-sql-dataframe-dataframe-type-dataframe-to-dictionary

In [None]:
list_objects = map(lambda row: row.asDict(), raw_vectorsDF.collect())

In [None]:
object_vectors = {object['object_id']: object for object in list_objects}

In [None]:
samples=[]

In [None]:
use_specz=False

for key in object_vectors.keys():
    print(key)
    i=object_vectors.get(key)
    
    id=i.get('object_id')

    sample = {}
    sample['id'] = int(id)
    
    # 'object_id', 'target', 'meta', 'specz', 'band', 'hist'
    
    sample['target'] = int(i.get('target'))
    
    meta=np.array(i.get('meta'), dtype='float32')
  
    sample['meta'] = np.zeros(10, dtype = 'float32')

    sample['meta'][4] = meta[0]
    sample['meta'][5] = meta[2]
    sample['meta'][6] = meta[3]
    sample['meta'][7] = meta[4]
    sample['meta'][8] = float(meta[2]) > 0

    sample['specz'] = float(meta[1])    
    
    if use_specz:
        sample['meta'][5] = float(meta['hostgal_specz'])
        sample['meta'][6] = 0.0

    z = float(sample['meta'][5])

    j=i.get('hist')
    mjd=np.array(j[0][0], dtype='float32')
    r,c=mjd.shape
    mjd.reshape(c,)
    band=np.array(  j[0][1], dtype='int32').reshape(c,) # passband
    flux=np.array( j[0][2], dtype='float32').reshape(c,) # flux
    flux_err=np.array( j[0][3], dtype='float32').reshape(c,) # flux_err
    detected=np.array( j[0][4], dtype='int32').reshape(c,) # Detected

    mjd -= mjd[0]
    mjd /= 100 # Earth time shift in day*100
    mjd /= (z + 1) # Object time shift in day*100


    received_wavelength = passbands[band] # Earth wavelength in nm
    received_freq = 300000 / received_wavelength # Earth frequency in THz
    source_wavelength = received_wavelength / (z + 1) # Object wavelength in nm


    sample['band'] = band + 1

    sample['hist'] = np.zeros((flux.shape[0], 8), dtype='float32')
    sample['hist'][:,0] = mjd
    sample['hist'][:,1] = flux
    sample['hist'][:,2] = flux_err
    sample['hist'][:,3] = detected

    sample['hist'][:,6] = (source_wavelength/1000)
    sample['hist'][:,7] = (received_wavelength/1000)

    set_intervals(sample)
    
    flux_max = np.max(flux)
    flux_min = np.min(flux)
    flux_pow = math.log2(flux_max - flux_min)
    sample['hist'][:,1] /= math.pow(2, flux_pow)
    sample['hist'][:,2] /= math.pow(2, flux_pow)
    sample['meta'][9] = flux_pow / 10
    
    samples.append(sample)

    if len(samples) % 1000 == 0:
        print('Converting data {0}'.format(len(samples)), end='\r')

    if len(samples) >= limit:
        break
    
  

In [None]:
target

In [None]:
 np.where(classes == int(target))[0][0] # positional index of the classes array

In [None]:
i=object_vectors.get(615)

In [None]:
a[[0,1,3], :][:, [0,2]]  # Selects the columns you want as well

In [None]:
meta=meta=np.array(i.get('meta'), dtype='float32')
meta

In [None]:
meta[4]

In [None]:
j[0][0]

In [None]:
i=dict_persons.get(6266)

In [None]:
dict_persons.values()

In [None]:
i

In [None]:
[*i]

In [None]:
for key in i.keys():
  print(key)

In [None]:
i.get('object_id')

In [None]:
np.array(i.get('band')).shape

In [None]:
j=i.get('hist')

In [None]:
j

In [None]:
for row, sublist in enumerate(j):
    for column, item in enumerate(sublist):
        if item:
            print((row, column))
            


In [None]:
j[0][0] -- mjd

In [None]:
j[0][1] # passband

In [None]:
j[0][2] # flux

In [None]:
j[0][3] # flux_err

In [None]:
j[0][4] # Detected

In [None]:
for row, sublist in enumerate(k):
    for column, item in enumerate(sublist):
        if item:
            print((row, column))
            

In [None]:
k[0][1]

In [None]:

print(train_x['id'].shape)
print(train_x['meta'].shape)
print(train_x['band'].shape)
print(train_x['hist'].shape)

OK, so for hist - each row, we have 8 arrays palanced out to 256 entries - pad stuff! and the arrays are 0 mjd 1 flux 2 flux_err 3 etc 4 etc 5 etc 6 7

so 

- train_x['hist'][0,:,0] is mjd
- train_x['hist'][0,:,1] is flux
- train_x['hist'][0,:,2] is flux_err
- train_x['hist'][0,:,3] is detected
- train_x['hist'][0,:,4] is mjd_deltas
- train_x['hist'][0,:,5] is mjd_reverse_deltas
- train_x['hist'][0,:,6] is source_wavelength
- train_x['hist'][0,:,7] is received_wavelength

Just remember that in get_keras_data, mjd, detected and received_wavelength values are removed and set to zero.

In [None]:
train_x['id'][0]

In [None]:
train_x['hist'][0,:,4]

In [None]:
mjd=[1,2,4,8,16,32,64]

mjd1=np.ediff1d(mjd, to_begin = [0]) - mjd deltas between elements
mjd2=np.ediff1d(mjd, to_end = [0]) - reverse mjd from end

#hist[:,4] = np.ediff1d(hist[:,0], to_begin = [0])
#hist[:,5] = np.ediff1d(hist[:,0], to_end = [0])

In [None]:
fullSetSQL="""
select * from training_set_all_padded
"""

In [None]:
fullDF =sqlContext.sql(fullSetSQL)

In [None]:
fullDF.printSchema()

In [None]:
fullDF.count()

In [None]:
X = {
        'id': np.array(fullDF.select('object_id').collect(), dtype='int32'),
        'meta': np.array(fullDF.select('meta').collect(), dtype='float32'),
        'band': np.array(fullDF.select('band').collect() , dtype='int32').reshape(7848,256)
#        'hist': pad_sequences([i['hist'] for i in itemslist], maxlen=sequence_len, dtype='float32'),
    }
#print('creating Y')
#Y = to_categorical([i['target'] for i in itemslist], num_classes=len(classes))

In [None]:
X['id'].shape

In [None]:
histTest=np.array(fullDF.select('hist.interval').collect(), dtype='float32').reshape(7848,256)

In [None]:
histTest.shape

In [None]:
fluxTest=np.array(fullDF.select('hist.flux').collect(), dtype='float32').reshape(7848,256)
flux_err_test=np.array(fullDF.select('hist.flux_err').collect(), dtype='float32').reshape(7848,256)

In [None]:
flux_err_test.shape

In [None]:
histArray=np.zeros((7848,256,8), dtype='float32') 
# this will work brilliantly as get_keras_data sets three columns to zeros anyway

In [None]:
histArray[:,:,0]=histTest
histArray[:,:,1]=fluxTest
histArray[:,:,1]=flux_err_test

In [None]:
histArray.shape

In [None]:
targetArray=fluxTest=np.array(fullDF.select('target').collect(), dtype='int32')

In [None]:
targetArray.shape

In [None]:
classes

In [None]:
len(classes)

In [None]:
np.where(classes == 6)

In [None]:
Y = to_categorical(3, num_classes=len(classes))

In [None]:
Y

In [None]:
fullDF.select('meta').collect()

# Experimental - testing mods for get data below

In [None]:
extragalactic=None

In [None]:
samples = []
# You have to perform an aggregation on the Spark dataframe and collect the results before you can iterate 
# This i not necessary with pandas
groups = train_mdf_data.groupby('object_id')


In [None]:
for g in groups:
    id=g[0]
    
    sample = {}
    sample['id'] = int(id)
    
    meta = train_meta.loc[train_meta['object_id'] == id]
    
    # NEW We need to get the arrays for each pivoted dataframe
    #mjd = train_mjd_data.loc[train_mjd_data['object_id'] == id]
    #passband=train_passband_data.loc[train_passband_data['object_id'] == id]
    #flux=train_flux_data.loc[train_flux_data['object_id'] == id]
    #flux_err=train_flux_err_data.loc[train_flux_err_data['object_id'] == id]
    #detected=train_detected_data.loc[train_detected_data['object_id'] == id]
    
    if extragalactic == True and float(meta['hostgal_photoz']) == 0:
        continue

    if extragalactic == False and float(meta['hostgal_photoz']) > 0:
        continue    
    
    if 'target' in meta:
        sample['target'] = np.where(classes == int(meta['target']))[0][0]
    else:
        sample['target'] = len(classes) - 1   

    sample['meta'] = np.zeros(10, dtype = 'float32')

    sample['meta'][4] = meta['ddf']
    sample['meta'][5] = meta['hostgal_photoz']
    sample['meta'][6] = meta['hostgal_photoz_err']
    sample['meta'][7] = meta['mwebv']
    sample['meta'][8] = float(meta['hostgal_photoz']) > 0

    sample['specz'] = float(meta['hostgal_specz'])

    
    if use_specz:
        sample['meta'][5] = float(meta['hostgal_specz'])
        sample['meta'][6] = 0.0

    z = float(sample['meta'][5])
    
    # we need to drop the object_id from the pivot records. We can use any of the pivot dataframes,
    # because they all have the same shape, coming from a Hive table definition. We'll use the MJD
    # pivot dataframe to set up the indexes we want. How we do this - in stages
    # 1. Create a data frame for the pivot dataframes, on for each object_id
    # 2. Use dropna to remove NAN column values
    # 3. Cast that dataframe to a numpy array and get the shape
    # 4. Use the mjd array as a base, create an index list of the columns we want - ie we're dropping the object_id
    # 5. Use the index ro truncate the object_id column from the rest of the arrays
    # 6. finally, we need to reshape the arrays from [1;cols] to [cols,]   
    
    mjd = np.array(train_mjd_data.loc[train_mjd_data['object_id'] == id].dropna(axis='columns'), dtype='float32')
    r,c=mjd.shape

    idx_OUT_columns = [0]
    idx_IN_columns = [i for i in range(np.shape(mjd)[1]) if i not in idx_OUT_columns]

    mjd = mjdArray[:,idx_IN_columns].reshape(c-1,)
    band = np.array(train_passband_data.loc[train_passband_data['object_id'] == id].dropna(axis='columns') , dtype='int32')[:,idx_IN_columns].reshape(c-1,)
    flux = np.array(train_flux_data.loc[train_flux_data['object_id'] == id].dropna(axis='columns') , dtype='float32')[:,idx_IN_columns].reshape(c-1,)
    flux_err = np.array(train_flux_err_data.loc[train_flux_err_data['object_id'] == id].dropna(axis='columns') , dtype='float32')[:,idx_IN_columns].reshape(c-1,)
    detected = np.array(train_detected_data.loc[train_detected_data['object_id'] == id].dropna(axis='columns') , dtype='float32')[:,idx_IN_columns].reshape(c-1,)
    #mjd      = np.array(g[1]['mjd'],      dtype='float32')
    #band     = np.array(g[1]['passband'], dtype='int32')
    #flux     = np.array(g[1]['flux'],     dtype='float32')
    #flux_err = np.array(g[1]['flux_err'], dtype='float32')
    #detected = np.array(g[1]['detected'], dtype='float32')  

    
    mjd -= mjd[0]
    mjd /= 100 # Earth time shift in day*100
    mjd /= (z + 1) # Object time shift in day*100

    
    received_wavelength = passbands[band] # Earth wavelength in nm
    received_freq = 300000 / received_wavelength # Earth frequency in THz
    source_wavelength = received_wavelength / (z + 1) # Object wavelength in nm

    
    sample['band'] = band + 1

    sample['hist'] = np.zeros((flux.shape[0], 8), dtype='float32')
    sample['hist'][:,0] = mjd
    sample['hist'][:,1] = flux
    sample['hist'][:,2] = flux_err
    sample['hist'][:,3] = detected

    sample['hist'][:,6] = (source_wavelength/1000)
    sample['hist'][:,7] = (received_wavelength/1000)
    
    set_intervals(sample)


    flux_max = np.max(flux)
    flux_min = np.min(flux)
    flux_pow = math.log2(flux_max - flux_min)
    sample['hist'][:,1] /= math.pow(2, flux_pow)
    sample['hist'][:,2] /= math.pow(2, flux_pow)
    sample['meta'][9] = flux_pow / 10
    
    samples.append(sample)

    if len(samples) % 1000 == 0:
        print('Converting data {0}'.format(len(samples)), end='\r')

    if len(samples) >= limit:
        break


In [None]:
sample

In [None]:
id=713
print(id)
sample = {}
sample['id'] = int(id)

meta = train_meta.loc[train_meta['object_id'] == id]

# NEW We need to get the arrays for each pivoted dataframe
#mjd = train_mjd_data.loc[train_mjd_data['object_id'] == id]
#passband=train_passband_data.loc[train_passband_data['object_id'] == id]
#flux=train_flux_data.loc[train_flux_data['object_id'] == id]
#flux_err=train_flux_err_data.loc[train_flux_err_data['object_id'] == id]
#detected=train_detected_data.loc[train_detected_data['object_id'] == id]

if extragalactic == True and float(meta['hostgal_photoz']) == 0:
    print('Hi there')

if extragalactic == False and float(meta['hostgal_photoz']) > 0:
    print('Hi there again')  

if 'target' in meta:
    sample['target'] = np.where(classes == int(meta['target']))[0][0]
else:
    sample['target'] = len(classes) - 1   

sample['meta'] = np.zeros(10, dtype = 'float32')

sample['meta'][4] = meta['ddf']
sample['meta'][5] = meta['hostgal_photoz']
sample['meta'][6] = meta['hostgal_photoz_err']
sample['meta'][7] = meta['mwebv']
sample['meta'][8] = float(meta['hostgal_photoz']) > 0

sample['specz'] = float(meta['hostgal_specz'])


if use_specz:
    sample['meta'][5] = float(meta['hostgal_specz'])
    sample['meta'][6] = 0.0

z = float(sample['meta'][5])

# we need to drop the object_id from the pivot records. We can use any of the pivot dataframes,
# because they all have the same shape, coming from a Hive table definition. We'll use the MJD
# pivot dataframe to set up the indexes we want. How we do this - in stages
# 1. Create a data frame for the pivot dataframes, on for each object_id
# 2. Use dropna to remove NAN column values
# 3. Cast that dataframe to a numpy array and get the shape
# 4. Use the mjd array as a base, create an index list of the columns we want - ie we're dropping the object_id
# 5. Use the index ro truncate the object_id column from the rest of the arrays
# 6. finally, we need to reshape the arrays from [1;cols] to [cols,],
mjd = np.array(train_mjd_data.loc[train_mjd_data['object_id'] == id].dropna(axis='columns'), dtype='float32')
r,c=mjd.shape

idx_OUT_columns = [0]
idx_IN_columns = [i for i in range(np.shape(mjd)[1]) if i not in idx_OUT_columns]

mjd = mjdArray[:,idx_IN_columns].reshape(c-1,)
band = np.array(train_passband_data.loc[train_passband_data['object_id'] == id].dropna(axis='columns') , dtype='int32')[:,idx_IN_columns].reshape(c-1,)
flux = np.array(train_flux_data.loc[train_flux_data['object_id'] == id].dropna(axis='columns') , dtype='float32')[:,idx_IN_columns].reshape(c-1,)
flux_err = np.array(train_flux_err_data.loc[train_flux_err_data['object_id'] == id].dropna(axis='columns') , dtype='float32')[:,idx_IN_columns].reshape(c-1,)
detected = np.array(train_detected_data.loc[train_detected_data['object_id'] == id].dropna(axis='columns') , dtype='float32')[:,idx_IN_columns].reshape(c-1,)

#mjd      = np.array(g[1]['mjd'],      dtype='float32')
#band     = np.array(g[1]['passband'], dtype='int32')
#flux     = np.array(g[1]['flux'],     dtype='float32')
#flux_err = np.array(g[1]['flux_err'], dtype='float32')
#detected = np.array(g[1]['detected'], dtype='float32')  

# Now we need to reshape to columns
#mjd=mjd.reshape(352,)

mjd -= mjd[0]
mjd /= 100 # Earth time shift in day*100
mjd /= (z + 1) # Object time shift in day*100




In [None]:

received_wavelength = passbands[band] # Earth wavelength in nm
received_freq = 300000 / received_wavelength # Earth frequency in THz
source_wavelength = received_wavelength / (z + 1) # Object wavelength in nm

In [None]:
hiThere = train_mjd_data.loc[train_mjd_data['object_id'] == id]

In [None]:
train_mjd_data.

In [None]:
hiThere

In [None]:
hiThere.dropna(axis='columns')

In [None]:
mjd = np.array(train_mjd_data.loc[train_mjd_data['object_id'] == id].dropna(axis='columns'), dtype='float32')

In [None]:
r,c=mjd.shape

In [None]:
r

In [None]:
c