# Create the training set basic vector dataframe

## Second model
* Unpadded
* No calculations
* Metadata table is not incorporated



### Optional - a customised Spark Context

The next cell is included as an example of how to customise the Spark Context within a Jupyter notebook.

In [1]:
import matplotlib
matplotlib.use('Agg')
%matplotlib inline

import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [2]:
import os
import os.path as osp
#import commands
import time
import random

import numpy as np

import numpy as np
from pyspark import SparkConf,SparkContext, StorageLevel
from pyspark.sql import Row, SQLContext, SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors


from datetime import datetime
import logging

# Log file name embeds the creation time as hour_minute_day_month_year.
LogFile=datetime.now().strftime('Create_vectors_%H_%M_%d_%m_%Y.log')

logger = logging.getLogger('myapp')

# logging.getLogger() hands back the same logger object for the same name,
# so re-running this cell would otherwise stack a second FileHandler on it
# (every message written twice, and the earlier file handle left open).
# Detach and close any file handlers left over from a previous run first.
for stale_handler in [h for h in logger.handlers if isinstance(h, logging.FileHandler)]:
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Attach a single file handler that writes "<time> <level> <message>" lines.
hdlr = logging.FileHandler(LogFile)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.INFO)

### Set up the SQL context

In [3]:
# `sc` is not defined anywhere in this notebook; it only exists when the
# kernel injects it. Bind it explicitly so the notebook also runs on a
# plain Python kernel: getOrCreate() returns the already-active
# SparkContext when there is one and creates a new one otherwise.
sc = SparkContext.getOrCreate()

# SQL entry point used by every query in the rest of the notebook.
sqlContext = SQLContext(sc)

### Set the default database

In [4]:
# Make `plasticc` the current database so the unqualified table names used
# in the queries below resolve against it.
sqlContext.sql("use plasticc")

DataFrame[]

### And finally, we create the full raw training set of feature vectors, unpadded, no calculations

Next cell illustrates the original structured record with hist as an array of arrays.

#### Note that there is no cast or collect_list to array (array (collect_list(...) )
This changes the shape of the retrieved array in the programs

In [5]:
# Query that pivots the per-observation rows of `training_set` into one row
# per object_id:
#   * the inner sub-query packs the mjd, passband, flux, flux_err and
#     detected columns of each row into a MAP keyed by column name (`kv`);
#   * the outer query groups by object_id, casts each map value back to
#     float/int, gathers the values with collect_list and wraps that list in
#     array(...), so every output column is an array holding a single inner
#     array (see the printed schema below).
# NOTE(review): collect_list gives no ordering guarantee after a shuffle, so
# the series are presumably not sorted by mjd -- confirm whether downstream
# code relies on time order.
getVectorsSql="""
select object_id,
array( collect_list(float(a.kv['mjd'])) ) as mjd,
array( collect_list(int(a.kv['passband'])) ) as passband,
array( collect_list( float(  a.kv['flux'] )  ) ) as flux,
array( collect_list( float(  a.kv['flux_err']  ) ) ) as flux_err,
array( collect_list(int(   a.kv['detected']   )) ) as detected
from
(
select object_id,
   MAP('mjd', mjd,'passband',passband,'flux',flux,'flux_err',flux_err,'detected',detected) AS kv
from training_set

) a
group by object_id
"""

# Build the DataFrame of per-object feature vectors from the query above.
vectors_df=sqlContext.sql(getVectorsSql)

#### Display the schema for the feature vectors

In [6]:
# Print the column tree of the pivoted DataFrame; each feature column is
# expected to show up as an array whose elements are arrays.
vectors_df.printSchema()

root
 |-- object_id: integer (nullable = true)
 |-- mjd: array (nullable = false)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: float (containsNull = true)
 |-- passband: array (nullable = false)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: integer (containsNull = true)
 |-- flux: array (nullable = false)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: float (containsNull = true)
 |-- flux_err: array (nullable = false)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: float (containsNull = true)
 |-- detected: array (nullable = false)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: integer (containsNull = true)



In [7]:
# Print the physical plan Spark will use to compute the pivot from the raw
# training_set table (for comparison with the plan of the saved table below).
vectors_df.explain()

== Physical Plan ==
ObjectHashAggregate(keys=[object_id#6], functions=[collect_list(cast(kv#0[mjd] as float), 0, 0), collect_list(cast(kv#0[passband] as int), 0, 0), collect_list(cast(kv#0[flux] as float), 0, 0), collect_list(cast(kv#0[flux_err] as float), 0, 0), collect_list(cast(kv#0[detected] as int), 0, 0)])
+- Exchange hashpartitioning(object_id#6, 200)
   +- ObjectHashAggregate(keys=[object_id#6], functions=[partial_collect_list(cast(kv#0[mjd] as float), 0, 0), partial_collect_list(cast(kv#0[passband] as int), 0, 0), partial_collect_list(cast(kv#0[flux] as float), 0, 0), partial_collect_list(cast(kv#0[flux_err] as float), 0, 0), partial_collect_list(cast(kv#0[detected] as int), 0, 0)])
      +- *(1) Project [object_id#6, map(mjd, mjd#7, passband, cast(passband#8 as double), flux, flux#9, flux_err, flux_err#10, detected, cast(detected#11 as double)) AS kv#0]
         +- HiveTableScan [detected#11, flux#9, flux_err#10, mjd#7, object_id#6, passband#8], HiveTableRelation `plasticc`.`

## And finally, we create the vector table in hive

Valid save modes are `append`, `overwrite`, `ignore` and `error` (the default); this notebook uses `append`.

In [8]:
# Destination settings for the persisted feature-vector table.
# With 'append', every execution of this cell adds the rows again to an
# already existing table.
TABLE, FORMAT, MODE = 'full_training_pivot', 'parquet', 'append'

# Materialise the pivoted vectors as a table in the current database.
vectors_df.write.saveAsTable(TABLE, format=FORMAT, mode=MODE)


### Explain plan comparison

This next cell demonstrates the explain plan when we read the prepared data from the instantiated table

In [10]:
# Read the persisted pivot table back and show its physical plan, which
# can be compared with the plan of the original aggregation query.
pivot_query = "select * from full_training_pivot"
pivotDF = sqlContext.sql(pivot_query)
pivotDF.explain()

== Physical Plan ==
*(1) FileScan parquet plasticc.full_training_pivot[object_id#39,mjd#40,passband#41,flux#42,flux_err#43,detected#44] Batched: false, Format: Parquet, Location: InMemoryFileIndex[hdfs://athena-1.nimbus.pawsey.org.au:8020/user/hive/warehouse/plasticc.db/full_..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<object_id:int,mjd:array<array<float>>,passband:array<array<int>>,flux:array<array<float>>,...
