### Initialize Spark and Sonar Cassandra Session

In [None]:
# Environment for the local Spark installation. These variables are set
# before findspark/pyspark are loaded: findspark.init() locates Spark through
# SPARK_HOME, and pyspark picks up PYSPARK_SUBMIT_ARGS when it starts the JVM.
# NOTE(review): the paths below are site- and user-specific; adjust as needed.
import os
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-1.8.0/'
os.environ['SPARK_HOME'] = '/g/g13/wang109/spark-2.3.1-bin-hadoop2.7'
os.environ['JAVA_OPTS'] = '-Djavax.net.ssl.trustStore=/etc/pki/ca-trust/extracted/java/cacerts'
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--master local[*] '
    '--packages com.datastax.spark:spark-cassandra-connector_2.11:2.3.0 '
    'pyspark-shell'
)

# findspark.init() adds the Spark Python libraries to sys.path, so it must run
# BEFORE the first `import pyspark`; calling it afterwards is either too late
# (the import has already failed) or has no effect.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType, IntegerType, StringType, TimestampType
from pyspark.sql.functions import col, lit, split, udf, explode

# Imported after findspark.init() because it lives in a Spark-specific package
# and presumably imports pyspark itself -- TODO confirm.
from sonar_driver.spark import analytics as analytics

# Authenticated Sonar Cassandra session; it supplies the host string and the
# credentials handed to the Spark Cassandra connector below.
from sonar_auth.cassandra import SonarCassandraSession
session = SonarCassandraSession(['rzsonar8'])

# Create (or reuse) the Spark session, configured to reach Cassandra with the
# Sonar session's hosts, username and token.
spark = (
    SparkSession.builder
        .appName('cassandra')
        .config('spark.cassandra.connection.host', session.hosts_string)
        .config('spark.cassandra.auth.username', session.username)
        .config('spark.cassandra.auth.password', session.token)
        .getOrCreate()
)

### Read job data from Cassandra and store in Spark dataframe with appropriate column types

In [None]:
# Reader for the 'jobdata' table in the 'lcstaff_k' keyspace, going through
# the Spark Cassandra connector data source.
jobdata_reader = (
    spark.read
        .format('org.apache.spark.sql.cassandra')
        .options(keyspace='lcstaff_k', table='jobdata')
)

# Keep only the columns needed below.
raw_jobs = jobdata_reader.load().select(['JobId', 'Cluster', 'StartTime', 'scontrol'])

# EndTime is taken from the 'EndTime' entry of the 'scontrol' column and cast
# to a timestamp; 'scontrol' itself is dropped once EndTime is extracted.
end_time = col('scontrol')['EndTime'].cast(TimestampType())

sparkdf = (
    raw_jobs
        .withColumn('JobId', col('JobId').cast(IntegerType()))
        .withColumn('StartTime', col('StartTime').cast(TimestampType()))
        .withColumn('EndTime', end_time)
        .drop('scontrol')
)

sparkdf.show()

### Query jobs within a time range and on certain clusters

In [None]:
# Query parameters passed to analytics.query().
# time_range holds two timestamps followed by a column name -- presumably
# [start, end, column the range applies to]; confirm against analytics.query.
clusters = ['rzgenie', 'rztopaz']
time_range = ['2018-05-16T07:27:21', '2018-05-17T07:27:21', 'EndTime']

queried_sparkdf = analytics.query(
    sparkdf,
    time_range=time_range,
    clusters=clusters,
)
queried_sparkdf.show()

### Calculate discrete derivatives based on window size and slide length

In [None]:
# Discrete derivatives over the 'EndTime' column of the queried jobs.
# NOTE(review): units of window_size / slide_length are not visible here
# (presumably seconds) -- confirm against analytics.discrete_derivatives.
derivatives_sparkdf = analytics.discrete_derivatives(
    queried_sparkdf,
    'EndTime',
    window_size=300,
    slide_length=300,
)
derivatives_sparkdf.show()

### Calculate discrete integrals based on slide length

In [None]:
# Discrete integrals of the queried jobs, computed with a slide length of 10.
integrals_sparkdf = analytics.discrete_integrals(
    queried_sparkdf,
    slide_length=10,
)
integrals_sparkdf.show()

### Original dataframe is unaltered

In [None]:
# Display sparkdf again for comparison with its first display: the analytics
# calls above bound their results to other names and never reassigned sparkdf.
sparkdf.show()