In [None]:
import os
# Extra arguments for the JVM that PySpark launches: put the Data Profiler
# tools jar on the classpath. The jar path is relative to the notebook's
# working directory. Set this before any SparkContext is created.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars ../lib/tools/dataprofiler-tools-current.jar pyspark-shell'

import sys
# Make the cluster's py4j and PySpark sources importable. Only missing entries
# are appended, so re-running this cell does not pile duplicates onto sys.path.
for _spark_lib in ('/usr/hdp/current/spark2-client/python/lib/py4j-0.10.4-src.zip',
                   '/usr/hdp/current/spark2-client/python'):
    if _spark_lib not in sys.path:
        sys.path.append(_spark_lib)

import json
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.functions import lit

In [None]:
# Spark settings used by this notebook: Kryo serialization with the Accumulo
# Key/Value classes registered, plus the ORC flag.
conf = [('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),
        ('spark.sql.orc.enabled', 'true'),
        ('spark.kryo.classesToRegister', 'org.apache.accumulo.core.data.Value,org.apache.accumulo.core.data.Key')]

sparkConf = pyspark.SparkConf().setAppName('dpspark').setAll(conf)

# Build the session through the builder and take the context from it.
# getOrCreate() reuses a live session/context instead of raising
# "Cannot run multiple SparkContexts at once" when this cell is re-run.
# NOTE: when an existing context is reused, context-level settings such as the
# serializer keep the values that context was started with; restart the kernel
# to apply changes to `conf`.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
sc = spark.sparkContext

In [None]:
# Instantiate the JVM-side DPSparkContext through the py4j gateway, passing it
# the Java SparkContext held by `sc`.
dataprofilerPkg = sc._jvm.com.dataprofiler
dpSparkContext = dataprofilerPkg.DPSparkContext(sc._jsc)

In [None]:
# Read in some data.
# The scan spec below is serialized to JSON and handed to the input format
# through the Hadoop configuration key "DataProfiler.dataScanSpec".
data_scan = {
    "type": "row",
    "dataset": "police-department-calls-for-service",
    "table": "policedepartmentcallsforservice",
}

# Key and value are both read as java.lang.String.
rowsRdd = sc.newAPIHadoopRDD(
    inputFormatClass="com.dataprofiler.RowInputFormat",
    keyClass="java.lang.String",
    valueClass="java.lang.String",
    conf={"DataProfiler.dataScanSpec": json.dumps(data_scan)},
)

In [None]:
# Convert to a dataframe
def rowsFlat(t):
    """Turn one RDD record into a ``pyspark.sql.Row``.

    ``t`` is indexed at position 1 and that element is parsed as a JSON
    object; its top-level keys become the field names of the returned Row.
    The element at position 0 is not used.
    """
    fields = json.loads(t[1])
    return pyspark.sql.Row(**fields)

rowsDf = rowsRdd.map(rowsFlat).toDF()

rowsDf.printSchema()

%time print(rowsDf.count())
rowsDf.show()

In [None]:
# Perform some SQL
rowsDf.createOrReplaceTempView("calls")
filtered = spark.sql("SELECT Disposition,`Offense Date`,`Original Crime Type Name`,City from calls where (`City`='Yerba Buena') AND (`Disposition`='NOM' OR `Disposition`='ADV')")
%time print(filtered.count())
filtered.show()

In [None]:
# Pass the filtered result to the JVM-side TableLoader through py4j.
# `filtered._jdf` is the Java DataFrame object behind the Python DataFrame.
# NOTE(review): the meaning of the remaining positional arguments is not
# visible in this notebook (presumably target names, a visibility label, a
# size/limit and two flags) -- confirm against TableLoader.loadFromPyspark.
tableLoader = sc._jvm.com.dataprofiler.TableLoader
tableLoader.loadFromPyspark(
    dpSparkContext,
    filtered._jdf,
    "zach_test",
    "pyspark_sample",
    False,
    "LIST.Public_Data",
    10000,
    False,
)