The first part is exactly the same as in the previous notebook.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import *
from config import SOURCE_TOPIC, SERVER_PORT, DATADIR

# Spark session for the Kafka -> PostgreSQL job, using the "local[6]" master.
spark = (
    SparkSession.builder
    .master("local[6]")
    .appName("Kafka to postgres")
    .getOrCreate()
)

# Streaming source: subscribe to SOURCE_TOPIC on the broker(s) given by
# SERVER_PORT, starting from the latest offsets. Only the raw message
# payload ('value') and the Kafka record 'timestamp' are kept.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", SERVER_PORT)
    .option("subscribe", SOURCE_TOPIC)
    .option("startingOffsets", "latest")
    .option('failOnDataLoss', 'false')
    .load()
    .select('value', 'timestamp')
)

# Restrict Spark's console logging to ERROR-level messages.
spark.sparkContext.setLogLevel('ERROR')

# Schema of the JSON payload carried in each Kafka message.
# The (column name, Spark type) pairs are listed in column order and every
# field is declared with nullable=False.
# NOTE(review): from_json may treat the schema as nullable regardless of the
# flag used here - confirm against the Spark version in use.
schema = StructType([
    StructField(field_name, field_type(), False)
    for field_name, field_type in (
        # Stream / event metadata
        ('Event', StringType),
        ('Stream_Time', DoubleType),
        ('ID', IntegerType),
        ('Severity', ByteType),
        ('Start_Time', TimestampType),
        ('End_Time', TimestampType),
        # Location
        ('Start_Lat', DoubleType),
        ('Start_Lng', DoubleType),
        ('End_Lat', DoubleType),
        ('End_Lng', DoubleType),
        ('Distance_mi', DoubleType),
        ('Description', StringType),
        # Address
        ('Number', StringType),
        ('Street', StringType),
        ('Side', StringType),
        ('City', StringType),
        ('County', StringType),
        ('State', StringType),
        ('Zipcode', StringType),
        ('Country', StringType),
        ('Timezone', StringType),
        ('Airport_Code', StringType),
        # Weather
        ('Weather_Timestamp', TimestampType),
        ('Temperature_F', DoubleType),
        ('Wind_Chill_F', DoubleType),
        ('Humidity_Percent', DoubleType),
        ('Pressure_in', DoubleType),
        ('Visibility_mi', DoubleType),
        ('Wind_Direction', StringType),
        ('Wind_Speed_mph', DoubleType),
        ('Precipitation_in', DoubleType),
        ('Weather_Condition', StringType),
        # Boolean flags
        ('Amenity', BooleanType),
        ('Bump', BooleanType),
        ('Crossing', BooleanType),
        ('Give_Way', BooleanType),
        ('Junction', BooleanType),
        ('No_Exit', BooleanType),
        ('Railway', BooleanType),
        ('Roundabout', BooleanType),
        ('Station', BooleanType),
        ('Stop', BooleanType),
        ('Traffic_Calming', BooleanType),
        ('Traffic_Signal', BooleanType),
        ('Turning_Loop', BooleanType),
        # Day / night indicators
        ('Sunrise_Sunset', StringType),
        ('Civil_Twilight', StringType),
        ('Nautical_Twilight', StringType),
        ('Astronomical_Twilight', StringType),
    )
])

# Decode the Kafka payload: cast the 'value' column to a string, parse it
# as JSON against `schema` into a struct column 'jsonData', then drop the
# intermediate 'json' column and the original 'value' column.
df = (
    df.withColumn('json', df.value.cast(StringType()))
    .withColumn('jsonData', from_json(col('json'), schema))
    .drop('json', 'value')
)

# Expand the parsed struct so each JSON field becomes a top-level column.
new_df = df.select('jsonData.*')

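The PostgreSQL JDBC jar doesn't support streaming writes from PySpark. However, PySpark supports passing a function to the `foreachBatch` method (there is also a `foreach` method), so each micro-batch can be written with the regular batch JDBC writer.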

To run the `.py` script from the terminal, we change the command slightly so that the PostgreSQL driver jar is on the driver class path. We instead use:
`spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0 --driver-class-path Data/postgresql-42.4.1.jar kafka_to_psql.py`

In [None]:
def send(df, batchId):
    """Append one micro-batch to the PostgreSQL table ``df`` over JDBC.

    Intended to be passed to ``foreachBatch``.

    Args:
        df: DataFrame holding the rows of the current micro-batch.
        batchId: Identifier of the micro-batch; not used by this function.
    """
    # Connection settings are hard-coded (including the credentials).
    # Insertion order matches the order in which the options are applied.
    jdbc_options = {
        "url": "jdbc:postgresql://localhost:5432/stream_df",
        "dbtable": "df",
        "user": "user",
        "password": "root",
        "driver": "org.postgresql.Driver",
    }

    writer = df.write.format('jdbc')
    for option_name, option_value in jdbc_options.items():
        writer = writer.option(option_name, option_value)

    # 'append' adds the batch rows to the existing table contents.
    writer.mode('append').save()

# Start the streaming query: every micro-batch of new_df is handed to
# send(), which writes it to PostgreSQL.
query = (
    new_df.writeStream
    .outputMode('append')
    .foreachBatch(send)
    .start()
)

# Block here until the streaming query terminates.
query.awaitTermination()

This works flawlessly; however, you will notice that the database has no schema. This isn't even a star schema warehouse! I'm still working on the star schema design and how to write to it with PySpark. Once I finish it, I'll update both the main repo and this repo.

Now, we install superset, read the data, and create some charts.