In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import *
from config import SOURCE_TOPIC, SERVER_PORT, DATADIR

# Local Spark session for the Kafka -> MongoDB streaming job
# ("local[6]" asks Spark for a local master with 6 worker threads).
spark = (
    SparkSession.builder
    .master("local[6]")
    .appName("Kafka to mongodb")
    .getOrCreate()
)

# Streaming reader subscribed to SOURCE_TOPIC on the broker(s) at SERVER_PORT.
# Only the message payload (`value`) and the Kafka `timestamp` are kept.
kafka_options = {
    "kafka.bootstrap.servers": SERVER_PORT,
    "subscribe": SOURCE_TOPIC,
    "startingOffsets": "latest",
    "failOnDataLoss": "false",
}
df = (
    spark.readStream
    .format("kafka")
    .options(**kafka_options)
    .load()
    .select('value', 'timestamp')
)

# Silence everything below ERROR in the Spark logs.
spark.sparkContext.setLogLevel('ERROR')

For this part, defining the schema is a MUST. In order to explode a column of JSON into multiple columns, PySpark needs to know the schema. If a datatype doesn't match, PySpark will fall back to returning nulls.

I don't need to say that I didn't type that by hand. We have ✨ vim ✨ magic.

In [None]:

# (column name, Spark type) for every field of the incoming JSON record, in the
# order the columns should appear after the struct is expanded.
_SCHEMA_FIELDS = [
    ('Event', StringType()),
    ('Stream_Time', DoubleType()),
    ('ID', IntegerType()),
    ('Severity', ByteType()),
    ('Start_Time', TimestampType()),
    ('End_Time', TimestampType()),
    ('Start_Lat', DoubleType()),
    ('Start_Lng', DoubleType()),
    ('End_Lat', DoubleType()),
    ('End_Lng', DoubleType()),
    ('Distance_mi', DoubleType()),
    ('Description', StringType()),
    ('Number', StringType()),
    ('Street', StringType()),
    ('Side', StringType()),
    ('City', StringType()),
    ('County', StringType()),
    ('State', StringType()),
    ('Zipcode', StringType()),
    ('Country', StringType()),
    ('Timezone', StringType()),
    ('Airport_Code', StringType()),
    ('Weather_Timestamp', TimestampType()),
    ('Temperature_F', DoubleType()),
    ('Wind_Chill_F', DoubleType()),
    ('Humidity_Percent', DoubleType()),
    ('Pressure_in', DoubleType()),
    ('Visibility_mi', DoubleType()),
    ('Wind_Direction', StringType()),
    ('Wind_Speed_mph', DoubleType()),
    ('Precipitation_in', DoubleType()),
    ('Weather_Condition', StringType()),
    ('Amenity', BooleanType()),
    ('Bump', BooleanType()),
    ('Crossing', BooleanType()),
    ('Give_Way', BooleanType()),
    ('Junction', BooleanType()),
    ('No_Exit', BooleanType()),
    ('Railway', BooleanType()),
    ('Roundabout', BooleanType()),
    ('Station', BooleanType()),
    ('Stop', BooleanType()),
    ('Traffic_Calming', BooleanType()),
    ('Traffic_Signal', BooleanType()),
    ('Turning_Loop', BooleanType()),
    ('Sunrise_Sunset', StringType()),
    ('Civil_Twilight', StringType()),
    ('Nautical_Twilight', StringType()),
    ('Astronomical_Twilight', StringType()),
]

# Schema handed to `from_json`; every field is declared with nullable=False.
# NOTE(review): confirm that non-nullable is really intended for all columns.
schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in _SCHEMA_FIELDS
])



Remember that the data was dumped to a string, so first cast the `value` column to string, then parse it with `from_json`, and finally select the columns you need. For this one we need all the columns.

For the MongoDB connection we define the host and port, the database, and the collection (the equivalent of a table in relational databases). We don't bother creating the collection; MongoDB creates it when it sees us using it. `checkpointLocation` is a MUST.

As always, this is a streaming script, so it needs to be run from the terminal as a `.py` file.

The terminal command uses an extra package:
`spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0,org.mongodb.spark:mongo-spark-connector:10.0.2 kafkamongo.py`

Again, you'll have to google the package versions that are compatible with your environment.

In [None]:

# Turn the Kafka payload into a string and parse it with the declared schema.
# The helper `json` column and the raw `value` are dropped afterwards, leaving
# `timestamp` and the parsed `jsonData` struct.
df = (
    df
    .withColumn('json', df.value.cast(StringType()))
    .withColumn('jsonData', from_json(col('json'), schema))
    .drop('json', 'value')
)

# Expand the parsed struct so each schema field becomes a top-level column.
new_df = df.select('jsonData.*')

# Append every micro-batch to the `accidents` collection of the `mongodb`
# database on the local MongoDB instance. The chain is wrapped in parentheses
# so that no trailing line-continuation character is needed after `.start()`
# (a dangling backslash there would splice in whatever line follows it).
query = (
    new_df
    .writeStream
    .outputMode('append')
    .format('mongodb')
    .option('spark.mongodb.connection.uri', 'mongodb://localhost:27017')
    .option('spark.mongodb.database', 'mongodb')
    .option('spark.mongodb.collection', 'accidents')
    # NOTE(review): plain concatenation assumes DATADIR ends with a path
    # separator -- confirm against config.
    .option('checkpointLocation', DATADIR + 'temp/')
    .start()
)

# Keep the driver alive until the streaming query stops or fails.
query.awaitTermination()

While this works, MongoDB's write path favors fault tolerance over speed, as mentioned [here](https://www.mongodb.com/docs/v4.2/core/write-performance/). Also, this is the free version of MongoDB, so I don't expect optimal performance.

I was considering using Cassandra; maybe I'll revisit the project and try Cassandra's performance. But for now, we go back to PostgreSQL.