In [1]:
import sys
sys.path.append("./work/imcp")

from utils.configuration import get_settings
from operators.streaming import SparkStreaming
import pyspark.sql.functions as F

# Create the Spark session with the user-defined helper class

In [2]:
# Application name handed to the project's SparkStreaming helper.
APP_NAME = "Spark Streaming"

# Project settings object; later cells read the MongoDB connection URI from it.
settings = get_settings()

# Session object returned by the project's helper class.
# NOTE(review): presumably a shared SparkSession instance — confirm in operators.streaming.
spark = SparkStreaming.get_instance(app_name=APP_NAME)

In [3]:
# Bare expression: the notebook renders the session object's repr as this cell's output.
spark

# Check the Spark MongoDB Connector

## Read batch data with the Spark DataFrame API

In [3]:
# Configure a batch reader for the "raw" collection of the "imcp" database.
# The connection URI is taken from the project settings rather than hard-coded.
mongo_reader = (
    spark.read.format("mongodb")
    .option("spark.mongodb.read.connection.uri", settings.MONGODB_ATLAS_URI)
    .option("spark.mongodb.read.database", "imcp")
    .option("spark.mongodb.read.collection", "raw")
)

# Load the collection as a DataFrame.
df = mongo_reader.load()

In [4]:
# Derive the columns used downstream:
#   * created_time -> parsed into a timestamp with the "yyyy-MM-dd HH:mm:ss" pattern
#   * caption_size -> number of space-separated tokens in `caption`
# and drop the "_id" column.
# `df` is deliberately rebound here: later cells register `df` as the "vw_raw"
# view and query its caption_size column.
df = (
    df.withColumn("created_time", F.to_timestamp(F.col("created_time"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("caption_size", F.size(F.split(F.col("caption"), " ")))
    .drop("_id")
)

# Keep only rows whose caption has fewer than 50 tokens.
processed_df = df.filter(F.col("caption_size") < 50)

# printSchema() writes the schema to stdout itself and returns None, so it must
# not be wrapped in print() (that emitted a stray "None" line after the schema).
processed_df.printSchema()
print("The number of filterd rows: ", processed_df.count())

root
 |-- caption: string (nullable = true)
 |-- created_time: timestamp (nullable = true)
 |-- howpublished: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- short_caption: string (nullable = true)
 |-- url: string (nullable = true)
 |-- caption_size: integer (nullable = false)

None
The number of filterd rows:  68721


## Read batch data with Spark SQL

In [None]:
# Register `df` under the view name "vw_raw" so it can be queried through spark.sql().
df.createOrReplaceTempView("vw_raw")

In [None]:
# List the tables/views visible to this session to confirm "vw_raw" is registered.
spark.sql("SHOW TABLES").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |   vw_raw|      false|
+---------+---------+-----------+



In [15]:
# Same "caption_size < 50" filter as the DataFrame cell, expressed in SQL
# against the "vw_raw" view.
filter_query = "SELECT * FROM vw_raw WHERE caption_size<50"
processed = spark.sql(filter_query)

# Row count of the SQL result; displayed as the cell output.
processed.count()

68721

## Write batch data to MongoDB

In [7]:
# Append the filtered rows to the "audit" collection of the "imcp" database.
# NOTE: mode("append") adds to the collection, so re-running this cell writes
# the same rows again.
#
# The bulk-write batch limit is set through the connector's `maxBatchSize`
# write option. The previous key, "spark.mongodb.write.batch.size", is not a
# documented write option, so the 10000 limit was presumably never applied —
# verify against the connector version in use.
mongo_writer = (
    processed_df.write.format("mongodb")
    .option("spark.mongodb.write.connection.uri", settings.MONGODB_ATLAS_URI)
    .option("spark.mongodb.write.database", "imcp")
    .option("spark.mongodb.write.collection", "audit")
    .option("spark.mongodb.write.maxBatchSize", "10000")
    .mode("append")
)

mongo_writer.save()

In [12]:
# Stop the Spark session now that the notebook's work is finished.
spark.stop()