### Starting Spark Session

In [None]:
# Load (or reload) the sparkmagic extension so that its magics are registered.
%reload_ext sparkmagic.magics
# Display the sparkmagic widget used to manage and create Spark sessions.
%manage_spark

Tab(children=(ManageSessionWidget(children=(HTML(value='<br/>'), HTML(value='No sessions yet.'))), CreateSessi…

### Importing the Libraries

In [None]:
import os

from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, DateType
from pyspark.sql.functions import reverse, split, input_file_name,expr,date_format, current_date, col, year

In [1]:
# --- JDBC target -----------------------------------------------------------
# Placeholders: replace hostname, port, database and table with real values.
server_name = "jdbc:mysql://hostname:port"
database_name = "database_name"
jdbcurl = server_name + "/" + database_name
table_name = "table_name"

# Credentials are read from the environment when available so that real
# secrets never have to be saved inside the notebook. The literal values are
# only placeholder fallbacks.
db_properties = {
    "user": os.environ.get("MYSQL_USER", "username"),
    "password": os.environ.get("MYSQL_PASSWORD", "password"),
}

# --- Streaming locations ---------------------------------------------------
# Directory watched for incoming CSV files.
source_path = "file:///mounts/shared-volume/user/fsi/data"
# Output directory of the CSV file sink.
sink = "file:///mounts/shared-volume/user/fsi/data/sink"
# Base location for streaming checkpoints.
checkpoint = "file:///mounts/shared-volume/user/fsi/data/checkpoint"

### Schema definition of the incoming files 

In [2]:
def getSchema():
    """Return the schema applied to the incoming stock CSV files.

    Returns:
        StructType: eight nullable fields in file order -- Name, Symbol,
        Date, Open, Close, Series, Volume and Turnover.
    """
    # (column name, Spark type) pairs, listed in the order of the CSV columns.
    column_types = [
        ("Name", StringType()),
        ("Symbol", StringType()),
        ("Date", DateType()),
        ("Open", DoubleType()),
        ("Close", DoubleType()),
        ("Series", StringType()),
        ("Volume", DoubleType()),
        ("Turnover", DoubleType()),
    ]
    # Every column is declared nullable.
    fields = [StructField(name, spark_type, True) for name, spark_type in column_types]
    return StructType(fields)

### Reading the incoming files in batches 

In [3]:
def streamStockData():
    """Create the streaming DataFrame over the CSV files in ``source_path``.

    The files are read with the schema from ``getSchema()``, treating the
    first line as a header and taking at most four files per trigger. A
    ``Year`` column is derived from ``Date``.

    Returns:
        DataFrame: streaming DataFrame with the columns Name, Symbol, Date,
        Open, Close, Volume, Turnover and Year (``Series`` is not selected).
    """
    selected_columns = [
        "Name", "Symbol", "Date", "Open", "Close", "Volume", "Turnover", "Year",
    ]

    csv_reader = (
        spark.readStream
        .option("maxFilesPerTrigger", 4)
        .option("header", True)
        .schema(getSchema())
    )
    stock_stream = csv_reader.csv(source_path)
    stock_with_year = stock_stream.withColumn("Year", year(col("Date")))

    return stock_with_year.select(*selected_columns)

### Writing the stream to the sink directory as CSV

In [4]:
def writeDataStream(final_df):
    """Start a streaming query that appends ``final_df`` to ``sink`` as CSV.

    The query is triggered every minute, writes header rows, and keeps its
    progress under ``checkpoint``. The call waits up to 300 seconds for the
    query to terminate and then returns.

    Args:
        final_df: streaming DataFrame to write.
    """
    csv_writer = (
        final_df.writeStream
        .outputMode("append")
        .trigger(processingTime="1 minute")
        .format("csv")
        .option("path", sink)
        .option("header", True)
        .option("checkpointLocation", checkpoint)
    )
    running_query = csv_writer.start()
    # The boolean result is ignored and the query is not stopped here.
    running_query.awaitTermination(timeout=300)

### Write the data to the database in batches 

In [5]:
def write_to_mysql(df, epoch_id):
    """Append one micro-batch to the MySQL table over JDBC.

    Intended as a ``foreachBatch`` callback.

    Args:
        df: DataFrame holding the rows of the current micro-batch.
        epoch_id: identifier of the micro-batch; required by the
            ``foreachBatch`` callback signature but not used here.
    """
    df.write.jdbc(
        url=jdbcurl,
        table=table_name,
        properties=db_properties,
        mode="append",
    )

def writeToDatabase(final_df, checkpoint_location=None, timeout=1000):
    """Start a streaming query that writes ``final_df`` to MySQL per batch.

    Each micro-batch is handed to ``write_to_mysql``. The call waits up to
    ``timeout`` seconds for the query to terminate and then returns; the
    query itself is not stopped here.

    Args:
        final_df: streaming DataFrame to write.
        checkpoint_location: directory in which this query tracks its
            progress. Defaults to ``checkpoint + "_mysql"``. Pass
            ``checkpoint`` explicitly to resume from a checkpoint written by
            an earlier version of this function.
        timeout: maximum number of seconds to wait for termination.
    """
    if checkpoint_location is None:
        # A checkpoint directory records the progress of a single query.
        # Reusing the one written by the CSV sink would make this query
        # resume from that query's offsets and skip files it already saw.
        checkpoint_location = checkpoint + "_mysql"

    query = (
        final_df.writeStream
        .outputMode("append")
        .option("checkpointLocation", checkpoint_location)
        .foreachBatch(write_to_mysql)
        .start()
    )
    # Keep the query handle separate from the boolean that awaitTermination
    # returns; the result of the wait is intentionally ignored.
    query.awaitTermination(timeout)

In [None]:
# Build the streaming DataFrame once; either writer below can consume it.
final_df = streamStockData()
# CSV file sink, currently disabled. Uncomment to write to the `sink` directory.
#writeDataStream(final_df)
# Active sink: append each micro-batch to the MySQL table over JDBC.
writeToDatabase(final_df)