## **Sentiment Analysis with SynapseML and Incremental Loading**

In [20]:
# Read up to 1000 rows from the Olympic news table; this frame feeds every later step.
news_query = "SELECT * FROM bing_olympic_news_db.olympic_news_updated_data LIMIT 1000"
df = spark.sql(news_query)
display(df)

StatementMeta(, 22446195-e871-450d-b365-506a389e4151, 22, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 5dc03c9d-950c-4b9b-8fb6-43ac91265ee5)

In [21]:
#import required packages
import synapse.ml.core
from synapse.ml.services import AnalyzeText

StatementMeta(, 22446195-e871-450d-b365-506a389e4151, 23, Finished, Available, Finished)

In [22]:
# Build the AnalyzeText transformer and wire up its input/output columns.
model = (
    AnalyzeText()
    .setKind("SentimentAnalysis")  # analysis task to run on the text
    .setTextCol("description")     # column whose text is analysed
    .setErrorCol("error")          # column configured to hold error details
    .setOutputCol("response")      # column configured to hold the service response
)

StatementMeta(, 22446195-e871-450d-b365-506a389e4151, 24, Finished, Available, Finished)

In [23]:

# Run the configured transformer over the news rows.
result = model.transform(df)

# Show only a 10-row preview of the transformed frame.
result_preview = result.limit(10)
display(result_preview)

StatementMeta(, 22446195-e871-450d-b365-506a389e4151, 25, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, f36778b7-ad9a-4d30-8a86-ff68d55f59b7)

In [24]:
# Surface the sentiment nested inside the "response" column as its own column.
from pyspark.sql.functions import col

sentiment_path = "response.documents.sentiment"
sentiment_df = result.withColumn("sentiment", col(sentiment_path))
display(sentiment_df.limit(7))

StatementMeta(, 22446195-e871-450d-b365-506a389e4151, 26, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 85b97e27-1be8-42eb-9c96-8cce557345d1)

In [25]:
# Drop the raw service columns now that the sentiment has its own column.
service_columns = ("error", "response")
sentiment_df_final = sentiment_df.drop(*service_columns)
display(sentiment_df_final.limit(10))

StatementMeta(, 22446195-e871-450d-b365-506a389e4151, 27, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 327bb227-bfc7-40f3-81d2-2bcee0771720)

## **Implementing Type 1 Incremental Loading of the Sentiment Data**

In [26]:
# Type 1 SCD (slowly changing dimension) incremental load of the sentiment data.
#
# In a Type 1 SCD, incoming data overwrites the existing row for the same key, so
# no duplicates are created and no history is kept. It is typically used when the
# previous values do not need to be preserved.

from pyspark.sql.utils import AnalysisException

table_name = "bing_olympic_news_db.sentiment_analysis"

try:
    # First run: create the Delta table from the scored data. The default save
    # mode raises AnalysisException when the table already exists.
    sentiment_df_final.write.format("delta").saveAsTable(table_name)

except AnalysisException:
    # AnalysisException covers more than "table already exists" (missing database,
    # invalid schema, ...). Only fall through to the merge when the table is really
    # there; otherwise surface the original error instead of masking it.
    if not spark.catalog.tableExists(table_name):
        raise

    print ("Table Already Exist")

    # Merge from the frame that was used to create the table, so the source has
    # the same columns as the target (no "response"/"error" columns).
    # NOTE(review): MERGE fails if several source rows share one link — confirm
    # that link is unique in the incoming data.
    sentiment_df_final.createOrReplaceTempView("vw_sentiment")

    # <=> is the null-safe equality operator: unlike <>, it still detects a change
    # when one side is NULL. link is the join key, so it is equal by construction
    # and is not part of the change check.
    spark.sql(f"""  MERGE INTO {table_name} target_table
                    USING vw_sentiment source_view

                    ON source_view.link = target_table.link

                    WHEN MATCHED AND NOT (
                        source_view.title <=> target_table.title AND
                        source_view.description <=> target_table.description AND
                        source_view.image <=> target_table.image AND
                        source_view.datePublished <=> target_table.datePublished AND
                        source_view.provider <=> target_table.provider AND
                        source_view.published_date <=> target_table.published_date AND
                        source_view.published_time <=> target_table.published_time AND
                        source_view.sentiment <=> target_table.sentiment
                    )

                    THEN UPDATE SET *

                    WHEN NOT MATCHED THEN INSERT *

                """)

StatementMeta(, 22446195-e871-450d-b365-506a389e4151, 28, Finished, Available, Finished)

In [27]:
%%sql

-- Count the rows currently stored in the sentiment table.
SELECT COUNT(*) FROM bing_olympic_news_db.sentiment_analysis
     

StatementMeta(, 22446195-e871-450d-b365-506a389e4151, 29, Finished, Available, Finished)

<Spark SQL result set with 1 rows and 1 fields>