### 1. Load the data from the Cosmos DB Analytical store collection


In [None]:
# Read the "IoTSignals" container through the "CosmosDemoIoT" linked service,
# using the "cosmos.olap" data source format.
cosmos_read_options = {
    "spark.synapse.linkedService": "CosmosDemoIoT",
    "spark.cosmos.container": "IoTSignals",
}
df_IoTSignals = (
    spark.read
    .format("cosmos.olap")
    .options(**cosmos_read_options)
    .load()
)

### 2. Data exploration using pyplot


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Collect the whole Spark DataFrame to the driver as a pandas DataFrame.
df_IoTSignals_pd = df_IoTSignals.toPandas()

# Keep only device "dev-1", drop rows containing missing values and make the
# measurements integers.
df_dev = (
    df_IoTSignals_pd
    .loc[lambda frame: frame["deviceId"] == "dev-1"]
    .dropna()
    .astype({"measureValue": int})
)

# Reshape to one row per dateTime and one column per unitSymbol, then expose the
# dateTime index as a 'timestamp' column and replace the index with a
# 0..n-1 counter named 'index'.
df_dev = df_dev.pivot(index="dateTime", columns="unitSymbol", values="measureValue")
df_dev["timestamp"] = df_dev.index
df_dev["index"] = list(range(len(df_dev)))
df_dev = df_dev.set_index("index")

# First figure: MW over time.
ax_mw = df_dev.plot(x="timestamp", y="MW", color="green", figsize=(20, 5), label="Output MW")
ax_mw.set_title("MW TimeSeries")

# Second figure: RPM over time; only this one gets an explicit legend call.
ax_rpm = df_dev.plot(x="timestamp", y="RPM", color="black", figsize=(20, 5), label="RPM")
ax_rpm.set_title("RPM TimeSeries")
ax_rpm.legend(loc="best")

plt.show()

### 3. Perform anomaly detection using Microsoft Machine Learning for Spark (MMLSpark)


In [None]:
from pyspark.sql.functions import col
from mmlspark.cognitive import SimpleDetectAnomalies
from mmlspark.core.spark import FluentAPI

# Placeholder on purpose: never commit a real subscription key to a notebook.
# Supply your own key at run time, ideally fetched from a secret store
# (for example Azure Key Vault) rather than typed into the cell.
anomaly_detector_key = "<Azure Anomaly Detector Key>"

# Configure the anomaly detector. The URL below is also a placeholder: replace
# "<Azure Anomaly Detector End Point>" with your resource's endpoint.
anomaly_detector = (SimpleDetectAnomalies()
                            .setSubscriptionKey(anomaly_detector_key)
                            .setUrl("<Azure Anomaly Detector End Point>anomalydetector/v1.0/timeseries/entire/detect")
                            .setOutputCol("anomalies")      # detector results land in the 'anomalies' column
                            .setGroupbyCol("grouping")      # each distinct 'grouping' value is a separate series
                            .setSensitivity(95)
                            .setGranularity("secondly"))

# Prepare the RPM signals for the detector and run it.
# NOTE(review): the detector is left with its default input column names, so the
# 'timestamp' / 'value' columns created here are presumably what it reads - confirm
# against the SimpleDetectAnomalies documentation.
# NOTE(review): `mlTransform` is not a built-in DataFrame method; it is expected to be
# provided by the `from mmlspark.core.spark import FluentAPI` import in this cell.
df_anomaly = (df_IoTSignals
                    .where(col("unitSymbol") == 'RPM')                        # RPM readings only
                    .withColumnRenamed("dateTime", "timestamp")
                    .withColumn("value", col("measureValue").cast("double"))  # numeric copy of the measurement
                    .withColumn("grouping", col("deviceId"))                  # one series per device
                    .mlTransform(anomaly_detector))

# Register the result as a temp view so it can be queried with Spark SQL.
df_anomaly.createOrReplaceTempView('df_anomaly')

In [None]:
# Show the anomaly-detection result DataFrame in the notebook output.
display(df_anomaly)

### 4. Format the dataframe for visualization


In [None]:
# Query over the 'df_anomaly' temp view: for device 'dev-1' before 2020-12-29,
# take the first 200 rows by timestamp and flatten the 'anomalies' struct into
# the expected value, its upper/lower bounds and a 0/1 anomaly flag.
anomaly_window_sql = "select timestamp \
                                            , measureValue \
                                            , anomalies.expectedValue \
                                            , anomalies.expectedValue + anomalies.upperMargin as expectedUpperValue \
                                            , anomalies.expectedValue - anomalies.lowerMargin as expectedLowerValue \
                                            , case when anomalies.isAnomaly=true then 1 else 0 end as isAnomaly \
                                        from df_anomaly \
                                        where deviceid = 'dev-1' and timestamp < '2020-12-29'\
                                        order by timestamp \
                                        limit 200"

df_anomaly_single_device = spark.sql(anomaly_window_sql)

# Show the flattened result in the notebook output.
display(df_anomaly_single_device)

### 5. Visualize the anomalies using matplotlib
* Plot Expected value, Upper Value, Lower Value and Actual Value along with Anomaly flag


In [None]:
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import plot
import matplotlib.pyplot as plt
from pyspark.sql.functions import col
from matplotlib.pyplot import figure
 
# Collect the single-device result to the driver once.
adf = df_anomaly_single_device.toPandas()
# Pick the flagged rows from the frame already collected instead of issuing a
# second Spark `where(...).toPandas()` for data that is already in memory.
adf_subset = adf[adf['isAnomaly'] == 1].reset_index(drop=True)

plt.figure(figsize=(23,8))
# Plot order matters: it must match the label order passed to plt.legend below.
plt.plot(adf['timestamp'], adf['expectedUpperValue'], color='darkred', linestyle='solid', linewidth=0.25)
plt.plot(adf['timestamp'], adf['expectedValue'], color='darkgreen', linestyle='solid', linewidth=2)
# No 'b' fmt string here: combined with color= it defines the colour twice and
# matplotlib warns; the keyword colour is the one that is used either way.
plt.plot(adf['timestamp'], adf['measureValue'], color='royalblue', linestyle='dotted', linewidth=2)
plt.plot(adf['timestamp'], adf['expectedLowerValue'], color='black', linestyle='solid', linewidth=0.25)
# Anomalous readings are overlaid as red circle markers.
plt.plot(adf_subset['timestamp'], adf_subset['measureValue'], 'ro')
plt.legend(['RPM-UpperMargin', 'RPM-ExpectedValue', 'RPM-ActualValue', 'RPM-LowerMargin', 'RPM-Anomaly'])
plt.title('RPM Anomalies with Expected, Actual, Upper and Lower Values')
plt.show()