#### Creating Overall Stats per Monitor

**Import Libraries and Data**

In [0]:
from pyspark.sql.functions import col, count, mean, min, max, stddev, round, percentile

# Silver-layer inputs, read through the notebook's ambient `spark` session.
# Per-reading H2S data:
silverConditions = spark.table("gasmonitordata.silver.monitorh2sandconditions")
# Per-monitor metadata (supplies groupNum for the stats table):
silverInfo = spark.table("gasmonitordata.silver.monitorinformation")

**Build the Per-Monitor H2S Summary Statistics Table**

In [0]:
# Attach each monitor's metadata row to its readings. No `how` argument is
# given, so this is Spark's default inner join on monitorNum.
readingsWithInfo = silverConditions.alias('c').join(
    silverInfo.alias('i'),
    silverConditions.monitorNum == silverInfo.monitorNum,
)

# Every aggregate below summarises this one column.
h2sReading = col('c.h2sReading')

# NOTE: count/mean/min/max/round/stddev/percentile here are the
# pyspark.sql.functions column expressions imported at the top of the
# notebook, not the Python builtins of the same name.
overallStats = (
    readingsWithInfo
    .groupBy('c.monitorNum', 'i.groupNum')
    .agg(
        count(h2sReading).alias('numReadings'),  # count(col) skips NULL readings
        round(mean(h2sReading), 2).alias('meanH2S'),
        round(min(h2sReading), 2).alias('minH2S'),
        round(percentile(h2sReading, 0.25), 2).alias('p25_H2S'),
        round(percentile(h2sReading, 0.50), 2).alias('p50_H2S'),  # median
        round(percentile(h2sReading, 0.75), 2).alias('p75_H2S'),
        round(max(h2sReading), 2).alias('maxH2S'),
        round(stddev(h2sReading), 2).alias('stddevH2S'),
    )
    .orderBy('c.monitorNum', 'i.groupNum')
    # Fix the output column order: keys first, then the statistics from
    # smallest to largest, with the spread last.
    .select(
        'c.monitorNum',
        'i.groupNum',
        'numReadings',
        'meanH2S',
        'minH2S',
        'p25_H2S',
        'p50_H2S',
        'p75_H2S',
        'maxH2S',
        'stddevH2S',
    )
)

In [0]:
# Persist the summary as a gold-layer table. 'overwrite' replaces any existing
# table contents, so re-running the notebook does not append duplicate rows.
overallStats.write.saveAsTable(
    'gasmonitordata.gold.overallStats',
    mode='overwrite',
)