In [0]:
df = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/snpaya2@illinois.edu/SofiaSensorDataCleaned-1.csv", inferSchema =True
)

In [0]:
df.show(10)

+---------+-------------------+--------+------------------+------------------+------------------+------------------+--------+
|sensor_id|               date|location|               lat|               lon|              PM10|             PM2.5|district|
+---------+-------------------+--------+------------------+------------------+------------------+------------------+--------+
|      739|2018-09-21 00:00:00|   354.0| 42.69400000000015|23.336999999999993| 8.009814814814815| 4.999135802469137| Sredets|
|      739|2018-09-22 00:00:00|   354.0| 42.69399999999973|23.336999999999946|15.673511187607554| 10.29199655765921| Sredets|
|      739|2018-09-23 00:00:00|   354.0| 42.69399999999973| 23.33699999999995|26.578480138169255| 18.87569948186528| Sredets|
|      739|2018-09-24 00:00:00|   354.0| 42.69399999999974| 23.33699999999996| 18.99480496453898|14.145673758865236| Sredets|
|      739|2018-09-25 00:00:00|   354.0|42.693999999999996| 23.33700000000015| 6.394999999999999| 5.132835820895523| S

In [0]:
df.dtypes

Out[110]: [('sensor_id', 'int'),
 ('date', 'timestamp'),
 ('location', 'double'),
 ('lat', 'double'),
 ('lon', 'double'),
 ('PM10', 'double'),
 ('PM2.5', 'double'),
 ('district', 'string')]

In [0]:
df.filter(df.district=='Sredets').show(10)

+---------+-------------------+--------+------------------+------------------+------------------+------------------+--------+
|sensor_id|               date|location|               lat|               lon|              PM10|             PM2.5|district|
+---------+-------------------+--------+------------------+------------------+------------------+------------------+--------+
|      739|2018-09-21 00:00:00|   354.0| 42.69400000000015|23.336999999999993| 8.009814814814815| 4.999135802469137| Sredets|
|      739|2018-09-22 00:00:00|   354.0| 42.69399999999973|23.336999999999946|15.673511187607554| 10.29199655765921| Sredets|
|      739|2018-09-23 00:00:00|   354.0| 42.69399999999973| 23.33699999999995|26.578480138169255| 18.87569948186528| Sredets|
|      739|2018-09-24 00:00:00|   354.0| 42.69399999999974| 23.33699999999996| 18.99480496453898|14.145673758865236| Sredets|
|      739|2018-09-25 00:00:00|   354.0|42.693999999999996| 23.33700000000015| 6.394999999999999| 5.132835820895523| S

In [0]:
# number of districts
df.select("district").distinct().count()

Out[61]: 23

In [0]:
df.groupBy("district").count().show()

+--------------+-----+
|      district|count|
+--------------+-----+
|       Ilinden| 1040|
|      Vrabnits| 2732|
|      Nadezhda| 3056|
|      Lozenets| 6990|
|        Izgrev| 3329|
|      Poduyane| 4554|
|       Serdika| 3699|
|     Oborishte| 3919|
|   Vazrazhdane| 2822|
|Krasna Polyana| 4356|
|    Studentski| 3570|
|       Slatina| 7997|
|    Pancharevo| 5918|
|  Kremikovetsi|  329|
|       Vitosha|15666|
|   Ovcha kupel| 8197|
|     Triaditsa| 7331|
|       Mladost|13087|
|         Iskar| 2976|
|   Krasno selo| 6833|
+--------------+-----+
only showing top 20 rows



In [0]:
df = df.withColumnRenamed("PM2.5","PM2_5")

In [0]:
df.select('PM2_5').describe().show()

+-------+------------------+
|summary|             PM2_5|
+-------+------------------+
|  count|            115463|
|   mean|13.525177464970868|
| stddev|10.848572842869846|
|    min|               0.0|
|    max| 9.999964973730293|
+-------+------------------+



In [0]:
df.columns

Out[66]: ['sensor_id', 'date', 'location', 'lat', 'lon', 'PM10', 'PM2.5', 'district']

In [0]:
df_anova = df.select('district', 'PM2_5')
df_anova.show()

+--------+------------------+
|district|             PM2_5|
+--------+------------------+
| Sredets| 4.999135802469137|
| Sredets| 10.29199655765921|
| Sredets| 18.87569948186528|
| Sredets|14.145673758865236|
| Sredets| 5.132835820895523|
| Sredets|  6.62247191011236|
| Sredets| 7.291923743500864|
| Sredets|7.4012000000000056|
| Sredets|13.440120689655172|
| Sredets|13.577482394366209|
| Sredets| 5.390343053173242|
| Sredets|17.248017241379312|
| Sredets|10.798987993138935|
| Sredets|7.4098757763975165|
| Sredets| 9.270433275563246|
| Sredets|10.183630017452005|
| Sredets|14.663039568345324|
| Sredets|  15.6055304347826|
| Sredets|11.464974182444067|
| Sredets|17.359964664310976|
+--------+------------------+
only showing top 20 rows



In [0]:
# citation: https://gist.github.com/srnghn/4b10aa0c623a4b246509ffb2e8ac3bb2 (reference 9)
# where below code is gathered from 

from pyspark.sql.functions import *

# Implementation of ANOVA function: calculates the degrees of freedom, F-value, eta squared and omega squared values.
# Expects that 'categoryData' with two columns, the first being the categorical independent variable and the second being the scale dependent variable

def getAnovaStats(categoryData): 
    cat_val = categoryData.toDF("cat","value")
    cat_val.createOrReplaceTempView("df")
    newdf = spark.sql("select A.cat, A.value, cast((A.value * A.value) as double) as valueSq, ((A.value - B.avg) * (A.value - B.avg)) as diffSq from df A join (select cat, avg(value) as avg from df group by cat) B where A.cat = B.cat")
    grouped = newdf.groupBy("cat")
    sums = grouped.sum("value")
    counts = grouped.count()
    numCats = counts.count() 
    sumsq = grouped.sum("valueSq") 
    avgs = grouped.avg("value")
    
    totN = counts.selectExpr("sum(count) as total").rdd.map(lambda x: x.total).collect()[0]
    totSum = sums.selectExpr("sum(`sum(value)`) as totSum").rdd.map(lambda x: x.totSum).collect()[0]
    totSumSq = sumsq.selectExpr("sum(`sum(valueSq)`) as totSumSq").rdd.map(lambda x: x.totSumSq).collect()[0]

    totMean = totSum / totN

    dft = totN - 1
    dfb = numCats - 1
    dfw = totN - numCats
    
    joined = counts.selectExpr("cat as category", "count").join(sums, col("category") == sums.cat, 'inner')\
        .drop(sums.cat)\
        .join(sumsq, col("category") == sumsq.cat, 'inner')\
        .drop(sumsq.cat)\
        .join(avgs, col("category") == avgs.cat, 'inner')\
        .drop(avgs.cat)
    finaldf = joined.withColumn("totMean", lit(totMean))

    ssb_tmp = finaldf.rdd.map(lambda x: (x[0], ((x[4] - x[5])*(x[4] - x[5]))*x[1]))
    ssb = ssb_tmp.toDF().selectExpr("sum(_2) as total").rdd.map(lambda x: x.total).collect()[0]

    ssw_tmp = grouped.sum("diffSq")
    ssw = ssw_tmp.selectExpr("sum(`sum(diffSq)`) as total").rdd.map(lambda x: x.total).collect()[0]
    
    sst = ssb + ssw

    msb = ssb / dfb
    msw = ssw / dfw
    F = msb / msw

    etaSq = ssb / sst
    omegaSq = (ssb - ((numCats - 1) * msw))/(sst + msw)
    return (dfb, dfw, F, etaSq, omegaSq)

In [0]:
# anova for testing if differences between districts
getAnovaStats(df_anova) # significant, p-value = 0, there is difference between groups

Out[122]: (22, 115440, 92.74618496686806, 0.017368137920689586, 0.017180726422985654)

In [0]:
getAnovaStats(df.select('district', 'PM10')) # anova of PM10, just to see

Out[8]: (22, 115440, 61.083770062285886, 0.01150709642296107, 0.011318617280102835)

In [0]:
%sql
select * from default.sofiasensordatacleaned_csv;

sensor_id,date,location,lat,lon,PM10,PM2.5,district
739,2018-09-21T00:00:00.000+0000,354.0,42.694,23.337,8.009815,4.999136,Sredets
739,2018-09-22T00:00:00.000+0000,354.0,42.694,23.337,15.6735115,10.291997,Sredets
739,2018-09-23T00:00:00.000+0000,354.0,42.694,23.337,26.57848,18.8757,Sredets
739,2018-09-24T00:00:00.000+0000,354.0,42.694,23.337,18.994804,14.145674,Sredets
739,2018-09-25T00:00:00.000+0000,354.0,42.694,23.337,6.395,5.132836,Sredets
739,2018-09-26T00:00:00.000+0000,354.0,42.694,23.337,10.525618,6.622472,Sredets
739,2018-09-27T00:00:00.000+0000,354.0,42.694,23.337,11.863449,7.2919235,Sredets
739,2018-09-28T00:00:00.000+0000,354.0,42.694,23.337,12.343096,7.4012,Sredets
739,2018-09-29T00:00:00.000+0000,354.0,42.694,23.337,19.102104,13.440121,Sredets
739,2018-09-30T00:00:00.000+0000,354.0,42.694,23.337,17.525475,13.577482,Sredets
