In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql import functions as F

In [2]:
import os
pyspark_submit_args = '--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.0 pyspark-shell'
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("h6")\
    .config("spark.mongodb.input.uri", "mongodb://54.186.55.238/mydb4.air")\
    .getOrCreate()

In [4]:
df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()

In [5]:

df2 = df.withColumn("aqi_co", df["aqi_co"].cast(DoubleType()))\
        .withColumn("aqi_no2", df["aqi_no2"].cast(DoubleType()))\
        .withColumn("aqi_o3", df["aqi_o3"].cast(DoubleType()))\
        .withColumn("aqi_so2", df["aqi_so2"].cast(DoubleType()))\
        .withColumn("aqi_pm10", df["aqi_pm10"].cast(DoubleType()))\
        .withColumn("arithmetic_mean_co", df["arithmetic_mean_co"].cast(DoubleType()))\
        .withColumn("arithmetic_mean_no2", df["arithmetic_mean_no2"].cast(DoubleType()))\
        .withColumn("arithmetic_mean_o3", df["arithmetic_mean_o3"].cast(DoubleType()))\
        .withColumn("max_aqi", df["max_aqi"].cast(DoubleType()))\
        .withColumn("state_code", df["state_code"].cast(StringType()))\
        .withColumn("aqi_pm25_frm", df["aqi_pm25_frm"].cast(DoubleType()))\
        .withColumn("aqi_pm25_nonfrm", df["aqi_pm25_nonfrm"].cast(DoubleType()))\
        .withColumn("aqi_pm25_speciation", df["aqi_pm25_speciation"].cast(DoubleType()))\
        .withColumn("arithmetic_mean_pressure", df["arithmetic_mean_pressure"].cast(DoubleType()))\
        .withColumn("arithmetic_mean_wind", df["arithmetic_mean_wind"].cast(DoubleType()))\
        .withColumn("arithmetic_mean_so2", df["arithmetic_mean_so2"].cast(DoubleType()))\
        .withColumn("arithmetic_mean_pm10", df["arithmetic_mean_pm10"].cast(DoubleType()))\
        .withColumn("arithmetic_mean_pm25_frm", df["arithmetic_mean_pm25_frm"].cast(DoubleType()))\
        .withColumn("arithmetic_mean_pm25_nonfrm", df["arithmetic_mean_pm25_nonfrm"].cast(DoubleType()))\
        .withColumn("arithmetic_mean_pm25_speciation", df["arithmetic_mean_pm25_speciation"].cast(DoubleType()))\
        .withColumn("arithmetic_mean_temperature", df["arithmetic_mean_temperature"].cast(DoubleType()))\
        .withColumn("date_local", df["date_local"].cast(StringType()))
        

In [6]:
from pyspark.storagelevel import StorageLevel

common_key1 = ['county_code','city_name','county_name','date_local']


In [7]:
df2 = df2.fillna(0)

In [29]:
df2.columns

['_id',
 'aqi_co',
 'aqi_no2',
 'aqi_o3',
 'aqi_pm10',
 'aqi_pm25_frm',
 'aqi_pm25_nonfrm',
 'aqi_pm25_speciation',
 'aqi_so2',
 'arithmetic_mean_co',
 'arithmetic_mean_no2',
 'arithmetic_mean_o3',
 'arithmetic_mean_pm10',
 'arithmetic_mean_pm25_frm',
 'arithmetic_mean_pm25_nonfrm',
 'arithmetic_mean_pm25_speciation',
 'arithmetic_mean_pressure',
 'arithmetic_mean_so2',
 'arithmetic_mean_temperature',
 'arithmetic_mean_wind',
 'cbsa_name',
 'city_name',
 'county_code',
 'county_name',
 'date_local',
 'latitude',
 'longitude',
 'max_aqi',
 'site_num',
 'state_code']

In [10]:
from pyspark.sql import functions as F


df3 = df2.groupBy(common_key1)\
   .agg(F.max("aqi_co").alias("aqi_co"),
        F.max("aqi_no2").alias("aqi_no2"),
        F.max("aqi_o3").alias("aqi_o3"),
        F.max("aqi_so2").alias("aqi_so2"),
        F.max("aqi_pm10").alias("aqi_pm10"),
        F.max("arithmetic_mean_co").alias("arithmetic_mean_co"),
        F.max("arithmetic_mean_no2").alias("arithmetic_mean_no2"),
        F.max("arithmetic_mean_o3").alias("arithmetic_mean_o3"),
        F.max("max_aqi").alias("max_aqi"),
        F.max("aqi_pm25_frm").alias("aqi_pm25_frm"),
        F.max("aqi_pm25_nonfrm").alias("aqi_pm25_nonfrm"),
        F.max("aqi_pm25_speciation").alias("aqi_pm25_speciation"),
        F.max("arithmetic_mean_pressure").alias("arithmetic_mean_pressure"),
        F.max("arithmetic_mean_wind").alias("arithmetic_mean_wind"),
        F.max("arithmetic_mean_so2").alias("arithmetic_mean_so2"),
        F.max("arithmetic_mean_pm10").alias("arithmetic_mean_pm10"),
        F.max("arithmetic_mean_pm25_frm").alias("arithmetic_mean_pm25_frm"),
        F.max("arithmetic_mean_pm25_nonfrm").alias("arithmetic_mean_pm25_nonfrm"),
        F.max("arithmetic_mean_pm25_speciation").alias("arithmetic_mean_pm25_speciation"),
        F.max("arithmetic_mean_temperature").alias("arithmetic_mean_temperature"),
        F.mean("latitude").alias("latitude"),
        F.mean("longitude").alias("longitude"))
        


In [12]:
common_key = ['county_code','city_name','county_name']

df4 = df3.select('city_name','county_name','county_code','date_local','max_aqi','latitude','longitude',
                 lag('max_aqi',2).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_2_max_aqi'),
                 lag('max_aqi',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_max_aqi'),
                 lag('aqi_co',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_aqi_co'),
                 lag('aqi_no2',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_aqi_no2'),
                 lag('aqi_o3',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_aqi_o3'),
                 lag('aqi_pm10',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_aqi_pm10'),
                 lag('aqi_pm25_frm',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_aqi_pm25_frm'),
                 lag('aqi_pm25_nonfrm',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_aqi_pm25_nonfrm'),
                 lag('aqi_pm25_speciation',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_aqi_pm25_speciation'),
                 lag('aqi_so2',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_aqi_so2'),
                 lag('arithmetic_mean_co',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_arithmetic_mean_co'),
                 lag('arithmetic_mean_no2',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_arithmetic_mean_no2'),
                 lag('arithmetic_mean_o3',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_arithmetic_mean_o3'),
                 lag('arithmetic_mean_pm10',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_arithmetic_mean_pm10'),
                 lag('arithmetic_mean_pm25_frm',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_arithmetic_mean_pm25_frm'),
                 lag('arithmetic_mean_pm25_nonfrm',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_arithmetic_mean_pm25_nonfrm'),
                 lag('arithmetic_mean_pm25_speciation',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_arithmetic_mean_pm25_speciation'),
                 lag('arithmetic_mean_pressure',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_arithmetic_mean_pressure'),
                 lag('arithmetic_mean_so2',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_arithmetic_mean_so2'),
                 lag('arithmetic_mean_temperature',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_arithmetic_mean_temperature'),
                 lag('arithmetic_mean_wind',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('lag_1_arithmetic_mean_wind'),
                 lead('max_aqi',1).over(Window.partitionBy(common_key).orderBy('date_local')).alias('label')
                ).persist(StorageLevel.DISK_ONLY)

In [13]:
#This should be the table we can model on
df5 = df4.withColumn("trend1", col("max_aqi")/col("lag_1_max_aqi"))\
         .withColumn("trend2",col("max_aqi")/col("lag_2_max_aqi"))

In [14]:
df5.columns

['city_name',
 'county_name',
 'county_code',
 'date_local',
 'max_aqi',
 'latitude',
 'longitude',
 'lag_2_max_aqi',
 'lag_1_max_aqi',
 'lag_1_aqi_co',
 'lag_1_aqi_no2',
 'lag_1_aqi_o3',
 'lag_1_aqi_pm10',
 'lag_1_aqi_pm25_frm',
 'lag_1_aqi_pm25_nonfrm',
 'lag_1_aqi_pm25_speciation',
 'lag_1_aqi_so2',
 'lag_1_arithmetic_mean_co',
 'lag_1_arithmetic_mean_no2',
 'lag_1_arithmetic_mean_o3',
 'lag_1_arithmetic_mean_pm10',
 'lag_1_arithmetic_mean_pm25_frm',
 'lag_1_arithmetic_mean_pm25_nonfrm',
 'lag_1_arithmetic_mean_pm25_speciation',
 'lag_1_arithmetic_mean_pressure',
 'lag_1_arithmetic_mean_so2',
 'lag_1_arithmetic_mean_temperature',
 'lag_1_arithmetic_mean_wind',
 'label',
 'trend1',
 'trend2']

In [46]:
df5.show(2)

+-----------+-----------+-----------+----------+-------+------------------+-----------+-------------+-------------+------------+-------------+------------+--------------+------------------+---------------------+-------------------------+-------------+------------------------+-------------------------+------------------------+--------------------------+------------------------------+---------------------------------+-------------------------------------+------------------------------+-------------------------+---------------------------------+--------------------------+-----+------+------+
|  city_name|county_name|county_code|date_local|max_aqi|          latitude|  longitude|lag_2_max_aqi|lag_1_max_aqi|lag_1_aqi_co|lag_1_aqi_no2|lag_1_aqi_o3|lag_1_aqi_pm10|lag_1_aqi_pm25_frm|lag_1_aqi_pm25_nonfrm|lag_1_aqi_pm25_speciation|lag_1_aqi_so2|lag_1_arithmetic_mean_co|lag_1_arithmetic_mean_no2|lag_1_arithmetic_mean_o3|lag_1_arithmetic_mean_pm10|lag_1_arithmetic_mean_pm25_frm|lag_1_arithmetic_me

In [15]:
df5 = df5.fillna(0)

In [16]:
feat_cols = ['latitude',
 'longitude',
 'lag_2_max_aqi',
 'lag_1_max_aqi',
 'lag_1_aqi_co',
 'lag_1_aqi_no2',
 'lag_1_aqi_o3',
 'lag_1_aqi_pm10',
 'lag_1_aqi_pm25_frm',
 'lag_1_aqi_pm25_nonfrm',
 'lag_1_aqi_pm25_speciation',
 'lag_1_aqi_so2',
 'lag_1_arithmetic_mean_co',
 'lag_1_arithmetic_mean_no2',
 'lag_1_arithmetic_mean_o3',
 'lag_1_arithmetic_mean_pm10',
 'lag_1_arithmetic_mean_pm25_frm',
 'lag_1_arithmetic_mean_pm25_nonfrm',
 'lag_1_arithmetic_mean_pm25_speciation',
 'lag_1_arithmetic_mean_pressure',
 'lag_1_arithmetic_mean_so2',
 'lag_1_arithmetic_mean_temperature',
 'lag_1_arithmetic_mean_wind',
 'trend1',
 'trend2']

In [17]:
# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler(outputCol="features", 
                     inputCols= feat_cols) #except the last col.
points = va.transform(df5).select("features", "label")

In [18]:
points.show(2)

KeyboardInterrupt: 

In [19]:

from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorIndexer


(trainingData, testData) = points.randomSplit([0.7, 0.3])

# Train a RandomForest model.
rf = RandomForestRegressor(featuresCol='features')

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[rf])

# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

+------------------+-----+--------------------+
|        prediction|label|            features|
+------------------+-----+--------------------+
| 6.850155673906531|  0.0|(25,[0,1],[20.869...|
| 6.850155673906531| 33.0|(25,[0,1],[29.566...|
| 6.850155673906531| 25.0|(25,[0,1],[39.044...|
|49.060331050213144| 90.0|(25,[0,1,2,3,4,6,...|
|42.616309976261604| 49.0|(25,[0,1,2,3,4,6,...|
+------------------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 16.9736


In [20]:
rfModel = model.stages
rfModel

[RandomForestRegressionModel (uid=RandomForestRegressor_e278a4050029) with 20 trees]

## gradient boosted regressor

In [21]:
from pyspark.ml.regression import GBTRegressor

featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(points)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = points.randomSplit([0.7, 0.3])

# Train a GBT model.
gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=10)

# Chain indexer and GBT in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, gbt])

# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

+------------------+-----+--------------------+
|        prediction|label|            features|
+------------------+-----+--------------------+
| 8.674398254893196| 25.0|(25,[0,1],[39.044...|
|45.076324372669056| 61.0|(25,[0,1,2,3,4,6,...|
| 50.96604260026299| 90.0|(25,[0,1,2,3,4,6,...|
| 45.06403905726718| 49.0|(25,[0,1,2,3,4,6,...|
| 46.32391912756721| 48.0|(25,[0,1,2,3,4,6,...|
+------------------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 16.7004


In [None]:
gbtModel = model.stages
print(gbtModel)  # summary only

## linear regression

In [27]:
%%time

from pyspark.ml.regression import GeneralizedLinearRegression
(trainingData, testData) = points.randomSplit([0.7, 0.3])


glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3)

# Fit the model
model = glr.fit(trainingData)

predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Summarize the model over the training set and print out some metrics
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)

+------------------+-----+--------------------+
|        prediction|label|            features|
+------------------+-----+--------------------+
| 48.44707478286031| 43.0|(25,[0,1,2,3,4,6,...|
|34.344694824779104| 61.0|(25,[0,1,2,3,4,6,...|
| 42.97795781771807| 49.0|(25,[0,1,2,3,4,6,...|
| 33.82344034699529| 32.0|(25,[0,1,2,3,4,6,...|
| 48.56787529106251| 56.0|(25,[0,1,2,3,4,6,...|
+------------------+-----+--------------------+
only showing top 5 rows

CPU times: user 29.2 ms, sys: 9.51 ms, total: 38.7 ms
Wall time: 9.86 s


In [28]:
print(rmse)

17.282204923815453


In [23]:
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("T Values: " + str(summary.tValues))
print("P Values: " + str(summary.pValues))
print("Dispersion: " + str(summary.dispersion))
print("Null Deviance: " + str(summary.nullDeviance))
print("Residual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))
print("Deviance: " + str(summary.deviance))
print("Residual Degree Of Freedom: " + str(summary.residualDegreeOfFreedom))
print("AIC: " + str(summary.aic))
print("Deviance Residuals: ")
summary.residuals().show()

Coefficient Standard Errors: [0.004568040400879666, 0.0014871929388381047, 0.001484891172149681, 0.0022384439061993804, 0.04282426650590862, 0.00863096371490406, 0.0032719989628260484, 0.008537108154305889, 0.003716867170041703, 0.0028065298044307064, inf, 0.0047288031997156435, 0.6837661971612188, 0.015985919171721034, 3.833739638776442, 0.007316721161005434, 0.012281027870125145, 0.007462366203326255, 0.0003275302824165198, 3.0801268015344454e-05, 0.016345494311850447, 0.0011032249410030373, 0.000623303551940379, 0.014775824549233072, 0.012147544212385166, 0.22332216788275847]
T Values: [-22.52880331282746, -52.814844270809566, 130.51278177163414, 117.06806269887257, -1.1192841377833957, 5.2446007440106035, 26.196904923327278, -19.4493667648788, 18.97521813940085, 13.831348580910534, 0.0, -1.5828533696215537, -0.17227555212354245, 7.6663912693500675, 48.72856351650565, 10.408182995441974, 4.3504721157136155, 31.445598119174655, 5.664590329165979, 1.2591599335277721, 7.640363964356712

In [26]:
summary.Rmse

AttributeError: 'GeneralizedLinearRegressionTrainingSummary' object has no attribute 'Rmse'