In [0]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import *

In [0]:
df = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/snpaya2@illinois.edu/SofiaSensorDataCleaned-3.csv", inferSchema=True)

In [0]:
df = df.withColumnRenamed("PM2.5","PM2_5")

In [0]:
df.dtypes

Out[4]: [('sensor_id', 'int'),
 ('date', 'timestamp'),
 ('location', 'double'),
 ('lat', 'double'),
 ('lon', 'double'),
 ('PM10', 'double'),
 ('PM2_5', 'double'),
 ('district', 'string'),
 ('district_group', 'string'),
 ('month_name', 'string'),
 ('month', 'int')]

In [0]:
# one hot encoding categorical variables
district_indexer = StringIndexer(inputCol="district", outputCol="districtIndex")
district_group_indexer = StringIndexer(inputCol="district_group", outputCol="district_groupIndex")
month_name_indexer = StringIndexer(inputCol="month_name", outputCol="month_nameIndex")

ohe_d_m_encoder = OneHotEncoder(inputCols=["districtIndex", "month_nameIndex"], outputCols=["district_ohe", "month_name_ohe"])
ohe_dg_m_encoder = OneHotEncoder(inputCols=["district_groupIndex", "month_nameIndex"], outputCols=["district_group_ohe", "month_name_ohe"])

In [0]:
model1_vec = VectorAssembler(inputCols=["district_ohe", "month_name_ohe"], outputCol='features_m1')
model2_vec = VectorAssembler(inputCols=["district_group_ohe", "month_name_ohe"], outputCol='features_m2')

In [0]:
# create pipeline
pipeline_m1 = Pipeline(stages=[district_indexer, month_name_indexer, ohe_d_m_encoder, model1_vec])
pipeline_m2 = Pipeline(stages=[district_group_indexer, month_name_indexer, ohe_dg_m_encoder, model2_vec])

df_model1 = pipeline_m1.fit(df).transform(df)
df_model2 = pipeline_m2.fit(df).transform(df)

In [0]:
lr = LinearRegression(labelCol="PM2_5", maxIter=10, regParam=0, fitIntercept=True)
lr_model1 = lr.setFeaturesCol('features_m1').fit(df_model1)
lr_model2 = lr.setFeaturesCol('features_m2').fit(df_model2)

In [0]:
# model 1
m1Summary = lr_model1.summary
print("Model 1 Summary: Regressing PM2.5 on district and month")
print(f"RMSE: {m1Summary.rootMeanSquaredError}")
print(f"R2: {m1Summary.r2}")
print(f"adjR2: {m1Summary.r2adj}")
print(f"Explained Variance: {m1Summary.explainedVariance}")

Model 1 Summary: Regressing PM2.5 on district and month
RMSE: 9.165721623087935
R2: 0.28617481200290884
adjR2: 0.2859707365001851
Explained Variance: 33.680060554257935


In [0]:
df_output1 = lr_model1.transform(df_model1)
params1 = ["intercept"] + [df_output1.select('features_m1').schema[0].metadata.get('ml_attr').get('attrs').get('binary')[i]['name'] for i in range(0,len(lr_model1.coefficients))]

coeffs1 = [lr_model1.intercept] + lr_model1.coefficients.tolist()

m1_output =  [(params1[i], float(coeffs1[i]), float(m1Summary.coefficientStandardErrors[i]), float(m1Summary.pValues[i])) for i in range(0,len(params1))]

In [0]:

m1_output_columns = ["Param","Coefficient", "StandardError","pValue"]
m1_output_df = spark.createDataFrame(data=m1_output, schema=m1_output_columns)

# renaming params
m1_output_df.withColumn("Param", regexp_replace("Param", "district_ohe_", "")).withColumn("Param", regexp_replace("Param", "month_name_ohe_", "")).show(len(params1), False)

+--------------+--------------------+-------------------+----------------------+
|Param         |Coefficient         |StandardError      |pValue                |
+--------------+--------------------+-------------------+----------------------+
|intercept     |11.658967856854964  |0.5108094991036852 |1.5543122344752192E-15|
|Vitosha       |-4.073032415730128  |0.511845092118379  |0.0                   |
|Mladost       |-4.689882603761342  |0.5155637460136282 |7.828295761811432E-4  |
|Ovcha kupel   |-1.7317192849538008 |0.5158196905986334 |7.903446856944996E-4  |
|Slatina       |-1.7312164410091082 |0.5167525466623764 |2.3989699116100383E-12|
|Triaditsa     |-3.6223491729084643 |0.517276166425713  |2.151379074888382E-7  |
|Lozenets      |-2.682631126063915  |0.5175622910638226 |1.1790957543666991E-8 |
|Krasno selo   |-2.9517278762471895 |0.5194150617216817 |0.0                   |
|Pancharevo    |-5.379409066842893  |0.5234376883452552 |0.1923398149370985    |
|Poduyane      |-0.682403680

In [0]:
# model 2
m2Summary = lr_model2.summary
print("Model 2 Summary: Regressing PM2.5 on district group and month")
print(f"RMSE: {m2Summary.rootMeanSquaredError}")
print(f"R2: {m2Summary.r2}")
print(f"adjR2: {m2Summary.r2adj}")
print(f"Explained Variance: {m2Summary.explainedVariance}")

Model 2 Summary: Regressing PM2.5 on district group and month
RMSE: 9.24635733352108
R2: 0.2735597674764172
adjR2: 0.2734653812776606
Explained Variance: 32.19538948703411


In [0]:
df_output2 = lr_model2.transform(df_model2)
params2 = ["intercept"] + [df_output2.select('features_m2').schema[0].metadata.get('ml_attr').get('attrs').get('binary')[i]['name'] for i in range(0,len(lr_model2.coefficients))]

coeffs2 = [lr_model2.intercept] + lr_model2.coefficients.tolist()

m2_output =  [(params2[i], float(coeffs2[i]),  float(m2Summary.coefficientStandardErrors[i]), float(m2Summary.pValues[i])) for i in range(0,len(params2))]


In [0]:
m2_output_columns = ["Param","Coefficient", "StandardError","pValue"]
m2_output_df = spark.createDataFrame(data=m2_output, schema=m2_output_columns)

# renaming params
m2_output_df.withColumn("Param", regexp_replace("Param", "district_group_ohe_", "")).withColumn("Param", regexp_replace("Param", "month_name_ohe_", "")).show(len(params2), False)

+-----------------------------------+--------------------+-------------------+----------------------+
|Param                              |Coefficient         |StandardError      |pValue                |
+-----------------------------------+--------------------+-------------------+----------------------+
|intercept                          |11.67389558038734   |0.5117507826081774 |2.1750493931937953E-5 |
|Early to Mid 20th Century Districts|-2.173044375364615  |0.5124146767280519 |2.4221513683642115E-11|
|Peripheral Districts               |-3.42224596490056   |0.5131375011239615 |5.073719222536965E-13 |
|Socialist Housing Districts        |-3.7069863526575246 |0.519059257739574  |0.20571955445363455   |
|City Center                        |-0.6568334841716403 |0.13298112260284253|0.0                   |
|May                                |-1.6101059974055671 |0.1331735325790791 |0.0                   |
|March                              |3.4719439055065773  |0.1339134256222619 |0.0 