In [0]:
df = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/snpaya2@illinois.edu/SofiaSensorDataCleaned-3.csv", inferSchema=True)

In [0]:
df = df.withColumnRenamed("PM2.5","PM2_5")

In [0]:
df.dtypes

Out[31]: [('sensor_id', 'int'),
 ('date', 'timestamp'),
 ('location', 'double'),
 ('lat', 'double'),
 ('lon', 'double'),
 ('PM10', 'double'),
 ('PM2_5', 'double'),
 ('district', 'string'),
 ('district_group', 'string'),
 ('month_name', 'string'),
 ('month', 'int')]

In [0]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression

In [0]:
# one hot encoding categorical variables
district_indexer = StringIndexer(inputCol="district", outputCol="districtIndex")
district_group_indexer = StringIndexer(inputCol="district_group", outputCol="district_groupIndex")
month_name_indexer = StringIndexer(inputCol="month_name", outputCol="month_nameIndex")

ohe_d_m_encoder = OneHotEncoder(inputCols=["districtIndex", "month_nameIndex"], outputCols=["district_ohe", "month_name_ohe"])
ohe_dg_m_encoder = OneHotEncoder(inputCols=["district_groupIndex", "month_nameIndex"], outputCols=["district_group_ohe", "month_name_ohe"])

In [0]:
model1_vec = VectorAssembler(inputCols=["district_ohe", "month_name_ohe"], outputCol='features_m1')
model2_vec = VectorAssembler(inputCols=["district_group_ohe", "month_name_ohe"], outputCol='features_m2')

In [0]:
# create pipeline
pipeline_m1 = Pipeline(stages=[district_indexer, month_name_indexer, ohe_d_m_encoder, model1_vec])
pipeline_m2 = Pipeline(stages=[district_group_indexer, month_name_indexer, ohe_dg_m_encoder, model2_vec])

df_model1 = pipeline_m1.fit(df).transform(df)
df_model2 = pipeline_m2.fit(df).transform(df)

In [0]:
lr = LinearRegression(labelCol="PM2_5", maxIter=10, regParam=0, fitIntercept=True)
lr_model1 = lr.setFeaturesCol('features_m1').fit(df_model1)
lr_model2 = lr.setFeaturesCol('features_m2').fit(df_model2)

In [0]:
# model 1
m1Summary = lr_model1.summary
print("Model 1 Summary: Regressing PM2.5 on district and month")
print(f"RMSE: {m1Summary.rootMeanSquaredError}")
print(f"R2: {m1Summary.r2}")
print(f"adjR2: {m1Summary.r2adj}")
print(f"Explained Variance: {m1Summary.explainedVariance}")

Model 1 Summary: Regressing PM2.5 on district and month
RMSE: 9.165721623087935
R2: 0.28617481200290884
adjR2: 0.2859707365001851
Explained Variance: 33.680060554257935


In [0]:
df_output1 = lr_model1.transform(df_model1)
params1 = ["intercept"] + [df_output1.select('features_m1').schema[0].metadata.get('ml_attr').get('attrs').get('binary')[i]['name'] for i in range(0,len(lr_model1.coefficients))]

coeffs1 = [lr_model1.intercept] + lr_model1.coefficients.tolist()

m1_output =  [(params1[i], round(float(coeffs1[i]),3),  round(float(m1Summary.coefficientStandardErrors[i]),3), round(float(m1Summary.pValues[i]),3)) for i in range(0,len(params1))]

In [0]:
m1_output_columns = ["Param","Coefficient", "StandardError","pValue"]
m1_output_df = spark.createDataFrame(data=m1_output, schema=m1_output_columns)

# renaming params
m1_output_df.withColumn("Param", regexp_replace("Param", "district_ohe_", "")).withColumn("Param", regexp_replace("Param", "month_name_ohe_", "")).show(len(params1), False)

+--------------+-----------+-------------+------+
|Param         |Coefficient|StandardError|pValue|
+--------------+-----------+-------------+------+
|intercept     |11.659     |0.511        |0.0   |
|Vitosha       |-4.073     |0.512        |0.0   |
|Mladost       |-4.69      |0.516        |0.001 |
|Ovcha kupel   |-1.732     |0.516        |0.001 |
|Slatina       |-1.731     |0.517        |0.0   |
|Triaditsa     |-3.622     |0.517        |0.0   |
|Lozenets      |-2.683     |0.518        |0.0   |
|Krasno selo   |-2.952     |0.519        |0.0   |
|Pancharevo    |-5.379     |0.523        |0.192 |
|Poduyane      |-0.682     |0.524        |0.233 |
|Krasna Polyana|-0.625     |0.526        |0.155 |
|Oborishte     |0.749      |0.528        |0.123 |
|Serdika       |-0.813     |0.528        |0.0   |
|Studentski    |-3.124     |0.529        |0.0   |
|Lyulin        |-3.058     |0.53         |0.0   |
|Izgrev        |-3.387     |0.532        |0.001 |
|Nadezhda      |-1.794     |0.533        |0.0   |


In [0]:
# model 2
m2Summary = lr_model2.summary
print("Model 2 Summary: Regressing PM2.5 on district group and month")
print(f"RMSE: {m2Summary.rootMeanSquaredError}")
print(f"R2: {m2Summary.r2}")
print(f"adjR2: {m2Summary.r2adj}")
print(f"Explained Variance: {m2Summary.explainedVariance}")

Model 2 Summary: Regressing PM2.5 on district group and month
RMSE: 9.24635733352108
R2: 0.2735597674764173
adjR2: 0.27346538127766074
Explained Variance: 32.19538948703418


In [0]:
df_output2 = lr_model2.transform(df_model2)
params2 = ["intercept"] + [df_output2.select('features_m2').schema[0].metadata.get('ml_attr').get('attrs').get('binary')[i]['name'] for i in range(0,len(lr_model2.coefficients))]

coeffs2 = [lr_model2.intercept] + lr_model2.coefficients.tolist()

m2_output =  [(params2[i], round(float(coeffs2[i]),3),  round(float(m2Summary.coefficientStandardErrors[i]),3), round(float(m2Summary.pValues[i]),3)) for i in range(0,len(params2))]

In [0]:
m2_output_columns = ["Param","Coefficient", "StandardError","pValue"]
m2_output_df = spark.createDataFrame(data=m2_output, schema=m2_output_columns)

# renaming params
m2_output_df.withColumn("Param", regexp_replace("Param", "district_group_ohe_", "")).withColumn("Param", regexp_replace("Param", "month_name_ohe_", "")).show(len(params2), False)

+-----------------------------------+-----------+-------------+------+
|Param                              |Coefficient|StandardError|pValue|
+-----------------------------------+-----------+-------------+------+
|intercept                          |11.674     |0.512        |0.0   |
|Early to Mid 20th Century Districts|-2.173     |0.512        |0.0   |
|Peripheral Districts               |-3.422     |0.513        |0.0   |
|Socialist Housing Districts        |-3.707     |0.519        |0.206 |
|City Center                        |-0.657     |0.133        |0.0   |
|May                                |-1.61      |0.133        |0.0   |
|March                              |3.472      |0.134        |0.0   |
|April                              |1.49       |0.134        |0.33  |
|June                               |-0.131     |0.137        |0.0   |
|January                            |12.693     |0.137        |0.0   |
|February                           |7.696      |0.139        |0.013 |
|Augus