In [3]:
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from sklearn.datasets import load_boston
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC") # ensure time zone isn't changed to Australian
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [4]:
yellow = pd.read_parquet('../../mast30034-project-1-janggani/data/curated/yellow/yellow_w.parquet')
green = pd.read_parquet('../../mast30034-project-1-janggani/data/curated/green/green_w.parquet')

yellow_use = yellow[["temp_f", "wind speed", "distance"]]
green_use = green[["temp_f", "wind speed", "distance"]]

In [6]:
sqlContext = SQLContext(spark)

In [7]:
yellow_data = sqlContext.createDataFrame(yellow_use)
green_data = sqlContext.createDataFrame(green_use)

In [8]:
features = ["temp_f", "wind speed"]

va = VectorAssembler(inputCols = features, outputCol='features')
va_y = va.transform(yellow_data)
va_g = va.transform(green_data)
va_y = va_y.select(['features', 'distance'])
va_g = va_g.select(['features', 'distance'])

In [9]:
train_y, test_y = va_y.randomSplit([0.9,0.1])
train_g, test_g = va_g.randomSplit([0.9,0.1])
glr=GeneralizedLinearRegression(labelCol="distance",family="poisson",maxIter=10,regParam=0.3)

In [12]:
model_y = glr.fit(train_y)
model_g = glr.fit(train_g)

print(str(model_y.summary))

Coefficients:
    Feature Estimate Std Error T Value P Value
(Intercept)   1.2249    0.0373 32.8202  0.0000
     temp_f   0.0001    0.0006  0.0958  0.9237
 wind speed  -0.0051    0.0017 -2.9232  0.0035

(Dispersion parameter for poisson family taken to be 1.0000)
    Null deviance: 589.3991 on 3951 degrees of freedom
Residual deviance: 579.2564 on 3951 degrees of freedom
AIC: 12588.0649


In [13]:
print(str(model_g.summary))

Coefficients:
    Feature Estimate Std Error T Value P Value
(Intercept)   1.5035    0.0363 41.4266  0.0000
     temp_f  -0.0048    0.0006 -8.5846  0.0000
 wind speed  -0.0044    0.0017 -2.5895  0.0096

(Dispersion parameter for poisson family taken to be 1.0000)
    Null deviance: 862.2615 on 3912 degrees of freedom
Residual deviance: 787.7896 on 3912 degrees of freedom
AIC: 12879.9138


In [14]:
tdata_y = model_y.transform(test_y)
tdata_g = model_g.transform(test_g)

In [15]:
#mean absolute error
mae = RegressionEvaluator(labelCol="distance", predictionCol="prediction", metricName="mae")
mae_g = mae.evaluate(tdata_g)
mae_y = mae.evaluate(tdata_y)

In [16]:
# mean standard error
mse = RegressionEvaluator(labelCol="distance", predictionCol="prediction", metricName="mse")
mse_g = mse.evaluate(tdata_g)
mse_y = mse.evaluate(tdata_y)

In [17]:
# r squared
r2 = RegressionEvaluator(labelCol="distance", predictionCol="prediction", metricName="r2")
r2_g = r2.evaluate(tdata_g)
r2_y = r2.evaluate(tdata_y)

In [18]:
print(mae_g, mse_g, r2_g)

0.6202482800827293 0.7779631283523001 0.09204806102665819


In [19]:
print(mae_y, mse_y, r2_y)

0.5118804458424704 0.4893811498000824 -0.005010740557852111
