Example taken from https://www.kaggle.com/fatmakursun/pyspark-ml-tutorial-for-beginners

Dataset from https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

In [None]:
!wget https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz -P data/cal_housing/

In [None]:
import os
import pandas as pd
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col

from pyspark.ml.regression import LinearRegression
from pyspark.mllib.evaluation import RegressionMetrics

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Visualization
# Have IPython display the value of every top-level expression in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Widen pandas' display limits for the number of columns and cell width.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 400)

# The default figure size (18 x 4) is set both through seaborn's rc argument
# and directly on matplotlib's rcParams.
from matplotlib import rcParams
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (18,4)})
rcParams['figure.figsize'] = 18,4

# Notebook magics: render figures inline, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
# Setting the random seed for notebook reproducibility; rnd_seed is also
# passed to the randomSplit calls further down.
rnd_seed = 23
# Call the seeding function. Assigning to np.random.seed / np.random.set_state
# would replace those functions with an int and leave the generator unseeded.
np.random.seed(rnd_seed)

In [None]:
# Schema for one line of the csv data file: nine nullable float columns,
# listed in the order in which they appear in the file.
schema = StructType([
    StructField(field_name, FloatType(), nullable=True)
    for field_name in (
        "long", "lat", "medage", "totrooms", "totbdrms",
        "pop", "houshlds", "medinc", "medhv",
    )
])

In [None]:
# Load housing data
# No visible cell defines `spark`, so obtain the session explicitly.
# getOrCreate() returns the already-active SparkSession when the kernel
# provides one and creates a new one otherwise.
spark = SparkSession.builder.getOrCreate()
housing_path = 'data/cal_housing/CaliforniaHousing/cal_housing.data'
# Cached because the DataFrame is reused by many of the following cells.
housing_df = spark.read.csv(path=housing_path, schema=schema).cache()

# Exploratory Analysis

In [None]:
# Print the first rows of the loaded DataFrame (show() with its defaults).
housing_df.show()

In [None]:
# Summary statistics, converted to a pandas DataFrame for notebook display.
housing_df.summary().toPandas()

In [None]:
# describe() statistics with every listed column rounded to 4 decimals;
# the "summary" column holds the name of each statistic.
(housing_df.describe()
    .select(
        "summary",
        *[F.round(stat_col, 4).alias(stat_col)
          for stat_col in ("medage", "totrooms", "totbdrms", "pop",
                           "houshlds", "medinc", "medhv")])
    .show())

In [None]:
# group by housingmedianage and see the distribution
result_df = housing_df.groupBy("medage").count().sort("medage", ascending=False)
result_df.show(10)

In [None]:
result_df.toPandas().plot.bar(x='medage',figsize=(14, 6))

# Data preparation

In [None]:
# Adjust the values of `medianHouseValue`
reformated_df = housing_df.withColumn("medhv", col("medhv")/100000)

In [None]:
reformated_df.show(2)

In [None]:
# Add the new columns to `df`
reformated_df = (reformated_df.withColumn("rmsperhh", F.round(col("totrooms")/col("houshlds"), 2))
                       .withColumn("popperhh", F.round(col("pop")/col("houshlds"), 2))
                       .withColumn("bdrmsperrm", F.round(col("totbdrms")/col("totrooms"), 2)))

In [None]:
# Inspect the result
reformated_df.show(5)

In [None]:
# Re-order and select columns
reformated_df = reformated_df.select("medhv", 
                              "totbdrms", 
                              "pop", 
                              "houshlds", 
                              "medinc", 
                              "rmsperhh", 
                              "popperhh", 
                              "bdrmsperrm")

In [None]:
reformated_df.show(5)

In [None]:
featureCols = ["totbdrms", "pop", "houshlds", "medinc", "rmsperhh", "popperhh", "bdrmsperrm"]

In [None]:
# put features into a feature vector column
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

In [None]:
assembled_df = assembler.transform(reformated_df)

In [None]:
assembled_df.show(10, truncate=False)

In [None]:
# Initialize the `standardScaler`
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")

In [None]:
# Fit the DataFrame to the scaler
scaled_df = standardScaler.fit(assembled_df).transform(assembled_df)

In [None]:
# Inspect the result
scaled_df.select("features", "features_scaled").show(10, truncate=False)

# Build and train a model

In [None]:
# Split the data into train and test sets
train_data, test_data = scaled_df.randomSplit([.8,.2], seed=rnd_seed)

In [None]:
# Initialize `lr`
lr = (LinearRegression(featuresCol='features_scaled', labelCol="medhv", predictionCol='predmedhv', 
                               maxIter=10, regParam=0.3, elasticNetParam=0.8, standardization=False))

In [None]:
# Fit the data to the model
linearModel = lr.fit(train_data)

In [None]:
# Coefficients for the model
linearModel.coefficients

In [None]:
# Intercept for the model
linearModel.intercept

# Predictions

In [None]:
# Generate predictions
predictions = linearModel.transform(test_data)

In [None]:
# Extract the predictions and the "known" correct labels
predandlabels = predictions.select("predmedhv", "medhv")

In [None]:
predandlabels.show()

In [None]:
evaluator = RegressionEvaluator(predictionCol="predmedhv", labelCol='medhv', metricName='rmse')
print("RMSE: {0}".format(evaluator.evaluate(predandlabels)))

In [None]:
evaluator = RegressionEvaluator(predictionCol="predmedhv", labelCol='medhv', metricName='r2')
print("R2: {0}".format(evaluator.evaluate(predandlabels)))

# Build Pipeline

In [None]:
from pyspark.ml.feature import SQLTransformer
from pyspark.ml import Pipeline, PipelineModel

In [None]:
# SQL version of the manual data preparation done earlier: divide `medhv`
# by 100000, keep the raw feature columns, and add the three ratio columns
# rounded to 2 decimals. `__THIS__` is the placeholder in the statement for
# the DataFrame passed to transform().
sqlTrans = SQLTransformer(statement="""
SELECT
medhv/100000 as medhv,
totbdrms, pop, houshlds, medinc, 
round(totrooms/houshlds,2) as rmsperhh,
round(pop/houshlds,2) as popperhh,
round(totbdrms/totrooms,2) as bdrmsperrm
FROM __THIS__"""
)

In [None]:
reformated_df = sqlTrans.transform(housing_df)
reformated_df.show(10)

In [None]:
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

In [None]:
assembled_df = assembler.transform(reformated_df)
assembled_df.show(10, truncate=False)

In [None]:
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")

In [None]:
scaler_model = standardScaler.fit(assembled_df)
scaled_df = scaler_model.transform(assembled_df)
scaled_df.select('features', 'features_scaled').show(10, truncate=False)

In [None]:
#transform_pipeline=PipelineModel(stages=[sqlTrans, assembler])
transform_pipeline=PipelineModel(stages=[sqlTrans, assembler, scaler_model])
#transform_pipeline = pipeline.transform(housing_df)
transform_pipeline

In [None]:
transform_pipeline.transform(housing_df).select('features', 'features_scaled').show(10, truncate=False)

In [None]:
# Use the unfitted StandardScaler estimator as the scaling stage, so that
# pipeline.fit() learns the scaling statistics from the data it is fitted on.
# The scaler_model used here before had been fitted on the full dataset,
# which leaks test rows into training when the pipeline is fitted on a
# train split.
pipeline = Pipeline(stages=[sqlTrans, assembler, standardScaler, lr])

In [None]:
train_data, test_data = housing_df.randomSplit([.8,.2], seed=rnd_seed)

In [None]:
linModel = pipeline.fit(train_data)

In [None]:
predictions = linModel.transform(test_data)
predictions.select("predmedhv", "medhv").show(10)

In [None]:
evaluator = RegressionEvaluator(predictionCol="predmedhv", labelCol='medhv', metricName='rmse')
print("RMSE: {0}".format(evaluator.evaluate(predictions)))

In [None]:
evaluator = RegressionEvaluator(predictionCol="predmedhv", labelCol='medhv', metricName='r2')
print("R2: {0}".format(evaluator.evaluate(predictions)))