In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession

Intitializing Scala interpreter ...

Spark Web UI available at http://172.16.18.84:4040
SparkContext available as 'sc' (version = 3.5.0, master = local[*], app id = local-1701475373834)
SparkSession available as 'spark'


import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession


In [5]:
// Obtain the session for this chapter; getOrCreate hands back the session that is
// already running when there is one (the kernel exposes one as `spark`).
val spark = SparkSession.builder().appName("chapter10").getOrCreate()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@3d0ba745


In [6]:
// Location of the cleaned SF Airbnb listings, stored as a parquet directory.
val file_path: String = "./data/sf-airbnb/sf-airbnb-clean.parquet/"

file_path: String = ./data/sf-airbnb/sf-airbnb-clean.parquet/


In [7]:
// Load the listings from the parquet directory at `file_path`.
val airbnbDF: org.apache.spark.sql.DataFrame = spark.read.parquet(file_path)

airbnbDF: org.apache.spark.sql.DataFrame = [host_is_superhost: string, cancellation_policy: string ... 32 more fields]


In [10]:
// Print the first five rows of a few listing attributes next to the price column.
airbnbDF
  .select(
    "neighbourhood_cleansed",
    "room_type",
    "bedrooms",
    "bathrooms",
    "number_of_reviews",
    "price"
  )
  .show(5)

+----------------------+---------------+--------+---------+-----------------+-----+
|neighbourhood_cleansed|      room_type|bedrooms|bathrooms|number_of_reviews|price|
+----------------------+---------------+--------+---------+-----------------+-----+
|      Western Addition|Entire home/apt|     1.0|      1.0|            180.0|170.0|
|        Bernal Heights|Entire home/apt|     2.0|      1.0|            111.0|235.0|
|        Haight Ashbury|   Private room|     1.0|      4.0|             17.0| 65.0|
|        Haight Ashbury|   Private room|     1.0|      4.0|              8.0| 65.0|
|      Western Addition|Entire home/apt|     2.0|      1.5|             27.0|785.0|
+----------------------+---------------+--------+---------+-----------------+-----+
only showing top 5 rows



In [11]:
// Split the listings into training and test sets with weights 0.8 / 0.2,
// using a fixed seed for the random sampling.
val Array(trainDF, testDF) = airbnbDF.randomSplit(Array(0.8, 0.2), seed = 42)

// `count()` is a Spark action, so it is invoked with explicit parentheses. The plain
// `s` interpolator is sufficient because the message has no format specifiers.
println(s"there are ${trainDF.count()} rows in the training set, and ${testDF.count()} in the test set")

there are 5780 rows in the training set, and 1366 in the test set


trainDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [host_is_superhost: string, cancellation_policy: string ... 32 more fields]
testDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [host_is_superhost: string, cancellation_policy: string ... 32 more fields]


In [12]:
import org.apache.spark.ml.feature.VectorAssembler

import org.apache.spark.ml.feature.VectorAssembler


In [15]:
// Assemble the single predictor column ("bedrooms") into a vector column named "features".
val vecAssembler = new VectorAssembler().setOutputCol("features").setInputCols(Array("bedrooms"))

// Apply the assembler to the training set and show the raw value, its vector form, and the label.
val vecTrainDF = vecAssembler.transform(trainDF)
vecTrainDF
  .select("bedrooms", "features", "price")
  .show(10)

+--------+--------+-----+
|bedrooms|features|price|
+--------+--------+-----+
|     1.0|   [1.0]|200.0|
|     1.0|   [1.0]|130.0|
|     1.0|   [1.0]| 95.0|
|     1.0|   [1.0]|250.0|
|     3.0|   [3.0]|250.0|
|     1.0|   [1.0]|115.0|
|     1.0|   [1.0]|105.0|
|     1.0|   [1.0]| 86.0|
|     1.0|   [1.0]|100.0|
|     2.0|   [2.0]|220.0|
+--------+--------+-----+
only showing top 10 rows



vecAssembler: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_a5e5214214df, handleInvalid=error, numInputCols=1
vecTrainDF: org.apache.spark.sql.DataFrame = [host_is_superhost: string, cancellation_policy: string ... 33 more fields]


In [16]:
import org.apache.spark.ml.regression.LinearRegression

import org.apache.spark.ml.regression.LinearRegression


In [17]:
// Linear regression estimator: predicts "price" from the assembled "features" vector.
val lr = new LinearRegression().setLabelCol("price").setFeaturesCol("features")

// Fit on the training data that already carries the "features" column.
val lr_model = lr.fit(vecTrainDF)

lr: org.apache.spark.ml.regression.LinearRegression = linReg_8cff2a6dd20a
lr_model: org.apache.spark.ml.regression.LinearRegressionModel = LinearRegressionModel: uid=linReg_8cff2a6dd20a, numFeatures=1


In [18]:
// Slope (the only coefficient, since the model has one feature) and intercept of the fitted line.
val m = lr_model.coefficients.apply(0)
val b = lr_model.intercept

// Report the fitted line with both parameters rounded to two decimals.
println(
  f"""the formula for the linear regression line is price = $m%1.2f * bedroom + $b%1.2f"""
)

the formula for the linear regression line is price = 123.68 * bedroom + 47.51


m: Double = 123.6757463819947
b: Double = 47.51023373378815


In [19]:
import org.apache.spark.ml.Pipeline

import org.apache.spark.ml.Pipeline


In [20]:
// Chain feature assembly and regression so both steps are fitted and applied together.
val pipeline = new Pipeline()
  .setStages(Array(vecAssembler, lr))

// Fit on the raw training set; the assembler stage produces the "features" column itself.
val pipeline_model = pipeline.fit(trainDF)

pipeline: org.apache.spark.ml.Pipeline = pipeline_dded56c77b50
pipeline_model: org.apache.spark.ml.PipelineModel = pipeline_dded56c77b50


In [22]:
// Run the fitted pipeline over the held-out set; this adds "features" and "prediction" columns.
val pred_df = pipeline_model.transform(testDF)

// Compare the actual price with the model's prediction for the first ten rows.
pred_df
  .select("bedrooms", "features", "price", "prediction")
  .show(10)

+--------+--------+------+------------------+
|bedrooms|features| price|        prediction|
+--------+--------+------+------------------+
|     1.0|   [1.0]|  85.0|171.18598011578285|
|     1.0|   [1.0]|  45.0|171.18598011578285|
|     1.0|   [1.0]|  70.0|171.18598011578285|
|     1.0|   [1.0]| 128.0|171.18598011578285|
|     1.0|   [1.0]| 159.0|171.18598011578285|
|     2.0|   [2.0]| 250.0|294.86172649777757|
|     1.0|   [1.0]|  99.0|171.18598011578285|
|     1.0|   [1.0]|  95.0|171.18598011578285|
|     1.0|   [1.0]| 100.0|171.18598011578285|
|     1.0|   [1.0]|2010.0|171.18598011578285|
+--------+--------+------+------------------+
only showing top 10 rows



pred_df: org.apache.spark.sql.DataFrame = [host_is_superhost: string, cancellation_policy: string ... 34 more fields]


In [23]:
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}

import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}


In [24]:
// Inspect the (column name, type name) pairs of the training set; the type names
// (e.g. "StringType", "DoubleType") are what the later column filters match on.
trainDF.dtypes

res7: Array[(String, String)] = Array((host_is_superhost,StringType), (cancellation_policy,StringType), (instant_bookable,StringType), (host_total_listings_count,DoubleType), (neighbourhood_cleansed,StringType), (latitude,DoubleType), (longitude,DoubleType), (property_type,StringType), (room_type,StringType), (accommodates,DoubleType), (bathrooms,DoubleType), (bedrooms,DoubleType), (beds,DoubleType), (bed_type,StringType), (minimum_nights,DoubleType), (number_of_reviews,DoubleType), (review_scores_rating,DoubleType), (review_scores_accuracy,DoubleType), (review_scores_cleanliness,DoubleType), (review_scores_checkin,DoubleType), (review_scores_communication,DoubleType), (review_scores_location,DoubleType), (review_scores_value,DoubleType), (price,DoubleType), (bedrooms_na,DoubleType), (b...


In [25]:
// Names of all string-typed columns; these are treated as the categorical features.
val categoricalCols = trainDF.dtypes.collect { case (name, "StringType") => name }

categoricalCols: Array[String] = Array(host_is_superhost, cancellation_policy, instant_bookable, neighbourhood_cleansed, property_type, room_type, bed_type)


In [27]:
// One output column name per categorical column, formed by appending "INDEX".
val indexOutputCols = categoricalCols.map(name => s"${name}INDEX")

indexOutputCols: Array[String] = Array(host_is_superhostINDEX, cancellation_policyINDEX, instant_bookableINDEX, neighbourhood_cleansedINDEX, property_typeINDEX, room_typeINDEX, bed_typeINDEX)


In [28]:
// One output column name per categorical column, formed by appending "OHE".
val oheOutputCols = categoricalCols.map(name => s"${name}OHE")

oheOutputCols: Array[String] = Array(host_is_superhostOHE, cancellation_policyOHE, instant_bookableOHE, neighbourhood_cleansedOHE, property_typeOHE, room_typeOHE, bed_typeOHE)


In [30]:
// Indexer mapping every categorical column to its "...INDEX" counterpart.
// handleInvalid is set to "skip" for values the indexer cannot handle.
val stringIndexer = new StringIndexer()
  .setHandleInvalid("skip")
  .setInputCols(categoricalCols)
  .setOutputCols(indexOutputCols)

stringIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_1d6b02434d3d


In [33]:
// One-hot encoder reading the "...INDEX" columns and writing the "...OHE" columns.
val oheEncoder = new OneHotEncoder()
  .setOutputCols(oheOutputCols)
  .setInputCols(indexOutputCols)

oheEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHotEncoder_bfda03fb8fe8


In [37]:
// Names of all double-typed columns except "price", which is the label rather than a feature.
val numericCols = trainDF.dtypes.collect {
  case (field, "DoubleType") if field != "price" => field
}

numericCols: Array[String] = Array(host_total_listings_count, latitude, longitude, accommodates, bathrooms, bedrooms, beds, minimum_nights, number_of_reviews, review_scores_rating, review_scores_accuracy, review_scores_cleanliness, review_scores_checkin, review_scores_communication, review_scores_location, review_scores_value, bedrooms_na, bathrooms_na, beds_na, review_scores_rating_na, review_scores_accuracy_na, review_scores_cleanliness_na, review_scores_checkin_na, review_scores_communication_na, review_scores_location_na, review_scores_value_na)


In [41]:
// Echo the one-hot output column names before combining them with the numeric columns.
oheOutputCols

res14: Array[String] = Array(host_is_superhostOHE, cancellation_policyOHE, instant_bookableOHE, neighbourhood_cleansedOHE, property_typeOHE, room_typeOHE, bed_typeOHE)


In [42]:
// Full feature list for the assembler: one-hot encoded categoricals first, then the numeric columns.
val assemblerInputs: Array[String] = oheOutputCols ++ numericCols

assemblerInputs: Array[String] = Array(host_is_superhostOHE, cancellation_policyOHE, instant_bookableOHE, neighbourhood_cleansedOHE, property_typeOHE, room_typeOHE, bed_typeOHE, host_total_listings_count, latitude, longitude, accommodates, bathrooms, bedrooms, beds, minimum_nights, number_of_reviews, review_scores_rating, review_scores_accuracy, review_scores_cleanliness, review_scores_checkin, review_scores_communication, review_scores_location, review_scores_value, bedrooms_na, bathrooms_na, beds_na, review_scores_rating_na, review_scores_accuracy_na, review_scores_cleanliness_na, review_scores_checkin_na, review_scores_communication_na, review_scores_location_na, review_scores_value_na)


In [43]:
// Assemble every encoded categorical and numeric column into a single feature vector.
// The output column must be "features": that is the column the `lr` estimator reads
// (setFeaturesCol("features")) and the one the bedrooms-only assembler writes. The
// previous value "feature" would make any pipeline combining this stage with `lr`
// fail on a missing column.
val vecAssemble = new VectorAssembler()
  .setInputCols(assemblerInputs)
  .setOutputCol("features")

vecAssemble: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_a2ba7d18a1b0, handleInvalid=error, numInputCols=33


In [44]:
// Relative path under which the fitted pipeline is persisted.
val pipeline_path: String = "lr_model_path"

pipeline_path: String = lr_model_path


In [46]:
// Persist the fitted pipeline (the bedrooms-only assembler + linear regression) to
// `pipeline_path`; `overwrite()` is requested so a rerun does not fail on an existing path.
pipeline_model.write.overwrite().save(pipeline_path)