<a href="https://colab.research.google.com/github/Jaejuna/SparkML/blob/main/Spark_ch10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824028 sha256=078b0581df236ffe47343e71cdef94a0e0409b73732545187d2ab39033342821
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [2]:
# Mount Google Drive at /content/drive; the dataset is read from this mount.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [16]:
import pyspark
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session on the local master with one worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

# Parquet directory of the cleaned SF Airbnb listings on the mounted Drive.
filePath = "/content/drive/MyDrive/BOAZ/엔지/Spark Study/databricks-datasets/learning-spark-v2/sf-airbnb/sf-airbnb-clean.parquet/"
airbnbDF = spark.read.parquet(filePath)

# Preview a few of the columns used in this notebook.
airbnbDF.select(
    "neighbourhood_cleansed",
    "room_type",
    "bedrooms",
    "bathrooms",
    "number_of_reviews",
    "price",
).show(5)

+----------------------+---------------+--------+---------+-----------------+-----+
|neighbourhood_cleansed|      room_type|bedrooms|bathrooms|number_of_reviews|price|
+----------------------+---------------+--------+---------+-----------------+-----+
|      Western Addition|Entire home/apt|     1.0|      1.0|            180.0|170.0|
|        Bernal Heights|Entire home/apt|     2.0|      1.0|            111.0|235.0|
|        Haight Ashbury|   Private room|     1.0|      4.0|             17.0| 65.0|
|        Haight Ashbury|   Private room|     1.0|      4.0|              8.0| 65.0|
|      Western Addition|Entire home/apt|     2.0|      1.5|             27.0|785.0|
+----------------------+---------------+--------+---------+-----------------+-----+
only showing top 5 rows



In [15]:
# 80/20 train/test split with a fixed seed so reruns use the same random draw.
trainDF, testDF = airbnbDF.randomSplit(weights=[0.8, 0.2], seed=32)
print(f"""There are {trainDF.count()} rows in the training set, and {testDF.count()} in the test set""")

There are 5752 rows in the training set, and 1394 in the test set


In [20]:
from pyspark.ml.feature import VectorAssembler

# Pack the single predictor column ("bedrooms") into a vector column named
# "features", then preview the result next to the label.
vecAssembler = VectorAssembler(
    inputCols=["bedrooms"],
    outputCol="features",
)
vecTrainDF = vecAssembler.transform(trainDF)
vecTrainDF.select(
    "bedrooms",
    "features",
    "price",
).show(10)

+--------+--------+-----+
|bedrooms|features|price|
+--------+--------+-----+
|     1.0|   [1.0]|200.0|
|     1.0|   [1.0]|130.0|
|     1.0|   [1.0]| 85.0|
|     1.0|   [1.0]| 95.0|
|     3.0|   [3.0]|250.0|
|     1.0|   [1.0]| 45.0|
|     1.0|   [1.0]|115.0|
|     1.0|   [1.0]| 70.0|
|     1.0|   [1.0]|105.0|
|     1.0|   [1.0]| 86.0|
+--------+--------+-----+
only showing top 10 rows



In [21]:
from pyspark.ml.regression import LinearRegression

# Linear regression of "price" on the assembled "features" vector,
# fitted on the vectorized training data.
lr = LinearRegression(
    labelCol="price",
    featuresCol="features",
)
lrModel = lr.fit(vecTrainDF)

In [22]:
# Slope (coefficient of the only feature) and intercept, rounded to two
# decimals for display.
m, b = round(lrModel.coefficients[0], 2), round(lrModel.intercept, 2)
print(f"""The formula for the linear regression line is price = {m}*bedrooms + {b}""")

The formula for the linear regression line is price = 121.14*bedrooms + 52.02


In [23]:
from pyspark.ml import Pipeline

# Chain feature assembly and the regression so they are fitted as one unit
# directly on the raw training DataFrame.
pipeline = Pipeline(
    stages=[
        vecAssembler,
        lr,
    ]
)
pipelineModel = pipeline.fit(trainDF)

In [24]:
# Apply the fitted pipeline to the held-out rows and compare the predicted
# price with the actual one.
predDF = pipelineModel.transform(testDF)
predDF.select(
    "bedrooms",
    "features",
    "price",
    "prediction",
).show(10)

+--------+--------+-----+------------------+
|bedrooms|features|price|        prediction|
+--------+--------+-----+------------------+
|     1.0|   [1.0]|250.0| 173.1567995347388|
|     1.0|   [1.0]|250.0| 173.1567995347388|
|     1.0|   [1.0]|100.0| 173.1567995347388|
|     1.0|   [1.0]|159.0| 173.1567995347388|
|     2.0|   [2.0]|199.0|294.29848575905436|
|     1.0|   [1.0]|100.0| 173.1567995347388|
|     1.0|   [1.0]|190.0| 173.1567995347388|
|     1.0|   [1.0]| 95.0| 173.1567995347388|
|     3.0|   [3.0]|200.0|415.44017198336985|
|     1.0|   [1.0]| 64.0| 173.1567995347388|
+--------+--------+-----+------------------+
only showing top 10 rows



In [25]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root-mean-squared error between the "price" label and the "prediction" column.
regressionEvaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="price",
    predictionCol="prediction",
)
rmse = regressionEvaluator.evaluate(predDF)
print(f"RMSE is {rmse:.1f}")

RMSE is 164.1


In [None]:
# Persist the fitted pipeline; overwrite() replaces anything already saved at
# this path.
pipelinePath = "/tmp/lr-pipeline-model"
(
    pipelineModel.write()
    .overwrite()
    .save(pipelinePath)
)

In [None]:
# When loading a saved model, the type of model being loaded has to be named
# again (here: PipelineModel).
from pyspark.ml import PipelineModel

savedPipelineModel = PipelineModel.load(path=pipelinePath)

In [None]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.regression import DecisionTreeRegressor

# Decision tree regressor predicting "price".
dt = DecisionTreeRegressor(labelCol='price')

# Index every string-typed column so it can be fed to the tree as a categorical
# feature. handleInvalid="skip" filters out rows with an invalid category
# instead of raising an error.
categoricalCols = [field for (field, dataType) in trainDF.dtypes
                   if dataType == "string"]
indexOutputCols = [name + "Index" for name in categoricalCols]
stringIndexer = StringIndexer(
    inputCols=categoricalCols,
    outputCols=indexOutputCols,
    handleInvalid="skip",
)

# Every double-typed column except the label is a numeric feature.
numericCols = [field for (field, dataType) in trainDF.dtypes
               if dataType == "double" and field != "price"]

# Feature vector = indexed categorical columns + numeric columns.
assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

# The stages must be estimator/transformer *instances* (stringIndexer), not the
# StringIndexer class itself.
stages = [stringIndexer, vecAssembler, dt]
pipeline = Pipeline(stages=stages)

# maxBins must be at least the number of categories of any categorical feature;
# 40 raises it above Spark's default of 32.
dt.setMaxBins(40)
pipelineModel = pipeline.fit(trainDF)

In [None]:
import pandas as pd

# The fitted decision tree is the last stage of the fitted pipeline.
dtModel = pipelineModel.stages[-1]

# Pair each assembler input column with its importance score and list the
# features from most to least important.
featureImp = pd.DataFrame(
    list(zip(vecAssembler.getInputCols(), dtModel.featureImportances)),
    columns=["feature", "importance"],
)
featureImp.sort_values(by='importance', ascending=False)

In [None]:
from pyspark.ml.regression import RandomForestRegressor

# Random forest regressor predicting "price". maxBins=40 mirrors the setting
# applied to the decision tree; the seed fixes the forest's random sampling.
rf = RandomForestRegressor(labelCol="price", maxBins=40, seed=32)

# Same preprocessing stages as the decision-tree pipeline, with the forest as
# the final estimator.
pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])

In [None]:
from pyspark.ml.tuning import ParamGridBuilder

# Hyperparameter grid: 3 values of maxDepth x 2 values of numTrees.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [2, 4, 6])
    .addGrid(rf.numTrees, [10, 100])
    .build()
)

In [None]:
# RMSE between the "price" label and the "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="price",
)

In [None]:
from pyspark.ml.tuning import CrossValidator

# 3-fold cross-validation of the pipeline over every entry in paramGrid,
# scored with the evaluator; the seed fixes how rows are assigned to folds.
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=32,
)
cvModel = cv.fit(trainDF)

In [None]:
# Pair each parameter map with its average metric across the folds.
[
    (paramMap, avgMetric)
    for paramMap, avgMetric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)
]

In [None]:
# Refit the cross-validator with parallelism set to 4.
cv.setParallelism(4)
cvModel = cv.fit(trainDF)

In [None]:
# Put the cross-validator *inside* the pipeline so the indexing and assembly
# stages run once, ahead of the cross-validation, instead of being part of the
# estimator that is refitted for every fold and parameter combination.
# The CrossValidator must therefore wrap only the model (rf): wrapping the whole
# pipeline would run stringIndexer and vecAssembler a second time on data that
# already contains their output columns.
cv = CrossValidator(
    estimator=rf,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    numFolds=3,
    parallelism=4,
    seed=32,
)
pipeline = Pipeline(stages=[stringIndexer, vecAssembler, cv])
pipelineModel = pipeline.fit(trainDF)