<a href="https://colab.research.google.com/github/Jaejuna/SparkML/blob/main/Spark_MLlib.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

In [None]:
# 데이터 다운
!gdown https://drive.google.com/uc?id=1qYh2aGOBZfUF9547M0pxAe17JhR7yLxm
!unzip sf-airbnb-clean.parquet.zip

In [None]:
# Data loading and a first look at the data
import pyspark
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session with master "local[1]"
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

# Read the parquet data from the working directory
filePath = "./sf-airbnb-clean.parquet"
airbnbDF = spark.read.parquet(filePath)

# Preview five rows of a handful of columns
airbnbDF.select(
    "neighbourhood_cleansed",
    "room_type",
    "bedrooms",
    "bathrooms",
    "number_of_reviews",
    "price",
).show(5)

In [None]:
# Split into a training set and a test set (weights 0.8 / 0.2, fixed seed)
trainDF, testDF = airbnbDF.randomSplit([0.8, 0.2], seed=32)

# Count each split once, then report both sizes
trainCount = trainDF.count()
testCount = testDF.count()
print(f"There are {trainCount} rows in the training set, and {testCount} in the test set")

In [None]:
# Prepare the model input with a transformer: pack "bedrooms" into a
# single vector column named "features"
from pyspark.ml.feature import VectorAssembler

vecAssembler = VectorAssembler(
    inputCols=["bedrooms"],
    outputCol="features",
)
vecTrainDF = vecAssembler.transform(trainDF)

# Show the raw column, the assembled vector, and the label side by side
vecTrainDF.select(
    "bedrooms",
    "features",
    "price",
).show(10)

In [47]:
# Create a linear regression estimator and fit it on the assembled training data
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(
    featuresCol="features",
    labelCol="price",
)
lrModel = lr.fit(vecTrainDF)

In [None]:
# Inspect the fitted line: first coefficient (for "bedrooms") and the
# intercept, both rounded to two decimal places
m, b = round(lrModel.coefficients[0], 2), round(lrModel.intercept, 2)
print(f"The formula for the linear regression line is price = {m}*bedrooms + {b}")

In [49]:
# Simplify with a Pipeline. The Pipeline is the estimator here: fit() runs
# the assembler and trains the regression in a single call.
from pyspark.ml import Pipeline

pipeline = Pipeline(
    stages=[
        vecAssembler,
        lr,
    ]
)
pipelineModel = pipeline.fit(trainDF)

In [None]:
# Simplify with a Pipeline, continued. The fitted pipeline model is used as
# a transformer: transform() adds a "prediction" column to the test set.
predDF = pipelineModel.transform(testDF)
predDF.select(
    "bedrooms",
    "features",
    "price",
    "prediction",
).show(10)

In [51]:
# Handling categorical data: one-hot encoding
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Columns typed "string" are treated as categorical; the names of the
# indexed and one-hot encoded output columns are derived from them
categoricalCols = [name for name, dtype in trainDF.dtypes if dtype == "string"]
indexOutputCols = [f"{name}Index" for name in categoricalCols]
oheOutputCols = [f"{name}OHE" for name in categoricalCols]

stringIndexer = StringIndexer(
    inputCols=categoricalCols,
    outputCols=indexOutputCols,
    handleInvalid="skip",
)
oheEncoder = OneHotEncoder(
    inputCols=indexOutputCols,
    outputCols=oheOutputCols,
)

# Continuous inputs: every "double" column except the label "price"
numericCols = [
    name
    for name, dtype in trainDF.dtypes
    if dtype == "double" and name != "price"
]
assemblerInputs = oheOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

In [52]:
# Using RFormula
from pyspark.ml.feature import RFormula

# R-style formula: "price" is the label, "." stands for the remaining columns
rFormula = RFormula(
    formula="price ~ .",
    featuresCol="features",
    labelCol="price",
    handleInvalid="skip",
)

In [None]:
# Evaluation metric: RMSE
from pyspark.ml.evaluation import RegressionEvaluator

# Compare the "prediction" column against the "price" label
regressionEvaluator = RegressionEvaluator(
    labelCol="price",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = regressionEvaluator.evaluate(predDF)
print(f"RMSE is {rmse:.1f}")

In [None]:
# Evaluation metric: R^2
# Note: this switches the metric of the existing evaluator object in place.
regressionEvaluator.setMetricName("r2")
r2 = regressionEvaluator.evaluate(predDF)
print(f"R2 is {r2}")

In [55]:
from pyspark.ml.regression import DecisionTreeRegressor

# Create the decision tree estimator with "price" as the label
dt = DecisionTreeRegressor(labelCol="price")

# Keep only the numeric ("double") columns, leaving out the label
numericCols = [
    name
    for name, dtype in trainDF.dtypes
    if dtype == "double" and name != "price"
]

# Combine the StringIndexer output columns defined earlier with the numeric columns
assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

# Chain the stages into one pipeline
stages = [stringIndexer, vecAssembler, dt]
pipeline = Pipeline(stages=stages)

# Train. The original note marks plain training as an error; maxBins is
# raised to 40 first, presumably because an indexed categorical column has
# more distinct values than the default bin count allows (TODO confirm).
dt.setMaxBins(40)
pipelineModel = pipeline.fit(trainDF)

In [None]:
# Print the learned rules: take the last stage of the fitted pipeline
# (the decision tree model) and print its debug string
dtModel = pipelineModel.stages[-1]
print(dtModel.toDebugString)

In [None]:
# Show the feature importances of the decision tree
import pandas as pd

# Pair each assembler input column with the importance at the same position
featureImp = pd.DataFrame(
    [
        (name, score)
        for name, score in zip(vecAssembler.getInputCols(), dtModel.featureImportances)
    ],
    columns=["feature", "importance"],
)

# Most important features first
featureImp.sort_values(by="importance", ascending=False)

In [None]:
# Random forest
from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor(
    labelCol="price",
    maxBins=40,
    seed=42,
)

# Rebuild the pipeline around the forest, fit it, and list the important features
pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])
pipelineModel = pipeline.fit(trainDF)

# The fitted forest is the last stage of the fitted pipeline
rfModel = pipelineModel.stages[-1]
featureImp = pd.DataFrame(
    [
        (name, score)
        for name, score in zip(vecAssembler.getInputCols(), rfModel.featureImportances)
    ],
    columns=["feature", "importance"],
)

# Most important features first
featureImp.sort_values(by="importance", ascending=False)

In [59]:
# Define the estimator to be evaluated: a pipeline of the string indexer,
# the vector assembler, and the random forest regressor
pipeline = Pipeline(stages = [stringIndexer, vecAssembler, rf])

In [60]:
# Use ParamGridBuilder to list the hyperparameter values to try
from pyspark.ml.tuning import ParamGridBuilder

# 3 maxDepth values x 2 numTrees values
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [2, 4, 6])
    .addGrid(rf.numTrees, [10, 100])
    .build()
)

In [61]:
# Define the evaluator and the metric used to compare the candidate models
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="price",
    predictionCol="prediction",
)

In [62]:
# Run cross-validation with CrossValidator to evaluate each candidate model
from pyspark.ml.tuning import CrossValidator

# 3 folds over every entry of the parameter grid, with a fixed seed
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=32,
)
cvModel = cv.fit(trainDF)

In [None]:
# Pair each parameter combination with its average cross-validation metric
[
    (params, metric)
    for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)
]

In [64]:
# Stop the Spark session created at the top of the notebook
spark.stop()