# 사이킷런 모델 생성

In [6]:
# !pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy>=1.3.2
  Downloading scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.5/34.5 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.2.2 scipy-1.10.1 threadpoolctl-3.1.0


In [7]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np

In [8]:
# Load the iris dataset and wrap it in a pandas DataFrame.
iris = load_iris()

# Feature matrix and label vector taken from the loaded dataset.
iris_data = iris.data
iris_label = iris.target

# Names given to the four feature columns of the DataFrame.
iris_columns = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]

# Build the feature frame and append the labels as a trailing "target" column.
iris_pdf = pd.DataFrame(iris_data, columns=iris_columns).assign(target=iris_label)
iris_pdf

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [9]:
# Save the iris DataFrame as CSV (index column omitted) for the Spark section.
# to_csv does not create missing parent directories, so make sure ./data exists
# before writing; exist_ok keeps re-running this cell harmless.
import os

os.makedirs("./data", exist_ok=True)
iris_pdf.to_csv("./data/iris.csv", index=False)

In [10]:
# 데이터 분할 및 모델 생성
# from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier # Estimator
from sklearn.model_selection import train_test_split # RandomSpliter

# Hold out 20% of the samples for testing; the seed keeps the split reproducible.
X_train, X_test, t_train, t_test = train_test_split(
    iris_data,
    iris_label,
    test_size=0.2,
    random_state=42,
)

# Seed the estimator as well: the split above is fixed, but without random_state
# the fitted tree (and therefore the predictions) can differ between runs.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, t_train)  # training happens in place on tree_clf itself

# Predict labels for the held-out samples and print them.
pred = tree_clf.predict(X_test)
print(pred)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]


# Spark ML 사용하기

In [11]:
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session with master "local" and app name "tree-clf".
spark = (
    SparkSession.builder
    .master("local")
    .appName("tree-clf")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/13 04:17:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [12]:
# Read the CSV that was written to ./data/iris.csv earlier in this notebook.
# The path is resolved from the current working directory instead of a
# hard-coded, machine-specific home directory, so the read follows the write.
# NOTE(review): assumes the notebook's working directory is the same as when
# the CSV was written - confirm if the cells are run from different places.
from pathlib import Path

iris_filepath = str(Path("./data/iris.csv").resolve())

# file:// prefix as in the original cell; header row supplies the column
# names and inferSchema asks Spark to infer the column types.
iris_sdf = spark.read.csv(f"file://{iris_filepath}", inferSchema=True, header=True)
iris_sdf.show(5)


[Stage 0:>                                                          (0 + 1) / 1]

                                                                                

+------------+-----------+------------+-----------+------+
|sepal_length|sepal_width|petal_length|petal_width|target|
+------------+-----------+------------+-----------+------+
|         5.1|        3.5|         1.4|        0.2|     0|
|         4.9|        3.0|         1.4|        0.2|     0|
|         4.7|        3.2|         1.3|        0.2|     0|
|         4.6|        3.1|         1.5|        0.2|     0|
|         5.0|        3.6|         1.4|        0.2|     0|
+------------+-----------+------------+-----------+------+
only showing top 5 rows



In [13]:
# Split the Spark DataFrame into train / test sets with randomSplit.
# Weights 0.8 / 0.2 and a fixed seed of 42.
split_weights = [0.8, 0.2]
train_sdf, test_sdf = iris_sdf.randomSplit(split_weights, seed=42)

In [14]:
# Author's notes on why the training set is cached:
# - It is best to keep a single copy of the training data no matter how it
#   gets transformed, e.g. when several models are trained on it.
# - Feeding the training data to a model triggers a transform, so training
#   several times triggers the transform several times, which may leave
#   several identical copies of train_sdf in memory.
# - Re-fetching the same data repeatedly is inefficient given how RDDs work,
#   and the training data is the same for every fit, so it is stored in
#   memory with cache/persist right before training.
train_sdf.cache().show(5)
# Each feature still sits in its own column here, so a VectorAssembler is needed.

+------------+-----------+------------+-----------+------+
|sepal_length|sepal_width|petal_length|petal_width|target|
+------------+-----------+------------+-----------+------+
|         4.3|        3.0|         1.1|        0.1|     0|
|         4.4|        2.9|         1.4|        0.2|     0|
|         4.4|        3.2|         1.3|        0.2|     0|
|         4.5|        2.3|         1.3|        0.3|     0|
|         4.6|        3.1|         1.5|        0.2|     0|
+------------+-----------+------------+-----------+------+
only showing top 5 rows



`VectorAssembler`를 이용하여 모든 feature 컬럼을 하나의 feature vector로 만든다.(행 벡터)

In [17]:
from pyspark.ml.feature import VectorAssembler

# Columns to be merged into a single feature vector.
iris_columns = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]

# VectorAssembler merges the listed DataFrame columns into one row vector.
#   inputCols : the columns to merge
#   outputCol : name of the merged column
# The DataFrame is not passed here; it is supplied later to transform().
vec_assembler = VectorAssembler(
    inputCols=iris_columns,
    outputCol="features",
)

# Apply the assembler to the training set and preview the result.
train_feature_vector_sdf = vec_assembler.transform(train_sdf)
train_feature_vector_sdf.show(5)

+------------+-----------+------------+-----------+------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|target|         features|
+------------+-----------+------------+-----------+------+-----------------+
|         4.3|        3.0|         1.1|        0.1|     0|[4.3,3.0,1.1,0.1]|
|         4.4|        2.9|         1.4|        0.2|     0|[4.4,2.9,1.4,0.2]|
|         4.4|        3.2|         1.3|        0.2|     0|[4.4,3.2,1.3,0.2]|
|         4.5|        2.3|         1.3|        0.3|     0|[4.5,2.3,1.3,0.3]|
|         4.6|        3.1|         1.5|        0.2|     0|[4.6,3.1,1.5,0.2]|
+------------+-----------+------------+-----------+------+-----------------+
only showing top 5 rows



# Estimator
Spark ML의 학습 알고리즘(예: `DecisionTreeClassifier`)은 추정기(Estimator)이고, `fit()`으로 학습된 결과물인 모델(예: `DecisionTreeClassificationModel`)은 데이터를 변환시키는 Transformer에 해당한다.
- 학습된 모델이 데이터를 받아서 예측 값(prediction) 컬럼을 덧붙이는 transform 과정이 일어나기 때문!

In [21]:
from pyspark.ml.classification import DecisionTreeClassifier

# Create the estimator. Because it works on DataFrames, the feature and label
# columns to learn from must be named explicitly.
dt_params = {
    "featuresCol": "features",
    "labelCol": "target",
    "maxDepth": 5,
}
dt = DecisionTreeClassifier(**dt_params)
type(dt)

pyspark.ml.classification.DecisionTreeClassifier

In [22]:
# Train with fit(); the call returns the fitted ML model object.
dt_model = dt.fit(dataset=train_feature_vector_sdf)
type(dt_model)

Exception ignored in: <function JavaWrapper.__del__ at 0x7f16c84d71f0>
Traceback (most recent call last):
  File "/home/ubuntu/miniconda3/envs/spark_env/lib/python3.8/site-packages/pyspark/ml/wrapper.py", line 39, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'DecisionTreeClassifier' object has no attribute '_java_obj'


pyspark.ml.classification.DecisionTreeClassificationModel

In [24]:
# Prepare the test set for prediction: apply the same assembler that was
# used on the training set, so both sets get an identical "features" column.
test_feature_vector_sdf = vec_assembler.transform(test_sdf)

# Preview the raw test rows, then the same rows with the assembled vector.
test_sdf.show(5)
test_feature_vector_sdf.show(5)

+------------+-----------+------------+-----------+------+
|sepal_length|sepal_width|petal_length|petal_width|target|
+------------+-----------+------------+-----------+------+
|         4.4|        3.0|         1.3|        0.2|     0|
|         4.6|        3.2|         1.4|        0.2|     0|
|         4.6|        3.6|         1.0|        0.2|     0|
|         4.8|        3.1|         1.6|        0.2|     0|
|         4.9|        3.1|         1.5|        0.1|     0|
+------------+-----------+------------+-----------+------+
only showing top 5 rows

+------------+-----------+------------+-----------+------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|target|         features|
+------------+-----------+------------+-----------+------+-----------------+
|         4.4|        3.0|         1.3|        0.2|     0|[4.4,3.0,1.3,0.2]|
|         4.6|        3.2|         1.4|        0.2|     0|[4.6,3.2,1.4,0.2]|
|         4.6|        3.6|         1.0|        0.2|     0|[

In [25]:
# Predict with transform(): the prediction columns are appended to the
# existing columns of the input DataFrame.
predictions = dt_model.transform(test_feature_vector_sdf)
predictions.show(n=5)

+------------+-----------+------------+-----------+------+-----------------+--------------+-------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|target|         features| rawPrediction|  probability|prediction|
+------------+-----------+------------+-----------+------+-----------------+--------------+-------------+----------+
|         4.4|        3.0|         1.3|        0.2|     0|[4.4,3.0,1.3,0.2]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.6|        3.2|         1.4|        0.2|     0|[4.6,3.2,1.4,0.2]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.6|        3.6|         1.0|        0.2|     0|[4.6,3.6,1.0,0.2]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.8|        3.1|         1.6|        0.2|     0|[4.8,3.1,1.6,0.2]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.9|        3.1|         1.5|        0.1|     0|[4.9,3.1,1.5,0.1]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
+------------+-----------+------------+-----------+------+------

* `rawPrediction` : 머신러닝 모델 알고리즘 별로 다를 수 있다.
    * 머신러닝 알고리즘에 의해서 계산된 값
    * 알고리즘마다 의미가 달라서, 값 자체에 공통된 해석은 없다.
    * `LogisticRegression`의 경우 예측 label 별로 `sigmoid` 함수를 적용하기 전의 값
        * $ \hat{y} = \sigma(WX + b) $
        * $ WX + b $의 결과가 `rawPrediction`
* `probability` : 예측 label 별 예측 확률 값
* `prediction` : 최종 예측 label 값

# 모델평가

In [29]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator that compares the "prediction" column against the "target" labels
# using the accuracy metric.
evaluator_accuracy = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="target",
    predictionCol="prediction",
)

# Score the current predictions DataFrame and display the result.
accuracy = evaluator_accuracy.evaluate(predictions)
accuracy

1.0

In [35]:
# Exercise: try LogisticRegression.
from pyspark.ml.classification import LogisticRegression

# Same feature / label columns as the decision tree; at most 10 iterations.
lr_params = {
    "featuresCol": "features",
    "labelCol": "target",
    "maxIter": 10,
}
lr = LogisticRegression(**lr_params)

# Fit on the assembled training set, then predict on the assembled test set.
lr_model = lr.fit(train_feature_vector_sdf)
lr_predictions = lr_model.transform(test_feature_vector_sdf)
lr_predictions.show(n=5)

+------------+-----------+------------+-----------+------+-----------------+--------------------+--------------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|target|         features|       rawPrediction|         probability|prediction|
+------------+-----------+------------+-----------+------+-----------------+--------------------+--------------------+----------+
|         4.4|        3.0|         1.3|        0.2|     0|[4.4,3.0,1.3,0.2]|[18.6086266693526...|[0.99997762791224...|       0.0|
|         4.6|        3.2|         1.4|        0.2|     0|[4.6,3.2,1.4,0.2]|[18.8180066107263...|[0.99997581287298...|       0.0|
|         4.6|        3.6|         1.0|        0.2|     0|[4.6,3.6,1.0,0.2]|[22.6963845270307...|[0.99999942608846...|       0.0|
|         4.8|        3.1|         1.6|        0.2|     0|[4.8,3.1,1.6,0.2]|[16.7506644665745...|[0.99971232954776...|       0.0|
|         4.9|        3.1|         1.5|        0.1|     0|[4.9,3.1,1.5,0.1]|[17.3393987944

In [33]:
# Exercise: try LogisticRegression and measure its accuracy.
from pyspark.ml.classification import LogisticRegression

# Create the ML algorithm object.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='target',
    maxIter=10,
)

# Fit on the assembled training set.
lr_model = lr.fit(train_feature_vector_sdf)

# Predict on the assembled test set; this rebinds `predictions`.
predictions = lr_model.transform(test_feature_vector_sdf)
predictions.show(n=5)

# Score with the accuracy evaluator created earlier and print the result.
accuracy = evaluator_accuracy.evaluate(predictions)
print("정확도", accuracy)

+------------+-----------+------------+-----------+------+-----------------+--------------------+--------------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|target|         features|       rawPrediction|         probability|prediction|
+------------+-----------+------------+-----------+------+-----------------+--------------------+--------------------+----------+
|         4.4|        3.0|         1.3|        0.2|     0|[4.4,3.0,1.3,0.2]|[18.6086266693526...|[0.99997762791224...|       0.0|
|         4.6|        3.2|         1.4|        0.2|     0|[4.6,3.2,1.4,0.2]|[18.8180066107263...|[0.99997581287298...|       0.0|
|         4.6|        3.6|         1.0|        0.2|     0|[4.6,3.6,1.0,0.2]|[22.6963845270307...|[0.99999942608846...|       0.0|
|         4.8|        3.1|         1.6|        0.2|     0|[4.8,3.1,1.6,0.2]|[16.7506644665745...|[0.99971232954776...|       0.0|
|         4.9|        3.1|         1.5|        0.1|     0|[4.9,3.1,1.5,0.1]|[17.3393987944

# 파이프라인 구축
- pipeline은 여러 개의 개별적인 Transformer의 변환 작업, Estimator의 학습작업을 일련의 프로세스 연결을 통해 간단한 API 처리로 구현할 수 있게 해준다.

In [36]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

# Feature columns to assemble.
iris_columns = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]

# A Pipeline registers each transformation / training step as a stage, in order.
# - pipeline.fit() runs the chained stages in sequence as one call.
# - pipeline.fit() returns a PipelineModel.
# - Prediction is done with transform() on that PipelineModel.

# Stage 1: VectorAssembler for feature vectorization.
stage_1 = VectorAssembler(
    inputCols=iris_columns,
    outputCol="features",
)

# Stage 2: the estimator that will be trained.
stage_2 = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="target",
    maxDepth=3,
)

# Stages are placed in execution order in a list and registered on the pipeline.
stages = [stage_1, stage_2]
pipeline = Pipeline(stages=stages)
type(pipeline)

pyspark.ml.pipeline.Pipeline

In [37]:
# Fit the whole pipeline on the raw (un-assembled) training DataFrame.
pipeline_model = pipeline.fit(dataset=train_sdf)
type(pipeline_model)

pyspark.ml.pipeline.PipelineModel

In [38]:
# Predict on the raw test DataFrame through the fitted pipeline.
predictions = pipeline_model.transform(test_sdf)
predictions.show(n=5)

+------------+-----------+------------+-----------+------+-----------------+--------------+-------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|target|         features| rawPrediction|  probability|prediction|
+------------+-----------+------------+-----------+------+-----------------+--------------+-------------+----------+
|         4.4|        3.0|         1.3|        0.2|     0|[4.4,3.0,1.3,0.2]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.6|        3.2|         1.4|        0.2|     0|[4.6,3.2,1.4,0.2]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.6|        3.6|         1.0|        0.2|     0|[4.6,3.6,1.0,0.2]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.8|        3.1|         1.6|        0.2|     0|[4.8,3.1,1.6,0.2]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.9|        3.1|         1.5|        0.1|     0|[4.9,3.1,1.5,0.1]|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
+------------+-----------+------------+-----------+------+------

In [39]:
# Shut down the Spark session created earlier in this notebook.
spark.stop()