In [1]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("241212_01_MLlib_classification")
    .getOrCreate()
)

24/12/12 10:59:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# data load

In [2]:
# Load the Titanic CSV with the "header" and "inferSchema" reader options enabled.
csv_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
)
df = csv_reader.load("data/titanic.csv")

                                                                                

In [3]:
# Preview the first 10 rows of the raw data.
df.show(n=10)

+--------+------+------+----+-----+-----+-------+-----------+
|survived|pclass|   sex| age|sibsp|parch|   fare|embark_town|
+--------+------+------+----+-----+-----+-------+-----------+
|       0|     3|  male|22.0|    1|    0|   7.25|Southampton|
|       1|     1|female|38.0|    1|    0|71.2833|  Cherbourg|
|       1|     3|female|26.0|    0|    0|  7.925|Southampton|
|       1|     1|female|35.0|    1|    0|   53.1|Southampton|
|       0|     3|  male|35.0|    0|    0|   8.05|Southampton|
|       0|     3|  male|null|    0|    0| 8.4583| Queenstown|
|       0|     1|  male|54.0|    0|    0|51.8625|Southampton|
|       0|     3|  male| 2.0|    3|    1| 21.075|Southampton|
|       1|     3|female|27.0|    0|    2|11.1333|Southampton|
|       1|     2|female|14.0|    1|    0|30.0708|  Cherbourg|
+--------+------+------+----+-----+-----+-------+-----------+
only showing top 10 rows



# missing value

In [5]:
# `sum` is imported under an alias so that Python's builtin `sum` is not shadowed
# for the rest of the notebook. `col` and `when` stay imported because later cells use them.
from pyspark.sql.functions import col, isnan, when, sum as spark_sum

# Count the missing values in every column of `df`.
# A value counts as missing when it is NULL or, for floating-point columns only, NaN.
# `isnan` is restricted to float/double columns: applying it to string columns such as
# `sex` relies on an implicit string -> double cast, which can fail under ANSI SQL mode.
float_columns = {name for name, dtype in df.dtypes if dtype in ("float", "double")}

null_counts = df.select(
    [
        spark_sum(
            when(
                (col(c).isNull() | isnan(c)) if c in float_columns else col(c).isNull(),
                1,
            ).otherwise(0)
        ).alias(c)
        for c in df.columns
    ]
)
null_counts.show()

+--------+------+---+---+-----+-----+----+-----------+
|survived|pclass|sex|age|sibsp|parch|fare|embark_town|
+--------+------+---+---+-----+-----+----+-----------+
|       0|     0|  0|177|    0|    0|   0|          2|
+--------+------+---+---+-----+-----+----+-----------+



# feature selection

In [7]:
# Keep the target (survived) together with the candidate feature columns;
# embark_town is not selected.
selected_columns = ["survived", "pclass", "sex", "age", "sibsp", "parch", "fare"]
data = df.select(*selected_columns)
data.show()

+--------+------+------+----+-----+-----+-------+
|survived|pclass|   sex| age|sibsp|parch|   fare|
+--------+------+------+----+-----+-----+-------+
|       0|     3|  male|22.0|    1|    0|   7.25|
|       1|     1|female|38.0|    1|    0|71.2833|
|       1|     3|female|26.0|    0|    0|  7.925|
|       1|     1|female|35.0|    1|    0|   53.1|
|       0|     3|  male|35.0|    0|    0|   8.05|
|       0|     3|  male|null|    0|    0| 8.4583|
|       0|     1|  male|54.0|    0|    0|51.8625|
|       0|     3|  male| 2.0|    3|    1| 21.075|
|       1|     3|female|27.0|    0|    2|11.1333|
|       1|     2|female|14.0|    1|    0|30.0708|
|       1|     3|female| 4.0|    1|    1|   16.7|
|       1|     1|female|58.0|    0|    0|  26.55|
|       0|     3|  male|20.0|    0|    0|   8.05|
|       0|     3|  male|39.0|    1|    5| 31.275|
|       0|     3|female|14.0|    0|    0| 7.8542|
|       1|     2|female|55.0|    0|    0|   16.0|
|       0|     3|  male| 2.0|    4|    1| 29.125|


In [10]:
# Replace missing ages with the mean (assuming the ages are normally distributed).
# `round` is imported under an alias so that Python's builtin `round` is not shadowed
# for the rest of the notebook.
from pyspark.sql.functions import avg, round as spark_round

# NOTE(review): the mean is computed on the full dataset, before the train/test split
# further down, so test rows influence the imputed value. Consider computing it on the
# training split only.
mean_age = data.select(spark_round(avg("age"), 1)).collect()[0][0]

mean_age

29.7

In [11]:
# Fill the missing values of the "age" column with the mean computed above.
age_fill_values = {"age": mean_age}
data = data.na.fill(age_fill_values)
data.show(10)

+--------+------+------+----+-----+-----+-------+
|survived|pclass|   sex| age|sibsp|parch|   fare|
+--------+------+------+----+-----+-----+-------+
|       0|     3|  male|22.0|    1|    0|   7.25|
|       1|     1|female|38.0|    1|    0|71.2833|
|       1|     3|female|26.0|    0|    0|  7.925|
|       1|     1|female|35.0|    1|    0|   53.1|
|       0|     3|  male|35.0|    0|    0|   8.05|
|       0|     3|  male|29.7|    0|    0| 8.4583|
|       0|     1|  male|54.0|    0|    0|51.8625|
|       0|     3|  male| 2.0|    3|    1| 21.075|
|       1|     3|female|27.0|    0|    2|11.1333|
|       1|     2|female|14.0|    1|    0|30.0708|
+--------+------+------+----+-----+-----+-------+
only showing top 10 rows



# encoding
- categorical type → numeric

In [12]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [13]:
# Encode the string column "sex" as a numeric index column named "SexIndex".
indexer = StringIndexer(outputCol="SexIndex", inputCol="sex")
indexer_model = indexer.fit(data)
data = indexer_model.transform(data)
data.show(5)

+--------+------+------+----+-----+-----+-------+--------+
|survived|pclass|   sex| age|sibsp|parch|   fare|SexIndex|
+--------+------+------+----+-----+-----+-------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|     0.0|
|       1|     1|female|38.0|    1|    0|71.2833|     1.0|
|       1|     3|female|26.0|    0|    0|  7.925|     1.0|
|       1|     1|female|35.0|    1|    0|   53.1|     1.0|
|       0|     3|  male|35.0|    0|    0|   8.05|     0.0|
+--------+------+------+----+-----+-----+-------+--------+
only showing top 5 rows



# feature vector

In [14]:
# Assemble the training features (every selected column except the target)
# into a single vector column named "features".
feature_columns = ["pclass", "SexIndex", "age", "sibsp", "parch", "fare"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(data)
data

DataFrame[survived: int, pclass: int, sex: string, age: double, sibsp: int, parch: int, fare: double, SexIndex: double, features: vector]

In [15]:
# The data used to train the supervised classification model: label + feature vector.
labeled_preview = data.select("survived", "features")
labeled_preview.show(5)

+--------+--------------------+
|survived|            features|
+--------+--------------------+
|       0|[3.0,0.0,22.0,1.0...|
|       1|[1.0,1.0,38.0,1.0...|
|       1|[3.0,1.0,26.0,0.0...|
|       1|[1.0,1.0,35.0,1.0...|
|       0|[3.0,0.0,35.0,0.0...|
+--------+--------------------+
only showing top 5 rows



# ML 모델 : 데이터 학습 > 평가 > 모델 완성

## dataset 분할

In [16]:
# randomSplit returns a list of DataFrames, one per weight: [train (0.8), test (0.2)].
splits = data.randomSplit(weights=[0.8, 0.2], seed=42)
train_data, test_data = splits

In [17]:
# Preview both splits. show() prints the table and returns None, so each call is its
# own statement; combining them in a tuple made the cell echo "(None, None)".
train_data.show(5)
test_data.show(5)

+--------+------+------+----+-----+-----+------+--------+--------------------+
|survived|pclass|   sex| age|sibsp|parch|  fare|SexIndex|            features|
+--------+------+------+----+-----+-----+------+--------+--------------------+
|       0|     1|female| 2.0|    1|    2|151.55|     1.0|[1.0,1.0,2.0,1.0,...|
|       0|     1|female|25.0|    1|    2|151.55|     1.0|[1.0,1.0,25.0,1.0...|
|       0|     1|  male|18.0|    1|    0| 108.9|     0.0|[1.0,0.0,18.0,1.0...|
|       0|     1|  male|19.0|    1|    0|  53.1|     0.0|[1.0,0.0,19.0,1.0...|
|       0|     1|  male|19.0|    3|    2| 263.0|     0.0|[1.0,0.0,19.0,3.0...|
+--------+------+------+----+-----+-----+------+--------+--------------------+
only showing top 5 rows

+--------+------+------+----+-----+-----+-------+--------+--------------------+
|survived|pclass|   sex| age|sibsp|parch|   fare|SexIndex|            features|
+--------+------+------+----+-----+-----+-------+--------+--------------------+
|       0|     1|female|

(None, None)

## 분류 예측 model 생성 - 로지스틱 회귀
* 분류모델 : 라벨(타겟), 피처

In [20]:
from pyspark.ml.classification import LogisticRegression

In [23]:
# Create the logistic regression estimator: label = "survived", inputs = "features".
lr = LogisticRegression(
    labelCol="survived",
    featuresCol="features",
)

In [25]:
# Train the model on the 80% split: this fixes the parameters used to
# decide the y value (label / target).
lr_model = lr.fit(dataset=train_data)

In [28]:
# Test the model on the held-out data. The test data still contains the true answers,
# so the model's predicted y values can be compared against them (supervised learning).
# Supervised learning costs a lot of time and money.
predictions = lr_model.transform(dataset=test_data)
predictions.show(n=5)

+--------+------+------+----+-----+-----+-------+--------+--------------------+--------------------+--------------------+----------+
|survived|pclass|   sex| age|sibsp|parch|   fare|SexIndex|            features|       rawPrediction|         probability|prediction|
+--------+------+------+----+-----+-----+-------+--------+--------------------+--------------------+--------------------+----------+
|       0|     1|female|50.0|    0|    0|28.7125|     1.0|[1.0,1.0,50.0,0.0...|[-1.9520304523020...|[0.12433212676219...|       1.0|
|       0|     1|  male|21.0|    0|    1|77.2875|     0.0|[1.0,0.0,21.0,0.0...|[-0.5063720911814...|[0.37604437636883...|       1.0|
|       0|     1|  male|24.0|    0|    0|   79.2|     0.0|[1.0,0.0,24.0,0.0...|[-0.5000225475885...|[0.37753537004575...|       1.0|
|       0|     1|  male|29.0|    0|    0|   30.0|     0.0|[1.0,0.0,29.0,0.0...|[-0.1615683964925...|[0.45969553955569...|       1.0|
|       0|     1|  male|29.7|    0|    0|  26.55|     0.0|[1.0,0.0,29

In [29]:
# Inspect the last 5 rows: feature vector, true label and predicted label.
label_vs_prediction = predictions.select("features", "survived", "prediction")
label_vs_prediction.tail(5)

[Row(features=DenseVector([3.0, 0.0, 29.7, 0.0, 0.0, 56.4958]), survived=1, prediction=0.0),
 Row(features=DenseVector([3.0, 0.0, 29.7, 2.0, 0.0, 23.25]), survived=1, prediction=0.0),
 Row(features=DenseVector([3.0, 0.0, 31.0, 0.0, 0.0, 7.925]), survived=1, prediction=0.0),
 Row(features=DenseVector([3.0, 0.0, 32.0, 0.0, 0.0, 56.4958]), survived=1, prediction=0.0),
 Row(features=DenseVector([3.0, 0.0, 39.0, 0.0, 0.0, 7.925]), survived=1, prediction=0.0)]

## 정답 개수 확인

In [31]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import expr

In [32]:
# Cast the label and the prediction to integers, in the same order as before.
for column_name in ("survived", "prediction"):
    predictions = predictions.withColumn(
        column_name, col(column_name).cast(IntegerType())
    )

In [33]:
# Add a "correct" column: 1 when the prediction matches the label, otherwise 0.
is_correct = expr("case when survived=prediction then 1 else 0 end")
compare = predictions.withColumn("correct", is_correct)

# Number of wrong predictions.
compare.filter('correct=0').count()

28

## 정확도 계산 | 모델의 점수

In [34]:
# Accuracy = mean of the 0/1 "correct" column.
accuracy_row = compare.selectExpr("avg(correct) as accuracy").first()
accu = accuracy_row["accuracy"]
accu

0.8068965517241379

## 평가기 evaluator

In [35]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [36]:
# Each kind of model has its own evaluation metrics.
# Among the classification metrics, compute the AUC (area under the ROC curve).
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    rawPredictionCol="rawPrediction",
    labelCol="survived",
)
evaluator

BinaryClassificationEvaluator_a74462ca859f

In [37]:
# Evaluate the configured metric (areaUnderROC) on the test-set predictions.
auc = evaluator.evaluate(dataset=predictions)
auc

0.8664129586260734

## 모델 설명

* 위 모델은 가지고 있는 값을 그대로(문자를 숫자로만 바꾸어) 제작함.
* 피처값들의 스케일을 고려하면(예: 표준화) 모델의 성능이 향상될 수 있음.
* 데이터의 특성을 반영한 모델을 생성하는 것이 데이터분석가의 역할  
  → 분석가들이 도메인을 잘 알아야 하는 이유

## pyspark의 mllib 설명
- 분산 객체(데이터)를 기반으로 학습하고 예측하는 모델.
- 분산되어있다는 전제 하에서 출발

In [38]:
# Stop the Spark session now that the analysis is finished.
spark.stop()