

# 기계학습

ML에 사용할 수 있도록 변환한 데이터들을 다양한 모델을 활용하여 지도학습을 통해 예측을 수행해보자
해당 Jupyter Notebook에서는 다음 내용들을 다룬다.

1. 서포트 벡터머신(Support Vector Machine)
2. 로지스틱 회귀(Logistic Regression)
3. 네이브 베이시안(Naive Bayes)
4. 의사결정 트리(Decision Tree)
<br>
<hr>

In [1]:
import os, sys
import pyspark
myConf=pyspark.SparkConf()
spark = pyspark.sql.SparkSession.builder\
    .master("local")\
    .appName("myApp")\
    .config(conf=myConf)\
    .getOrCreate()

23/12/19 04:43:55 WARN Utils: Your hostname, sojaehwiui-MacBookPro.local resolves to a loopback address: 127.0.0.1; using 172.30.1.29 instead (on interface en0)
23/12/19 04:43:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/19 04:43:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
import os
fsvm=os.path.join(os.getcwd(),'data','sample_libsvm_data.txt')
dfsvm = spark.read.format("libsvm").load(fsvm)

23/12/19 04:43:56 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


In [3]:
dfsvm.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [4]:
dfsvm.take(1) #Label과 Features로 구성되어 있다. Label + Features(Dictionary)

[Row(label=0.0, features=SparseVector(692, {127: 51.0, 128: 159.0, 129: 253.0, 130: 159.0, 131: 50.0, 154: 48.0, 155: 238.0, 156: 252.0, 157: 252.0, 158: 252.0, 159: 237.0, 181: 54.0, 182: 227.0, 183: 253.0, 184: 252.0, 185: 239.0, 186: 233.0, 187: 252.0, 188: 57.0, 189: 6.0, 207: 10.0, 208: 60.0, 209: 224.0, 210: 252.0, 211: 253.0, 212: 252.0, 213: 202.0, 214: 84.0, 215: 252.0, 216: 253.0, 217: 122.0, 235: 163.0, 236: 252.0, 237: 252.0, 238: 252.0, 239: 253.0, 240: 252.0, 241: 252.0, 242: 96.0, 243: 189.0, 244: 253.0, 245: 167.0, 262: 51.0, 263: 238.0, 264: 253.0, 265: 253.0, 266: 190.0, 267: 114.0, 268: 253.0, 269: 228.0, 270: 47.0, 271: 79.0, 272: 255.0, 273: 168.0, 289: 48.0, 290: 238.0, 291: 252.0, 292: 252.0, 293: 179.0, 294: 12.0, 295: 75.0, 296: 121.0, 297: 21.0, 300: 253.0, 301: 243.0, 302: 50.0, 316: 38.0, 317: 165.0, 318: 253.0, 319: 233.0, 320: 208.0, 321: 84.0, 328: 253.0, 329: 252.0, 330: 165.0, 343: 7.0, 344: 178.0, 345: 252.0, 346: 240.0, 347: 71.0, 348: 19.0, 349: 28.0

### Label to String

In [5]:
from pyspark.sql.types import StringType, IntegerType
_df=dfsvm.withColumn('labelStr', dfsvm.label.cast(IntegerType()).cast(StringType()))
_df.show(5)

+-----+--------------------+--------+
|label|            features|labelStr|
+-----+--------------------+--------+
|  0.0|(692,[127,128,129...|       0|
|  1.0|(692,[158,159,160...|       1|
|  1.0|(692,[124,125,126...|       1|
|  1.0|(692,[152,153,154...|       1|
|  1.0|(692,[151,152,153...|       1|
+-----+--------------------+--------+
only showing top 5 rows



### MLUtils를 이용하여 RDD 읽기

In [6]:
from pyspark.mllib.util import MLUtils

data = MLUtils.loadLibSVMFile(spark.sparkContext, fsvm)
label = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)

label.take(5)

                                                                                

[0.0, 1.0, 1.0, 1.0, 1.0]

In [7]:
features.take(1)

[Stage 5:>                                                          (0 + 1) / 1]                                                                                

[SparseVector(692, {127: 51.0, 128: 159.0, 129: 253.0, 130: 159.0, 131: 50.0, 154: 48.0, 155: 238.0, 156: 252.0, 157: 252.0, 158: 252.0, 159: 237.0, 181: 54.0, 182: 227.0, 183: 253.0, 184: 252.0, 185: 239.0, 186: 233.0, 187: 252.0, 188: 57.0, 189: 6.0, 207: 10.0, 208: 60.0, 209: 224.0, 210: 252.0, 211: 253.0, 212: 252.0, 213: 202.0, 214: 84.0, 215: 252.0, 216: 253.0, 217: 122.0, 235: 163.0, 236: 252.0, 237: 252.0, 238: 252.0, 239: 253.0, 240: 252.0, 241: 252.0, 242: 96.0, 243: 189.0, 244: 253.0, 245: 167.0, 262: 51.0, 263: 238.0, 264: 253.0, 265: 253.0, 266: 190.0, 267: 114.0, 268: 253.0, 269: 228.0, 270: 47.0, 271: 79.0, 272: 255.0, 273: 168.0, 289: 48.0, 290: 238.0, 291: 252.0, 292: 252.0, 293: 179.0, 294: 12.0, 295: 75.0, 296: 121.0, 297: 21.0, 300: 253.0, 301: 243.0, 302: 50.0, 316: 38.0, 317: 165.0, 318: 253.0, 319: 233.0, 320: 208.0, 321: 84.0, 328: 253.0, 329: 252.0, 330: 165.0, 343: 7.0, 344: 178.0, 345: 252.0, 346: 240.0, 347: 71.0, 348: 19.0, 349: 28.0, 356: 253.0, 357: 252.0

### Train, Test Set 분할하기

In [8]:
train, test = dfsvm.randomSplit([0.6,0.4])
dfsvm.groupby('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|   43|
|  1.0|   57|
+-----+-----+



## 모델 적용하기

### 1. Support Vector Machine

In [9]:
from pyspark.ml.classification import LinearSVC
lsvc = LinearSVC(maxIter=10, regParam=0.1) #모델 파라미터 설정

In [10]:
lsvcModel = lsvc.fit(train) #훈련데이터에 대해서 훈련

23/12/19 04:44:04 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/12/19 04:44:04 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [11]:
testDf = lsvcModel.transform(test) #테스트셋에 대해서 검증

In [12]:
testDf.select('label', 'prediction').show(100)

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
+-----+----------+



### 2. 로지스틱 회귀
이진 분류를 수행해보자

In [13]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(train)
testDf = lrModel.transform(test)
testDf.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [14]:
testDf.select('label','rawPrediction','probability','prediction').show(100)

+-----+--------------------+--------------------+----------+
|label|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+----------+
|  0.0|[0.97401772358881...|[0.72591959058934...|       0.0|
|  0.0|[0.60872904769431...|[0.64765082619373...|       0.0|
|  0.0|[0.99306355311509...|[0.72969260659674...|       0.0|
|  0.0|[1.00268802621131...|[0.73158674830029...|       0.0|
|  0.0|[0.91463076375201...|[0.71394682481350...|       0.0|
|  0.0|[0.99978333499413...|[0.73101597757179...|       0.0|
|  0.0|[0.81270585248836...|[0.69268580514083...|       0.0|
|  0.0|[0.99119625838965...|[0.72932414101630...|       0.0|
|  0.0|[0.94748028228275...|[0.72060816021990...|       0.0|
|  0.0|[0.95876875042810...|[0.72287522073720...|       0.0|
|  0.0|[0.82460695680365...|[0.69521339313490...|       0.0|
|  0.0|[0.95562130296550...|[0.72224426108532...|       0.0|
|  0.0|[0.96053938098069...|[0.72322978518508...|       0.0|
|  0.0|[0.87098315358903

### 3. Naive Bayes
조건부 베이지안 확률에 따라 분류하는 모델

In [15]:
from pyspark.ml.classification import NaiveBayes

nb=NaiveBayes(featuresCol='features', labelCol='label', modelType='multinomial', predictionCol='prediction')
#nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
model = nb.fit(train)

predictions=model.transform(test)

predictions.select('label', 'prediction').show(100)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
+-----+----------+

Test set accuracy = 1.0


### 4. Decision Tree
조건에 따라 분기하는 의사결정 모델

In [16]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

dfsvm = spark.read.format("libsvm").load("data/sample_libsvm_data.txt")

labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(dfsvm)

# maxCategories > 4보다 크면 연속값
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(dfsvm)

23/12/19 04:44:06 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


In [17]:
(train, test) = dfsvm.randomSplit([0.7, 0.3])

dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

model = pipeline.fit(train)

predictions = model.transform(test)

predictions.select("prediction", "indexedLabel", "features").show(100)

evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

treeModel = model.stages[2]

print(treeModel)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       1.0|         1.0|(692,[100,101,102...|
|       1.0|         1.0|(692,[121,122,123...|
|       1.0|         1.0|(692,[122,123,124...|
|       1.0|         1.0|(692,[123,124,125...|
|       1.0|         1.0|(692,[124,125,126...|
|       1.0|         1.0|(692,[124,125,126...|
|       1.0|         1.0|(692,[124,125,126...|
|       1.0|         1.0|(692,[126,127,128...|
|       1.0|         1.0|(692,[126,127,128...|
|       1.0|         1.0|(692,[126,127,128...|
|       1.0|         1.0|(692,[126,127,128...|
|       1.0|         1.0|(692,[127,128,129...|
|       1.0|         1.0|(692,[128,129,130...|
|       1.0|         1.0|(692,[129,130,131...|
|       1.0|         1.0|(692,[152,153,154...|
|       0.0|         0.0|(692,[97,98,99,12...|
|       0.0|         0.0|(692,[119,120,121...|
|       0.0|         0.0|(692,[123,124,125...|
|       0.0| 