In [1]:
#P14 In-Class 15481075 이재근
# 특정한 하나의 예측 방법이 아니라 복수의 예측 모형을 결합하여 더 나은 성능의 예측을 시도해보았다. 모형결합 방법을 통해 단일모형일 때보다 성능 분산을 줄일 수 있었던 것 같다.

In [2]:
import findspark

In [3]:
# Point findspark at the local Spark installation so `pyspark` becomes importable.
# Raw string: the Windows path contains a backslash followed by 's', which is an
# invalid escape sequence in a normal string literal (warning today, error in
# future Python versions). The resulting path value is unchanged.
findspark.init(r'C:\spark-2.4.5-bin-hadoop2.7')

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Create the Spark session for this notebook (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName('dogfood')
    .getOrCreate()
)

In [6]:
# Load the dog-food dataset; the first line is a header and column types are inferred.
data = spark.read.csv(
    'dog_food.csv',
    header=True,
    inferSchema=True,
)

In [7]:
# Show the column names and the types Spark inferred from the CSV.
data.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [8]:
# Peek at the first row to sanity-check the load.
data.head()

Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0)

In [9]:
# Summary statistics (count, mean, stddev, min, max) for every column.
data.describe().show()

+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|                 A|                 B|                 C|                 D|            Spoiled|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|               490|               490|               490|               490|                490|
|   mean|  5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|
| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|
|    min|                 1|                 1|               5.0|                 1|                0.0|
|    max|                10|                10|              14.0|                10|                1.0|
+-------+------------------+------------------+------------------+------------------+-------------------+



In [10]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [11]:
# List the column names; A-D are used as features below and 'Spoiled' as the label.
data.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [12]:
# Combine the four preservative columns into a single 'features' vector column,
# which is the input format the Spark ML classifiers below are configured to read.
assembler = VectorAssembler(
    inputCols=['A', 'B', 'C', 'D'],
    outputCol='features',
)

In [13]:
# Apply the assembler: `output` is `data` plus the new 'features' vector column.
output = assembler.transform(data)

In [14]:
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier

In [15]:
# Classifier that predicts 'Spoiled' from the assembled feature vector.
# NOTE(review): despite the name `rfc`, this is a single DecisionTreeClassifier,
# not a RandomForestClassifier (which is imported but not used here). Confirm
# which estimator is intended; the name is kept because later cells reference it.
rfc = DecisionTreeClassifier(
    labelCol='Spoiled',
    featuresCol='features',
)

In [16]:
# Confirm that the 'features' vector column was added by the assembler.
output.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- features: vector (nullable = true)



In [17]:
# Keep only the two columns the classifier needs: the feature vector and the label.
final_data = output.select('features','Spoiled')

In [18]:
# Inspect the first row of the modelling frame.
final_data.head()

Row(features=DenseVector([4.0, 2.0, 12.0, 3.0]), Spoiled=1.0)

In [19]:
# Fit the tree on the full dataset. No train/test split is made in this section,
# so any metric computed from this model on `final_data` is an in-sample figure.
rfc_model = rfc.fit(final_data)

In [20]:
# Feature importances of the fitted tree. Indices presumably follow the
# assembler's inputCols order (0=A, 1=B, 2=C, 3=D) -- confirm against the output.
rfc_model.featureImportances

SparseVector(4, {1: 0.0019, 2: 0.9832, 3: 0.0149})

In [21]:
# Score the same rows the model was fit on (there is no separate hold-out set here).
predictions = rfc_model.transform(final_data)

In [22]:
# Show the columns added by the model to the scored frame.
predictions.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [23]:
# Compare predicted and actual labels side by side for the first rows.
predictions.select('prediction','Spoiled','features').show()

+----------+-------+-------------------+
|prediction|Spoiled|           features|
+----------+-------+-------------------+
|       1.0|    1.0| [4.0,2.0,12.0,3.0]|
|       1.0|    1.0| [5.0,6.0,12.0,7.0]|
|       1.0|    1.0| [6.0,2.0,13.0,6.0]|
|       1.0|    1.0| [4.0,2.0,12.0,1.0]|
|       1.0|    1.0| [4.0,2.0,12.0,3.0]|
|       1.0|    1.0|[10.0,3.0,13.0,9.0]|
|       1.0|    1.0| [8.0,5.0,14.0,5.0]|
|       1.0|    1.0| [5.0,8.0,12.0,8.0]|
|       1.0|    1.0| [6.0,5.0,12.0,9.0]|
|       1.0|    1.0| [3.0,3.0,12.0,1.0]|
|       1.0|    1.0| [9.0,8.0,11.0,3.0]|
|       1.0|    1.0|[1.0,10.0,12.0,3.0]|
|       1.0|    1.0|[1.0,5.0,13.0,10.0]|
|       1.0|    1.0|[2.0,10.0,12.0,6.0]|
|       1.0|    1.0|[1.0,10.0,11.0,4.0]|
|       1.0|    1.0| [5.0,3.0,12.0,2.0]|
|       1.0|    1.0| [4.0,9.0,11.0,8.0]|
|       1.0|    1.0| [5.0,1.0,11.0,1.0]|
|       1.0|    1.0|[4.0,9.0,12.0,10.0]|
|       0.0|    1.0| [5.0,8.0,10.0,9.0]|
+----------+-------+-------------------+
only showing top 20 rows

In [24]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [25]:
from pyspark.ml.classification import RandomForestClassifier

In [26]:
# Accuracy evaluator for the dog-food model: compares 'prediction' against 'Spoiled'.
evaluator = MulticlassClassificationEvaluator(
    labelCol='Spoiled',
    predictionCol='prediction',
    metricName='accuracy',
)

In [27]:
# `predictions` was produced by scoring `final_data`, the same rows the model was
# fit on, so this figure is the training (in-sample) error rather than a held-out
# test error. The label is corrected to say so.
accuracy = evaluator.evaluate(predictions)
print("Training Error = %g" % (1.0 - accuracy))

Test Error = 0.0142857


In [28]:
from pyspark.ml.classification import GBTClassifier

In [29]:
# Load the libsvm-format sample dataset for the gradient-boosted-tree example.
# NOTE: this rebinds `data`, which previously held the dog-food CSV frame.
data = (
    spark.read
    .format('libsvm')
    .load('sample_libsvm_data.txt')
)

In [30]:
# 70/30 train/test split. A fixed seed makes the split -- and therefore every
# metric computed downstream -- reproducible across kernel restarts.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

In [31]:
# Gradient-boosted tree classifier configured with 10 boosting iterations.
gbt = GBTClassifier(
    labelCol='label',
    featuresCol='features',
    maxIter=10,
)

In [32]:
# Fit the GBT classifier on the training split.
# NOTE(review): `rfc_model` now holds a gradient-boosted-tree model and overwrites
# the earlier dog-food model; the name is kept because the next cell reads it.
rfc_model = gbt.fit(trainingData)

In [33]:
# Score the held-out test split with the fitted GBT model.
predictions = rfc_model.transform(testData)

In [34]:
# Compare predicted and actual labels on the test split.
predictions.select('prediction','label','features').show()

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[121,122,123...|
|       0.0|  0.0|(692,[122,123,124...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[125,126,127...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[127,128,129...|
|       0.0|  0.0|(692,[129,130,131...|
|       0.0|  0.0|(692,[152,153,154...|
|       0.0|  0.0|(692,[152,153,154...|
|       0.0|  0.0|(692,[152,153,154...|
|       0.0|  0.0|(692,[153,154,155...|
|       0.0|  0.0|(692,[181,182,183...|
|       1.0|  1.0|(692,[124,125,126...|
|       1.0|  1.0|(692,[124,125,126...|
|       1.0|  1.0|(692,[125,126,127...|
|       1.0|  1.0|(692,[125,126,153...|
|       1.0|  1.0|(692,[126,127,128...|
|       1.0|  1.0|(692,[127,128,129...|
|       1.0|  1.0|(692,[127,128,129...|
+----------+-----+--------------------+
only showing top 20 rows



In [35]:
# Accuracy evaluator for the libsvm example: compares 'prediction' against 'label'.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)

In [36]:
# Accuracy of the GBT model on the test-split predictions.
accuracy = evaluator.evaluate(predictions)

In [37]:
# Display the test-split accuracy.
accuracy

1.0

In [38]:
# Report the misclassification rate (1 - accuracy) on the held-out test split.
test_error = 1.0 - accuracy
print("Test Error = %g" % test_error)

Test Error = 0
