# Importando Bibliotecas

In [75]:
import pandas as pd

from sklearn.datasets import load_iris
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [40]:
# Create the Spark session used by this notebook (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName('iris_ml')
    .getOrCreate()
)

In [53]:
# Explicit Spark schema: the class label first, followed by the four measurements.
# A parenthesized chain replaces the backslash continuations; the original chain
# ended in a dangling "\" that only parsed because a blank line followed it.
schema = (
    StructType()
    .add('target', 'integer')
    .add('sepal length (cm)', 'float')
    .add('sepal width (cm)', 'float')
    .add('petal length (cm)', 'float')
    .add('petal width (cm)', 'float')
)

# Load the dataset once (instead of calling load_iris() three times) and build a
# pandas frame whose column order matches the schema: label, then features.
iris_bunch = load_iris()
iris = pd.DataFrame(iris_bunch.data, columns=iris_bunch.feature_names)
iris.insert(0, 'target', iris_bunch.target)

# Convert the pandas frame to a Spark DataFrame with the explicit schema.
iris_spark = spark.createDataFrame(iris, schema=schema)

In [54]:
# Display each column's name, Spark type and nullability.
iris_spark.printSchema()

root
 |-- target: integer (nullable = true)
 |-- sepal length (cm): float (nullable = true)
 |-- sepal width (cm): float (nullable = true)
 |-- petal length (cm): float (nullable = true)
 |-- petal width (cm): float (nullable = true)



In [42]:
# Combine the four measurement columns into a single vector column named 'X'.
feature_columns = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]
vector = VectorAssembler(inputCols=feature_columns, outputCol='X')

In [69]:
# NOTE(review): this estimator is created again, with identical arguments, in the
# decision-tree section further down; this cell looks like a redundant leftover.
decision_tree = DecisionTreeClassifier(featuresCol='X', labelCol='target', seed=0)

In [78]:
# Randomly split the data into training and test sets using 0.75 / 0.25 weights
# and a fixed seed.
train, test = iris_spark.randomSplit([0.75, 0.25], seed=0)

# Modelos
## Árvore de Decisão

In [86]:
# Decision tree classifier reading the assembled 'X' vector and the 'target' label.
decision_tree = DecisionTreeClassifier(
    featuresCol='X',
    labelCol='target',
    seed=0,
)

# Chain feature assembly with the classifier, fit on the training split,
# then generate predictions for the test split.
pipe_decision_tree = Pipeline(stages=[vector, decision_tree])
model_decision_tree = pipe_decision_tree.fit(train)
previsoes_decision_tree = model_decision_tree.transform(test)

# Accuracy of the predictions against the 'target' label on the test split.
perf = MulticlassClassificationEvaluator(
    labelCol='target',
    predictionCol='prediction',
    metricName='accuracy',
)
acc = perf.evaluate(previsoes_decision_tree)
print(acc)

0.8461538461538461


## Floresta Aleatória

In [87]:
# Random forest classifier reading the assembled 'X' vector and the 'target' label.
random_forest = RandomForestClassifier(
    featuresCol='X',
    labelCol='target',
    seed=0,
)

# Chain feature assembly with the classifier, fit on the training split,
# then generate predictions for the test split.
pipe_random_forest = Pipeline(stages=[vector, random_forest])
model_random_forest = pipe_random_forest.fit(train)
previsoes_random_forest = model_random_forest.transform(test)

# Accuracy of the predictions against the 'target' label on the test split.
perform = MulticlassClassificationEvaluator(
    labelCol='target',
    predictionCol='prediction',
    metricName='accuracy',
)
acc = perform.evaluate(previsoes_random_forest)
print(acc)

0.9230769230769231


In [88]:
# Stop the Spark session created at the top of the notebook.
spark.stop()