In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=63878ae54c3cddf457c5f3b0423d0171231c3e1a357bc0b9a164268e45eccf8e
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


# PySpark - Modelo de Classificação


In [4]:
# libs
from pyspark.sql import SparkSession

from pyspark.ml.feature import RFormula
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

## Criando a Session PySpark

In [5]:
spark = SparkSession.builder.appName('ryspark_modelo_classificacao')\
        .config('spark.master', 'local')\
        .config('spark.executor.memory', '2gb')\
        .config('spark.shuffle.sql.partitions', 2)\
        .getOrCreate()

## Importação da base

In [17]:
churn = spark.read.csv(
    '/content/Churn.csv',
    inferSchema = True,
    header = True,
    sep = ';')

In [18]:
# visualizando os dados
churn.show(5)

+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|
|        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|
|        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|
|        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|
|        850|    Spain|Female| 43|     2|12551082|            1|        1|             1|         790841|     0|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+-------

## Criando o objeto RFormula

In [19]:
formula = RFormula(
    formula = "Exited ~ .",
    featuresCol = 'features', # nome da coluna dos dados de entrada
    labelCol = 'label', # nome da coluna do dado de saida
    handleInvalid = 'skip'
)

## Treinando e Transformando a base

In [20]:
churn_transform = formula.fit(churn).transform(churn).select('features', 'label')

In [21]:
churn_transform.show(truncate = False)

+----------------------------------------------------------------+-----+
|features                                                        |label|
+----------------------------------------------------------------+-----+
|[619.0,1.0,0.0,0.0,42.0,2.0,0.0,1.0,1.0,1.0,1.0134888E7]        |1.0  |
|[608.0,0.0,0.0,0.0,41.0,1.0,8380786.0,1.0,0.0,1.0,1.1254258E7]  |0.0  |
|[502.0,1.0,0.0,0.0,42.0,8.0,1596608.0,3.0,1.0,0.0,1.1393157E7]  |1.0  |
|(11,[0,1,4,5,7,10],[699.0,1.0,39.0,1.0,2.0,9382663.0])          |0.0  |
|[850.0,0.0,0.0,0.0,43.0,2.0,1.2551082E7,1.0,1.0,1.0,790841.0]   |0.0  |
|[645.0,0.0,0.0,1.0,44.0,8.0,1.1375578E7,2.0,1.0,0.0,1.4975671E7]|1.0  |
|[822.0,1.0,0.0,1.0,50.0,7.0,0.0,2.0,1.0,1.0,100628.0]           |0.0  |
|[376.0,0.0,1.0,0.0,29.0,4.0,1.1504674E7,4.0,1.0,0.0,1.1934688E7]|1.0  |
|[501.0,1.0,0.0,1.0,44.0,4.0,1.4205107E7,2.0,0.0,1.0,749405.0]   |0.0  |
|[684.0,1.0,0.0,1.0,27.0,2.0,1.3460388E7,1.0,1.0,1.0,7172573.0]  |0.0  |
|[528.0,1.0,0.0,1.0,31.0,6.0,1.0201672E7,2.0,0.0,0.

## Dividindo os dados em Treino e Teste
- Treino (70%)
- Teste (30%)

In [23]:
churn_treino, churn_teste = churn_transform.randomSplit([0.7, 0.3])

In [24]:
print(churn_treino.count())
print(churn_teste.count())

7015
2985


## Criando o objeto do modelo DecisionTreeClassifier

In [25]:
decision_tree = DecisionTreeClassifier(
    labelCol = 'label',
    featuresCol = 'features'
)

## Treinando o modelo DecisionTreeClassifier

In [26]:
modelo = decision_tree.fit(churn_treino)

## Criando a previsão
- sempre com os dados de testes

In [27]:
previsao = modelo.transform(churn_teste)

### Entendimento da previsão
- probability [valor1, valor2]
    - retorna uma lista contendo dois valores
        - 1º valor: probabilidade de ser 0
        - 2º valor: probabilidade de ser 1

- label
    - dados reais da nossa base

- prediction
    - retorna a previsão 0 ou 1 com base na variável 'probability'



In [30]:
previsao.show(20, truncate = False)

+---------------------------------------------------------+-----+--------------+----------------------------------------+----------+
|features                                                 |label|rawPrediction |probability                             |prediction|
+---------------------------------------------------------+-----+--------------+----------------------------------------+----------+
|(11,[0,1,3,4,7,10],[794.0,1.0,1.0,33.0,2.0,1.7812271E7]) |0.0  |[4363.0,516.0]|[0.8942406230784997,0.10575937692150031]|0.0       |
|(11,[0,1,4,5,7,10],[411.0,1.0,36.0,10.0,1.0,1.2069435E7])|0.0  |[4363.0,516.0]|[0.8942406230784997,0.10575937692150031]|0.0       |
|(11,[0,1,4,5,7,10],[474.0,1.0,30.0,9.0,2.0,6315822.0])   |0.0  |[4363.0,516.0]|[0.8942406230784997,0.10575937692150031]|0.0       |
|(11,[0,1,4,5,7,10],[499.0,1.0,57.0,1.0,1.0,1.3137238E7]) |1.0  |[33.0,225.0]  |[0.12790697674418605,0.872093023255814] |1.0       |
|(11,[0,1,4,5,7,10],[515.0,1.0,28.0,9.0,2.0,9414175.0])   |0.0  |[436

## Avaliando a performance do modelo

#### Criando o objeto de avaliação BinaryClassificationEvaluator

In [31]:
avaliar = BinaryClassificationEvaluator(
    rawPredictionCol = 'prediction',
    labelCol = 'label',
    metricName = 'areaUnderROC'
)

In [32]:
area_under_roc = avaliar.evaluate(previsao)

In [33]:
print(area_under_roc)

0.6706302794473913
