In [1]:
from pyspark import SparkConf, SparkContext

from pyspark.sql import SparkSession

from pyspark.ml import Pipeline

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.classification import LogisticRegression

from datetime import datetime
import os

## CONSTANTS

In [2]:
# Runtime configuration: each value can be overridden through an environment
# variable and otherwise falls back to the default given here.
TRAINING_FILE = os.environ.get("TRAINING_FILE", "spam_ham_dataset.csv")
# NOTE: the constant is called MODEL_NAME but it is read from the MODEL_PATH variable.
MODEL_NAME = os.environ.get("MODEL_PATH", "model")
SPARK_MASTER = os.environ.get("SPARK_MASTER", "spark://cm1:7077")
# Application name passed to the Spark configuration.
SPARK_APP_NAME = "Exemplo 01 - E-mail SPAM - Train"

## Startup

In [3]:
# Describe the application and the cluster master it should connect to.
conf = (
    SparkConf()
    .setMaster(SPARK_MASTER)
    .setAppName(SPARK_APP_NAME)
)

# getOrCreate() reuses the context already running in this kernel, so re-running
# this cell no longer raises "Cannot run multiple SparkContexts at once".
context = SparkContext.getOrCreate(conf=conf)
# The session builder attaches to the SparkContext obtained above.
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/01 22:47:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Ingest Dataset

In [4]:
# Load the labelled e-mail dataset from CSV.
# - header: the first row holds the column names
# - multiLine: a quoted field may span several physical lines
# - quote / escape: both are the double-quote character
# No schema or inferSchema option is set; the displayed schema shows every column as string.
training = (
    spark.read
    .options(sep=",", header="true", multiLine="true", quote="\"", escape="\"")
    .csv(TRAINING_FILE)
)
# Bare expression so the notebook displays the resulting DataFrame.
training

                                                                                

DataFrame[_c0: string, label: string, text: string, label_num: string]

### Seleciona `text` e converte `label_num` para float

In [5]:
# Keep only the message text and the numeric target, cast to FLOAT and exposed as `label`.
# NOTE: this rebinds `training`; re-running this cell without re-running the
# ingest cell fails because `label_num` is no longer among the columns.
training = training.selectExpr(
    "text",
    "CAST(label_num AS FLOAT) AS label",
)

# Preview the first rows of the prepared dataset.
training.show()

+--------------------+-----+
|                text|label|
+--------------------+-----+
|Subject: enron me...|  0.0|
|Subject: hpl nom ...|  0.0|
|Subject: neon ret...|  0.0|
|Subject: photosho...|  1.0|
|Subject: re : ind...|  0.0|
|Subject: ehronlin...|  0.0|
|Subject: spring s...|  0.0|
|Subject: looking ...|  1.0|
|Subject: noms / a...|  0.0|
|Subject: nominati...|  0.0|
|Subject: vocable ...|  1.0|
|Subject: report 0...|  1.0|
|Subject: enron / ...|  0.0|
|Subject: vic . od...|  1.0|
|Subject: tenaska ...|  0.0|
|Subject: underpri...|  1.0|
|Subject: re : fir...|  0.0|
|Subject: swift - ...|  0.0|
|Subject: meter va...|  0.0|
|Subject: addition...|  0.0|
+--------------------+-----+
only showing top 20 rows



                                                                                

## Arquitetura do modelo
### Extração, transformação e seleção

In [6]:
# Stage 1: split the raw `text` column into a `words` column.
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)
# Stage 2: hash the tokenizer's output column into a `features` column.
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)
# Stage 3: logistic regression classifier created with its default parameters.
logisticRegresion = LogisticRegression()

### Pipeline

In [7]:
# Chain the three stages in the order they are applied.
pipeline = Pipeline(
    stages=[
        tokenizer,
        hashingTF,
        logisticRegresion,
    ]
)

## Treinamento

In [8]:
# cache() marks the DataFrame for caching and returns that same DataFrame,
# so it can be handed straight to fit().
model = pipeline.fit(training.cache())

[Stage 3:>                                                          (0 + 1) / 1]

23/05/01 22:47:22 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/05/01 22:47:22 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/05/01 22:47:22 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/05/01 22:47:22 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


                                                                                

## Persiste o modelo

In [9]:
# Persist the fitted pipeline. overwrite() replaces a model left at MODEL_NAME by
# a previous run; a plain save() raises when the path already exists, which made
# a second "Restart & Run All" fail on this cell.
model.write().overwrite().save(MODEL_NAME)

                                                                                