In [2]:
# Display the SparkSession object.
# NOTE(review): `spark` is never created in the visible cells — presumably injected by the kernel; confirm.
spark

In [3]:
# Display the `sc` object.
# NOTE(review): presumably the SparkContext injected by the kernel; it is not defined in the visible cells.
sc

In [4]:
# List the input files available in the iris data directory on HDFS.
!hdfs dfs -ls /user/feliciani/data/iris

Found 3 items
-rw-r--r--   3 root supergroup       4702 2021-06-23 18:23 /user/feliciani/data/iris/bezdekIris.data
-rw-r--r--   3 root supergroup       4702 2021-06-23 18:23 /user/feliciani/data/iris/iris.data
-rw-r--r--   3 root supergroup       3067 2021-06-23 18:23 /user/feliciani/data/iris/iris.names


In [6]:
# The bezdekIris file has 150 rows.
iris_bezdekIris = spark.read.csv("/user/feliciani/data/iris/bezdekIris.data")
# Count the DataFrame that was just loaded. The original counted `iris`,
# which is not defined at this point when the notebook runs top to bottom.
iris_bezdekIris.count()

150

In [7]:
# The iris.data file has 150 rows.
# Stored under its own name so the bezdekIris DataFrame is not overwritten
# with the contents of a different file.
iris_data = spark.read.csv("/user/feliciani/data/iris/iris.data")
# Count the DataFrame that was just loaded (the original counted `iris`,
# which this cell never assigns).
iris_data.count()

150

In [8]:
# Reading both files (bezdekIris and iris) through the *.data glob gives 300 rows in total.
iris = spark.read.csv("/user/feliciani/data/iris/*.data")
iris.count()

300

#### Ler os arquivos csv “hdfs://namenode:8020/user/<nome>/data/iris/*.data” em modo streaming com o seguinte schema:

##### sepal_length float
##### sepal_width float
##### petal_length float
##### petal_width float
##### class string

In [9]:
from pyspark.sql.types import StructType

In [10]:
# Explicit schema for the iris files: four float measurements followed by
# the class label as a string.
iris_schema = (
    StructType()
    .add("sepal_length", "float")
    .add("sepal_width", "float")
    .add("petal_length", "float")
    .add("petal_width", "float")
    .add("class", "string")
)

In [11]:
# Print the schema object to check the declared field names and types.
print(iris_schema)

StructType(List(StructField(sepal_length,FloatType,true),StructField(sepal_width,FloatType,true),StructField(petal_length,FloatType,true),StructField(petal_width,FloatType,true),StructField(class,StringType,true)))


In [5]:
# Read the contents of every .data file (no schema applied) and preview the first 5 rows.
# show() only prints and returns None, so its result is not assigned to `iris`
# (the original assignment left `iris` bound to None).
spark.read.csv("/user/feliciani/data/iris/*.data").show(5)

+---+---+---+---+-----------+
|_c0|_c1|_c2|_c3|        _c4|
+---+---+---+---+-----------+
|5.1|3.5|1.4|0.2|Iris-setosa|
|4.9|3.0|1.4|0.2|Iris-setosa|
|4.7|3.2|1.3|0.2|Iris-setosa|
|4.6|3.1|1.5|0.2|Iris-setosa|
|5.0|3.6|1.4|0.2|Iris-setosa|
+---+---+---+---+-----------+
only showing top 5 rows



In [12]:
# Apply the schema created for the .data files while reading them, and preview 10 rows.
# show() only prints and returns None, so its result is not assigned to `iris`
# (the original assignment left `iris` bound to None).
spark.read.schema(iris_schema).csv("/user/feliciani/data/iris/*.data").show(10)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|      class|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|
|         4.9|        3.1|         1.5|        0.1|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 10 rows



In [13]:
# The batch read is correct once the schema is defined: confirm the column types.
# printSchema() only prints and returns None, so its result is not assigned to `iris`
# (the original assignment left `iris` bound to None).
spark.read.schema(iris_schema).csv("/user/feliciani/data/iris/*.data").printSchema()

root
 |-- sepal_length: float (nullable = true)
 |-- sepal_width: float (nullable = true)
 |-- petal_length: float (nullable = true)
 |-- petal_width: float (nullable = true)
 |-- class: string (nullable = true)



In [17]:
# Open the same .data files as a streaming source, applying the explicit schema.
iris = (
    spark.readStream
    .schema(iris_schema)
    .csv("/user/feliciani/data/iris/*.data")
)

#### Visualizar o schema das informações

In [18]:
# Print the schema of the streaming DataFrame.
iris.printSchema()

root
 |-- sepal_length: float (nullable = true)
 |-- sepal_width: float (nullable = true)
 |-- petal_length: float (nullable = true)
 |-- petal_width: float (nullable = true)
 |-- class: string (nullable = true)



#### Salvar os dados no diretório “hdfs://namenode:8020/user/<nome>/stream_iris/path” e o checkpoint em “hdfs://namenode:8020/user/<nome>/stream_iris/check”

In [20]:
# Start the streaming query: write the stream as CSV files to the output
# path, keeping the checkpoint data in its own directory.
iris_saida = (
    iris.writeStream
    .format("csv")
    .option("checkpointLocation", "/user/feliciani/stream_iris/check")
    .option("path", "/user/feliciani/stream_iris/path")
    .start()
)

In [21]:
# Id of the streaming query.
iris_saida.id

'56049837-9fb6-4f1f-a9d0-91eecfa7d61d'

In [24]:
# Show the most recent progress report of the streaming query.
iris_saida.lastProgress

{'id': '56049837-9fb6-4f1f-a9d0-91eecfa7d61d',
 'runId': 'a3aa30b2-7342-4d25-815e-368cc683313c',
 'name': None,
 'timestamp': '2021-07-04T21:20:49.615Z',
 'batchId': 1,
 'numInputRows': 0,
 'inputRowsPerSecond': 0.0,
 'processedRowsPerSecond': 0.0,
 'durationMs': {'getOffset': 5, 'triggerExecution': 5},
 'stateOperators': [],
 'sources': [{'description': 'FileStreamSource[hdfs://namenode:8020/user/feliciani/data/iris/*.data]',
   'startOffset': {'logOffset': 0},
   'endOffset': {'logOffset': 0},
   'numInputRows': 0,
   'inputRowsPerSecond': 0.0,
   'processedRowsPerSecond': 0.0}],
 'sink': {'description': 'FileSink[/user/feliciani/stream_iris/path]'}}

In [26]:
# Show the current status of the streaming query.
iris_saida.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

#### Verificar a saída no hdfs e entender como os dados foram salvos

In [27]:
# Checkpoint directory: contains the metadata.

!hdfs dfs -ls /user/feliciani/stream_iris/check

Found 4 items
drwxr-xr-x   - root supergroup          0 2021-07-04 20:27 /user/feliciani/stream_iris/check/commits
-rw-r--r--   3 root supergroup         45 2021-07-04 20:27 /user/feliciani/stream_iris/check/metadata
drwxr-xr-x   - root supergroup          0 2021-07-04 20:27 /user/feliciani/stream_iris/check/offsets
drwxr-xr-x   - root supergroup          0 2021-07-04 20:27 /user/feliciani/stream_iris/check/sources


In [28]:
# Output directory: contains the data.

!hdfs dfs -ls /user/feliciani/stream_iris/path

# For each input file that was sent, a new output file was created.

Found 3 items
drwxr-xr-x   - root supergroup          0 2021-07-04 20:27 /user/feliciani/stream_iris/path/_spark_metadata
-rw-r--r--   2 root supergroup       4550 2021-07-04 20:27 /user/feliciani/stream_iris/path/part-00000-fd6b9507-e8d1-4e4e-a27e-fbd293f993ab-c000.csv
-rw-r--r--   2 root supergroup       4550 2021-07-04 20:27 /user/feliciani/stream_iris/path/part-00001-2cdb8190-4f88-4cbe-ac86-aed8e609a7ac-c000.csv


In [29]:
# Same content as the input file.
# The part file is matched by its index with a glob: the UUID in the file name
# is generated on every run, so a hard-coded name fails when the stream is re-run.
spark.read.csv("/user/feliciani/stream_iris/path/part-00000-*.csv").show(10)

+---+---+---+---+-----------+
|_c0|_c1|_c2|_c3|        _c4|
+---+---+---+---+-----------+
|5.1|3.5|1.4|0.2|Iris-setosa|
|4.9|3.0|1.4|0.2|Iris-setosa|
|4.7|3.2|1.3|0.2|Iris-setosa|
|4.6|3.1|1.5|0.2|Iris-setosa|
|5.0|3.6|1.4|0.2|Iris-setosa|
|5.4|3.9|1.7|0.4|Iris-setosa|
|4.6|3.4|1.4|0.3|Iris-setosa|
|5.0|3.4|1.5|0.2|Iris-setosa|
|4.4|2.9|1.4|0.2|Iris-setosa|
|4.9|3.1|1.5|0.1|Iris-setosa|
+---+---+---+---+-----------+
only showing top 10 rows



In [30]:
# Same number of rows as the input file.
# The part file is matched by its index with a glob: the UUID in the file name
# is generated on every run, so a hard-coded name fails when the stream is re-run.
spark.read.csv("/user/feliciani/stream_iris/path/part-00000-*.csv").count()

150

In [31]:
# Same content as the input file.
# The part file is matched by its index with a glob: the UUID in the file name
# is generated on every run, so a hard-coded name fails when the stream is re-run.
spark.read.csv("/user/feliciani/stream_iris/path/part-00001-*.csv").show(10)

+---+---+---+---+-----------+
|_c0|_c1|_c2|_c3|        _c4|
+---+---+---+---+-----------+
|5.1|3.5|1.4|0.2|Iris-setosa|
|4.9|3.0|1.4|0.2|Iris-setosa|
|4.7|3.2|1.3|0.2|Iris-setosa|
|4.6|3.1|1.5|0.2|Iris-setosa|
|5.0|3.6|1.4|0.2|Iris-setosa|
|5.4|3.9|1.7|0.4|Iris-setosa|
|4.6|3.4|1.4|0.3|Iris-setosa|
|5.0|3.4|1.5|0.2|Iris-setosa|
|4.4|2.9|1.4|0.2|Iris-setosa|
|4.9|3.1|1.5|0.1|Iris-setosa|
+---+---+---+---+-----------+
only showing top 10 rows



In [32]:
# Row count of the second output part file.
# The part file is matched by its index with a glob: the UUID in the file name
# is generated on every run, so a hard-coded name fails when the stream is re-run.
spark.read.csv("/user/feliciani/stream_iris/path/part-00001-*.csv").count()

150