In [15]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [16]:
# Build the Spark session used by every cell below; getOrCreate() returns an
# already-active session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName("Curso de PySpark")
    # Eager evaluation makes a bare DataFrame expression at the end of a cell
    # render as a table, so the examples need no explicit .show().
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
)

In [17]:
# Display the session object to confirm it was created.
spark

In [18]:
# Load the books dataset and keep at most 5 rows, so every example below is
# small enough to be displayed in full.
# NOTE(review): the path is an absolute, machine-specific location; it must be
# adjusted before running this notebook on another machine.
df = (
    spark.read
    .parquet("C:/Users/marlos.barros/Cursos/pyspark_na_pratica/DATASETS/LIVROS.parquet")
    .limit(5)
)

In [19]:
# Show the 5-row sample that the remaining cells operate on.
df

id,cnpj_editora,data_lancamento,ean,isbn10,numero_paginas,preco
102961160,54.317.982/0001-79,2011-09-19,4367115749184,0-06-594558-1,516,113.84
43636240,16.938.042/0001-08,2019-05-11,1239569754256,0-611-23680-X,371,207.67
52928059,57.624.038/0001-52,2003-08-03,5462162528221,1-109-29542-1,198,107.89
88482271,16.249.378/0001-63,2022-09-07,5713629047213,1-229-68842-0,632,143.95
28125446,90.425.178/0001-77,2011-07-25,2226757040245,0-657-66391-3,384,156.49


### Combina dois DataFrames ou duas tabelas
- Função: **union()** — cria um novo DataFrame **dup** unindo o DataFrame **df** consigo mesmo, duplicando as linhas. O **union()** combina as colunas pela posição (não pelo nome) e não remove linhas duplicadas.

In [20]:
# union() appends the rows of the second DataFrame to the first, matching
# columns by position and keeping duplicates; unioning df with itself
# therefore yields every row twice.
dup = df.union(df)
dup

id,cnpj_editora,data_lancamento,ean,isbn10,numero_paginas,preco
102961160,54.317.982/0001-79,2011-09-19,4367115749184,0-06-594558-1,516,113.84
43636240,16.938.042/0001-08,2019-05-11,1239569754256,0-611-23680-X,371,207.67
52928059,57.624.038/0001-52,2003-08-03,5462162528221,1-109-29542-1,198,107.89
88482271,16.249.378/0001-63,2022-09-07,5713629047213,1-229-68842-0,632,143.95
28125446,90.425.178/0001-77,2011-07-25,2226757040245,0-657-66391-3,384,156.49
102961160,54.317.982/0001-79,2011-09-19,4367115749184,0-06-594558-1,516,113.84
43636240,16.938.042/0001-08,2019-05-11,1239569754256,0-611-23680-X,371,207.67
52928059,57.624.038/0001-52,2003-08-03,5462162528221,1-109-29542-1,198,107.89
88482271,16.249.378/0001-63,2022-09-07,5713629047213,1-229-68842-0,632,143.95
28125446,90.425.178/0001-77,2011-07-25,2226757040245,0-657-66391-3,384,156.49


### Remover Duplicatas
- Função: **dropDuplicates()** Remove todas as linhas duplicadas do DataFrame resultante.

In [35]:
# With no arguments, two rows count as duplicates only when every column
# matches. The result is displayed but not assigned, so dup is unchanged.
dup.dropDuplicates()

id,cnpj_editora,data_lancamento,ean,isbn10,numero_paginas,preco
102961160,54.317.982/0001-79,2011-09-19,4367115749184,0-06-594558-1,516,113.84
43636240,16.938.042/0001-08,2019-05-11,1239569754256,0-611-23680-X,371,207.67
52928059,57.624.038/0001-52,2003-08-03,5462162528221,1-109-29542-1,198,107.89
88482271,16.249.378/0001-63,2022-09-07,5713629047213,1-229-68842-0,632,143.95
28125446,90.425.178/0001-77,2011-07-25,2226757040245,0-657-66391-3,384,156.49


### Remover Duplicatas em Colunas Específicas
- Função: **dropDuplicates([colunas])** — remove duplicatas do DataFrame **dup**, mas apenas em relação às colunas **"numero_paginas"** e **"data_lancamento"**. As linhas são consideradas duplicatas se os valores dessas colunas forem iguais, mesmo que as demais colunas sejam diferentes.

In [43]:
# Deduplicate on the (numero_paginas, data_lancamento) pair only: one row is
# kept per distinct pair, regardless of the other columns.
# NOTE(review): which of the duplicate rows survives is not specified by this
# call — confirm before relying on a particular one.
dup.dropDuplicates(["numero_paginas", "data_lancamento"])

id,cnpj_editora,data_lancamento,ean,isbn10,numero_paginas,preco
52928059,57.624.038/0001-52,2003-08-03,5462162528221,1-109-29542-1,198,107.89
43636240,16.938.042/0001-08,2019-05-11,1239569754256,0-611-23680-X,371,207.67
28125446,90.425.178/0001-77,2011-07-25,2226757040245,0-657-66391-3,384,156.49
102961160,54.317.982/0001-79,2011-09-19,4367115749184,0-06-594558-1,516,113.84
88482271,16.249.378/0001-63,2022-09-07,5713629047213,1-229-68842-0,632,143.95


### Seleção de Colunas para Embaralhamento
- Função: **select()** — cria um novo DataFrame **df_embaralhado** contendo as mesmas colunas de **df**, porém em uma ordem diferente da original (embaralhada): **"cnpj_editora"**, **"data_lancamento"**, **"id"**, **"ean"**, **"isbn10"**, **"numero_paginas"** e **"preco"**.

In [44]:
# Same columns as df, listed in a different order ("id" moved to third
# position), to set up the by-name union demonstrated next.
df_embaralhado = df.select(
    "cnpj_editora",
    "data_lancamento",
    "id",
    "ean",
    "isbn10",
    "numero_paginas",
    "preco",
)

### Union por Nome de Colunas
- Função: Faz um **union** entre **df** e **df_embaralhado**, combinando as linhas dos dois DataFrames. Como o **unionByName()** é usado, ele combina as colunas com base em seus nomes, em vez da posição.

In [45]:
# unionByName() aligns columns by name, so the different column order of
# df_embaralhado does not matter; the result follows df's column order.
df.unionByName(df_embaralhado)

id,cnpj_editora,data_lancamento,ean,isbn10,numero_paginas,preco
102961160,54.317.982/0001-79,2011-09-19,4367115749184,0-06-594558-1,516,113.84
43636240,16.938.042/0001-08,2019-05-11,1239569754256,0-611-23680-X,371,207.67
52928059,57.624.038/0001-52,2003-08-03,5462162528221,1-109-29542-1,198,107.89
88482271,16.249.378/0001-63,2022-09-07,5713629047213,1-229-68842-0,632,143.95
28125446,90.425.178/0001-77,2011-07-25,2226757040245,0-657-66391-3,384,156.49
102961160,54.317.982/0001-79,2011-09-19,4367115749184,0-06-594558-1,516,113.84
43636240,16.938.042/0001-08,2019-05-11,1239569754256,0-611-23680-X,371,207.67
52928059,57.624.038/0001-52,2003-08-03,5462162528221,1-109-29542-1,198,107.89
88482271,16.249.378/0001-63,2022-09-07,5713629047213,1-229-68842-0,632,143.95
28125446,90.425.178/0001-77,2011-07-25,2226757040245,0-657-66391-3,384,156.49


### Union com Coluna Faltante Permitida
- Função: Primeiro, cria um novo DataFrame **df_2** a partir de **df_embaralhado**, adicionando uma nova coluna **"quantidade"** com o valor constante **1**. Em seguida, faz um **unionByName()** entre **df_2** e **df_embaralhado**, permitindo que colunas ausentes em um dos DataFrames sejam tratadas com o parâmetro **allowMissingColumns=True**. Colunas faltantes em qualquer DataFrame serão preenchidas com valores nulos.

In [33]:
# Add a constant column so that df_2 has one column df_embaralhado lacks.
df_2 = df_embaralhado.withColumn("quantidade", F.lit(1))
# allowMissingColumns=True lets the union proceed despite the schema mismatch:
# rows coming from df_embaralhado get null in "quantidade".
df_2.unionByName(df_embaralhado, allowMissingColumns=True)

cnpj_editora,data_lancamento,id,ean,isbn10,numero_paginas,preco,quantidade
54.317.982/0001-79,2011-09-19,102961160,4367115749184,0-06-594558-1,516,113.84,1.0
16.938.042/0001-08,2019-05-11,43636240,1239569754256,0-611-23680-X,371,207.67,1.0
57.624.038/0001-52,2003-08-03,52928059,5462162528221,1-109-29542-1,198,107.89,1.0
16.249.378/0001-63,2022-09-07,88482271,5713629047213,1-229-68842-0,632,143.95,1.0
90.425.178/0001-77,2011-07-25,28125446,2226757040245,0-657-66391-3,384,156.49,1.0
54.317.982/0001-79,2011-09-19,102961160,4367115749184,0-06-594558-1,516,113.84,
16.938.042/0001-08,2019-05-11,43636240,1239569754256,0-611-23680-X,371,207.67,
57.624.038/0001-52,2003-08-03,52928059,5462162528221,1-109-29542-1,198,107.89,
16.249.378/0001-63,2022-09-07,88482271,5713629047213,1-229-68842-0,632,143.95,
90.425.178/0001-77,2011-07-25,28125446,2226757040245,0-657-66391-3,384,156.49,
