In [79]:
import os

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [80]:
# Create the session for this notebook (getOrCreate reuses an existing one).
# Eager evaluation is switched on so that a cell ending in a DataFrame
# expression renders its rows without an explicit .show().
spark = SparkSession.builder.appName("Curso de PySpark").config(
    "spark.sql.repl.eagerEval.enabled", True
).getOrCreate()

In [81]:
# Bare expression: displays the notebook representation of the SparkSession
# created in the previous cell.
spark

In [82]:
# Location of the COMPRAS parquet dataset.
# The original value is an absolute path on one specific machine, so the
# notebook could not run anywhere else. Set the COMPRAS_PARQUET_PATH
# environment variable to point at your own copy; when it is unset the
# original path is used, so existing setups keep working unchanged.
path = os.environ.get(
    "COMPRAS_PARQUET_PATH",
    "C:/Users/marlos.barros/Cursos/pyspark_na_pratica/DATASETS/COMPRAS.parquet",
)

In [83]:
# Read the parquet dataset at `path` into a DataFrame.
df = spark.read.parquet(path)

In [84]:
# Bare expression: with spark.sql.repl.eagerEval.enabled set on the session,
# the notebook renders the DataFrame's rows as a table.
df

id,cartao_data_expiracao,cartao_numero,cartao_bandeira,cartao_cvc,codigo_transacao_bancaria,data,hora,ipv4,ipv6,cep_entrega,cd_livro,cd_cliente
12389,11/25,5500804500517692,Discover,959,GB98MPIH622108593...,2021-07-24,03:21:28,62.145.31.164,b23d:58a2:9eff:36...,36629-219,30334762,3339828
12476,01/31,4609489235873,VISA 16 digit,6979,GB79GAVL233010819...,2021-08-23,11:15:52,185.150.224.52,1e2c:f641:49d1:5a...,17012-747,13721981,7624624
12478,07/23,30072722359174,American Express,689,GB51SZOU538484531...,2020-01-27,17:08:10,40.179.153.24,9bcd:ee0c:af68:33...,25921298,19458805,8703114
12534,09/29,4561935154572,VISA 16 digit,353,GB05UNEX021466511...,2021-07-15,07:39:25,86.20.51.194,be57:8f94:132:1cb...,73799370,14347542,7799936
12549,01/31,3541220668415122,JCB 15 digit,967,GB97RCCC581942620...,2020-07-22,23:56:14,55.189.220.65,a1fd:9dd:115f:9d5...,50727-454,20215846,6703678
12574,06/24,4155061214506542,JCB 16 digit,362,GB93YATI197164292...,2020-05-19,10:47:09,212.26.253.42,1815:dc0e:c557:96...,67662-182,10325500,6273720
12579,01/27,30554213514227,Maestro,9847,GB54PQLN799705859...,2021-04-15,10:36:56,168.197.230.167,7f3b:4ada:4c62:3f...,61193-110,53479015,6977964
12648,05/24,347386847428278,VISA 16 digit,823,GB90MYZL510193283...,2021-06-19,13:07:48,146.108.131.11,c52d:53d8:371a:34...,10059-723,12331534,6616715
12675,04/29,4534805013764,Mastercard,384,GB11XMUH824876351...,2021-07-11,07:55:08,148.155.72.234,b290:d2b7:6a58:86...,79292884,35940339,649001
12698,12/31,4450657280386776,American Express,247,GB79ECRO017868447...,2021-08-11,12:06:48,190.135.5.172,7cbc:4554:7f5d:65...,93609312,58320651,4523531


### Pivot Simples com Contagem
- Função: A operação de **pivot()** transforma cada valor único da coluna **mes** em uma nova coluna; cada célula traz a contagem de compras daquela bandeira de cartão naquele mês.

In [85]:
# Derive the full month name ("MMMM") from the purchase date, then count rows
# per card brand, producing one output column per distinct month value.
(
    df
    .withColumn("mes", F.date_format("data", "MMMM"))
    .groupBy("cartao_bandeira")
    .pivot("mes")
    .count()
)

cartao_bandeira,April,August,December,February,January,July,June,March,May,November,October,September
VISA 16 digit,611,632,609,772,878,656,620,791,609,616,685,574
VISA 13 digit,299,335,292,350,420,324,309,406,314,302,286,308
Discover,277,339,319,406,454,329,297,386,326,301,306,310
Diners Club / Car...,300,304,322,395,397,295,331,367,291,332,328,307
American Express,324,303,332,353,385,287,291,376,346,310,309,313
Maestro,319,302,318,374,424,281,324,378,324,297,331,276
Mastercard,295,289,325,359,415,307,337,395,310,298,337,294
JCB 16 digit,633,618,620,756,835,641,596,708,618,623,615,604
VISA 19 digit,315,328,289,388,437,352,315,385,331,287,324,325
JCB 15 digit,311,272,306,374,386,342,310,401,337,269,332,302


### Pivot Específico para Meses de Janeiro e Fevereiro
- Função: Semelhante ao exemplo anterior, mas aqui o **pivot** é aplicado apenas para os meses de **January** e **February**, especificados como uma lista.
- Apenas esses meses serão transformados em colunas no DataFrame final.

In [86]:
# Pivot restricted to an explicit list of month values: only "January" and
# "February" become columns; rows for other months are not counted.
(
    df
    .withColumn("mes", F.date_format("data", "MMMM"))
    .groupBy("cartao_bandeira")
    .pivot("mes", values=["January", "February"])
    .count()
)

cartao_bandeira,January,February
VISA 16 digit,878,772
VISA 13 digit,420,350
Discover,454,406
Diners Club / Car...,397,395
American Express,385,353
Maestro,424,374
Mastercard,415,359
JCB 16 digit,835,756
VISA 19 digit,437,388
JCB 15 digit,386,374


### Salvando o Resultado do Pivot
- Função: O mesmo processo do exemplo anterior (pivot restrito a **January** e **February**) é aplicado, mas o resultado é armazenado em um novo DataFrame, **df_2**, para uso posterior.

In [87]:
# Keep the January/February pivot (row count per card brand) in df_2 so that
# later cells can reuse it.
df_2 = (
    df
    .withColumn("mes", F.date_format("data", "MMMM"))
    .groupBy("cartao_bandeira")
    .pivot("mes", values=["January", "February"])
    .count()
)

### Unpivot (Conversão para Formato Longo)
- Função: O **stack()** é utilizado para **"empilhar"** as colunas **January** e **February** em linhas (operação de *unpivot*), gerando duas novas colunas: **mes** (rótulo do mês, aqui `"Jan"` e `"Fev"`) e **valor** (a contagem que estava nas colunas January e February).
- Neste exemplo, cada linha de **df_2** (uma por bandeira de cartão) gera duas linhas no resultado: uma para **January** e outra para **February**, cada qual com sua respectiva contagem.

In [88]:
# Unpivot: stack(2, ...) emits two (mes, valor) rows for every row of df_2,
# one labelled "Jan" carrying the January count and one labelled "Fev"
# carrying the February count.
(
    df_2
    .selectExpr(
        "cartao_bandeira",
        'stack(2, "Jan", January, "Fev", February) as (mes, valor)',
    )
)

cartao_bandeira,mes,valor
VISA 16 digit,Jan,878
VISA 16 digit,Fev,772
VISA 13 digit,Jan,420
VISA 13 digit,Fev,350
Discover,Jan,454
Discover,Fev,406
Diners Club / Car...,Jan,397
Diners Club / Car...,Fev,395
American Express,Jan,385
American Express,Fev,353
