In [1]:
import os

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Create (or reuse) the Spark session for the course. Eager evaluation is
# switched on so that a DataFrame left as the last expression of a cell is
# rendered as a table without calling .show().
spark = SparkSession.builder.appName("Curso de PySpark").config(
    "spark.sql.repl.eagerEval.enabled", True
).getOrCreate()

In [3]:
# Evaluate the session object so the notebook displays it.
spark

In [7]:
# Location of the LOGINS parquet dataset. Set the LOGINS_PARQUET_PATH
# environment variable to run the notebook on another machine; when it is not
# set, the original local path is used, so existing behaviour is unchanged.
path = os.environ.get(
    "LOGINS_PARQUET_PATH",
    "C:/Users/marlos.barros/Cursos/pyspark_na_pratica/DATASETS/LOGINS.parquet",
)

In [10]:
# Read the parquet dataset and keep only the two date columns that the
# examples below work with.
df = (
    spark.read
    .parquet(path)
    .select("data_de_nascimento", "data_cadastro")
)

In [11]:
# Evaluate the DataFrame so the notebook renders its rows (eager evaluation
# is enabled on the session).
df

data_de_nascimento,data_cadastro
2006-12-18,2023-02-26
1992-06-17,2023-02-16
1921-11-11,2023-01-02
2021-06-01,2023-01-08
1969-10-28,2023-02-14
1986-05-19,2023-03-07
2018-04-20,2023-01-13
1996-05-12,2023-02-04
2021-10-05,2023-03-02
1917-01-05,2023-02-21


### Adicionar e Subtrair Meses
- Função: **F.add_months()** adiciona ou subtrai um número de meses da data informada.
- Exemplo: Se a data de **cadastro** for **2025-03-15**, a coluna **"add_months+"** será **2025-04-15** e **"add_months-"** será **2025-02-15**.
- Utilização: É útil para calcular datas futuras ou passadas, como em cálculos de assinatura, vencimentos, etc.

In [72]:
# Shift data_cadastro by whole months: a positive count moves the date
# forward, a negative count moves it backward. Both columns are produced so
# the cell matches the documented example and the displayed result.
(
    df
    .withColumn("add_months+", F.add_months(F.col("data_cadastro"), 1))   # one month later
    .withColumn("add_months-", F.add_months(F.col("data_cadastro"), -1))  # one month earlier
)

data_de_nascimento,data_cadastro,add_months+,add_months-
2006-12-18,2023-02-26,2023-03-26,2023-01-26
1992-06-17,2023-02-16,2023-03-16,2023-01-16
1921-11-11,2023-01-02,2023-02-02,2022-12-02
2021-06-01,2023-01-08,2023-02-08,2022-12-08
1969-10-28,2023-02-14,2023-03-14,2023-01-14
1986-05-19,2023-03-07,2023-04-07,2023-02-07
2018-04-20,2023-01-13,2023-02-13,2022-12-13
1996-05-12,2023-02-04,2023-03-04,2023-01-04
2021-10-05,2023-03-02,2023-04-02,2023-02-02
1917-01-05,2023-02-21,2023-03-21,2023-01-21


### Data Atual e Timestamp Atual
- Função: **F.current_date()** retorna a data atual.
- Função: **F.current_timestamp()** retorna a data e hora atuais.
- Exemplo: Se hoje for **2025-03-17**, **"current_date"** será **2025-03-17** e **"current_timestamp"** será **2025-03-17 10:00:00** (horário fictício).
- Utilização: Essas funções são úteis para registrar quando uma operação foi feita ou calcular tempos de execução.

In [31]:
# Append the current date and the current timestamp to every row, then print
# the first 10 rows without truncating the cell contents.
df.select(
    "*",
    F.current_date().alias("current_date"),
    F.current_timestamp().alias("current_timestamp"),
).show(n=10, truncate=False)

+------------------+-------------+------------+--------------------------+
|data_de_nascimento|data_cadastro|current_date|current_timestamp         |
+------------------+-------------+------------+--------------------------+
|2006-12-18        |2023-02-26   |2025-03-18  |2025-03-18 16:21:17.515641|
|1992-06-17        |2023-02-16   |2025-03-18  |2025-03-18 16:21:17.515641|
|1921-11-11        |2023-01-02   |2025-03-18  |2025-03-18 16:21:17.515641|
|2021-06-01        |2023-01-08   |2025-03-18  |2025-03-18 16:21:17.515641|
|1969-10-28        |2023-02-14   |2025-03-18  |2025-03-18 16:21:17.515641|
|1986-05-19        |2023-03-07   |2025-03-18  |2025-03-18 16:21:17.515641|
|2018-04-20        |2023-01-13   |2025-03-18  |2025-03-18 16:21:17.515641|
|1996-05-12        |2023-02-04   |2025-03-18  |2025-03-18 16:21:17.515641|
|2021-10-05        |2023-03-02   |2025-03-18  |2025-03-18 16:21:17.515641|
|1917-01-05        |2023-02-21   |2025-03-18  |2025-03-18 16:21:17.515641|
+------------------+-----

### Adicionar Dias e Subtrair Dias
- Função: **F.date_add()** adiciona dias na coluna **data_cadastro**.
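- Exemplo: Se **data_cadastro** for **2025-03-01**, a coluna **"date_add+"** (15 dias) será **2025-03-16**. Outro exemplo, com **data_cadastro** = **2023-01-15**:
    - **date_add+: '2023-01-30'** (soma 15 dias)
    - **date_add-: '2022-12-31'** (subtrai 15 dias, passando **-15**)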
- Utilização: Usado em cálculos de prazos ou tempo decorrido.

In [73]:
# Add 15 days to data_cadastro.
# Passing a negative day count (e.g. -15) to F.date_add subtracts days instead.
df.select(
    "*",
    F.date_add("data_cadastro", 15).alias("date_add+"),
)

data_de_nascimento,data_cadastro,date_add+
2006-12-18,2023-02-26,2023-03-13
1992-06-17,2023-02-16,2023-03-03
1921-11-11,2023-01-02,2023-01-17
2021-06-01,2023-01-08,2023-01-23
1969-10-28,2023-02-14,2023-03-01
1986-05-19,2023-03-07,2023-03-22
2018-04-20,2023-01-13,2023-01-28
1996-05-12,2023-02-04,2023-02-19
2021-10-05,2023-03-02,2023-03-17
1917-01-05,2023-02-21,2023-03-08


### Subtrair Dias
- Função: **F.date_sub()** subtrai dias da coluna **data_cadastro**.
- Exemplo: Se **data_cadastro** for **2025-03-01** a coluna **"date_sub"** será **2025-02-14**.
- Utilização: Usado em cálculos de prazos ou tempo decorrido.

In [35]:
# Move data_cadastro 15 days into the past.
df.withColumn("date_sub", F.date_sub("data_cadastro", 15))

data_de_nascimento,data_cadastro,date_sub
2006-12-18,2023-02-26,2023-02-11
1992-06-17,2023-02-16,2023-02-01
1921-11-11,2023-01-02,2022-12-18
2021-06-01,2023-01-08,2022-12-24
1969-10-28,2023-02-14,2023-01-30
1986-05-19,2023-03-07,2023-02-20
2018-04-20,2023-01-13,2022-12-29
1996-05-12,2023-02-04,2023-01-20
2021-10-05,2023-03-02,2023-02-15
1917-01-05,2023-02-21,2023-02-06


### Formatação de Data
- Função: **F.date_format()** formata uma data em uma string conforme o padrão especificado.
- Exemplo: Se **data_de_nascimento** for **1993-12-09**, **"1-date_format /"** será **09/12/1993** e **"2-date_format -"** será **09-12-1993**.
- Utilização: É usado para ajustar a exibição da data em diferentes formatos.

In [107]:
# Render data_de_nascimento as text using three day-month-year patterns that
# differ only in the separator (slash, dash, space).
df.select(
    "*",
    *[
        F.date_format("data_de_nascimento", pattern).alias(column_name)
        for column_name, pattern in (
            ("1-date_format /", "dd/MM/yyyy"),   # e.g. 18/12/2006
            ("2-date_format -", "dd-MM-yyyy"),   # e.g. 18-12-2006
            ("3-date_format ''", "dd MM yyyy"),  # e.g. 18 12 2006
        )
    ],
)

data_de_nascimento,data_cadastro,1-date_format /,2-date_format -,3-date_format ''
2006-12-18,2023-02-26,18/12/2006,18-12-2006,18 12 2006
1992-06-17,2023-02-16,17/06/1992,17-06-1992,17 06 1992
1921-11-11,2023-01-02,11/11/1921,11-11-1921,11 11 1921
2021-06-01,2023-01-08,01/06/2021,01-06-2021,01 06 2021
1969-10-28,2023-02-14,28/10/1969,28-10-1969,28 10 1969
1986-05-19,2023-03-07,19/05/1986,19-05-1986,19 05 1986
2018-04-20,2023-01-13,20/04/2018,20-04-2018,20 04 2018
1996-05-12,2023-02-04,12/05/1996,12-05-1996,12 05 1996
2021-10-05,2023-03-02,05/10/2021,05-10-2021,05 10 2021
1917-01-05,2023-02-21,05/01/1917,05-01-1917,05 01 1917


**Dia da Semana e Mês Completo**
- Formata a data para incluir o nome do dia da semana ou o nome completo do mês.
- Exemplo: **"4-dia_da_semana"** seria algo como **Thu, 09-12-1993** e **"5-nome_do_mes_completo"** seria **09-December-1993** (os nomes do dia e do mês saem em inglês, como na saída abaixo).
- Utilização: Ideal para relatórios ou exibição mais legível de datas.

In [106]:
# Format data_de_nascimento with textual parts: abbreviated weekday,
# full month name, and abbreviated month name.
df.select(
    "*",
    F.date_format("data_de_nascimento", "E, dd-MM-yyyy").alias("4-dia_da_semana"),          # e.g. Mon, 18-12-2006
    F.date_format("data_de_nascimento", "dd-MMMM-yyyy").alias("5-nome_do_mes_completo"),    # e.g. 18-December-2006
    F.date_format("data_de_nascimento", "dd-MMM-yyyy").alias("6-nome_do_mes_incompleto"),   # e.g. 18-Dec-2006
)

data_de_nascimento,data_cadastro,4-dia_da_semana,5-nome_do_mes_completo,6-nome_do_mes_incompleto
2006-12-18,2023-02-26,"Mon, 18-12-2006",18-December-2006,18-Dec-2006
1992-06-17,2023-02-16,"Wed, 17-06-1992",17-June-1992,17-Jun-1992
1921-11-11,2023-01-02,"Fri, 11-11-1921",11-November-1921,11-Nov-1921
2021-06-01,2023-01-08,"Tue, 01-06-2021",01-June-2021,01-Jun-2021
1969-10-28,2023-02-14,"Tue, 28-10-1969",28-October-1969,28-Oct-1969
1986-05-19,2023-03-07,"Mon, 19-05-1986",19-May-1986,19-May-1986
2018-04-20,2023-01-13,"Fri, 20-04-2018",20-April-2018,20-Apr-2018
1996-05-12,2023-02-04,"Sun, 12-05-1996",12-May-1996,12-May-1996
2021-10-05,2023-03-02,"Tue, 05-10-2021",05-October-2021,05-Oct-2021
1917-01-05,2023-02-21,"Fri, 05-01-1917",05-January-1917,05-Jan-1917


In [None]:
### Diferença em Dias entre Datas (F.datediff)

In [71]:
# Number of days elapsed from data_de_nascimento up to the current date.
df.select(
    "*",
    F.datediff(F.current_date(), "data_de_nascimento").alias("datediff"),
)

data_de_nascimento,data_cadastro,datediff
2006-12-18,2023-02-26,6665
1992-06-17,2023-02-16,11962
1921-11-11,2023-01-02,37748
2021-06-01,2023-01-08,1386
1969-10-28,2023-02-14,20230
1986-05-19,2023-03-07,14183
2018-04-20,2023-01-13,2524
1996-05-12,2023-02-04,10537
2021-10-05,2023-03-02,1260
1917-01-05,2023-02-21,39519


### Extração de Componentes de Data
- Funções:
    - **F.dayofmonth()**: Extrai o dia do mês.
    - **F.dayofweek()**: Extrai o dia da semana (1 = Domingo, 7 = Sábado).
    - **F.weekofyear()**: Retorna a semana do ano.
    - **F.year(), F.month()**: Retorna o ano e mês.
    - **F.last_day()**: Retorna o último dia do mês.
    - **F.months_between()**: Calcula o número de meses entre duas datas.
    - **F.next_day()**: Retorna a próxima ocorrência de um dia da semana a partir da data fornecida.
    - **F.make_date()**: Cria uma nova data a partir de ano, mês e dia.
    - **F.to_date()**: Converte uma string em uma data.
---
- Exemplo: Se data_de_nascimento for **'1990-03-10'**:
    - **dayofmonth**: 10
    - **dayofweek**: 7 (Sábado)
    - **weekofyear**: 10
    - **year**: 1990
    - **month**: 3
    - **last_day**: 1990-03-31 (último dia de março)
    - **months_between**: Calcula o número de meses até a data atual.
    - **next_day**: A próxima segunda-feira após a data_cadastro.
    - **make_date**: 1993-12-09.
    - **to_date**: Converte a string '2008-01-01' em uma data.

In [108]:
# Date-component helpers applied to the two date columns. The trailing
# comments show the value produced for the first row
# (data_de_nascimento = 2006-12-18, data_cadastro = 2023-02-26).
df.select(
    "*",
    F.dayofmonth("data_de_nascimento").alias("1-dayofmonth"),    # 18
    F.dayofweek("data_de_nascimento").alias("2-dayofweek"),      # 2 (1 = Sunday, so 2 = Monday)
    F.weekofyear("data_de_nascimento").alias("3-weekofyear"),    # 51
    F.year("data_de_nascimento").alias("4-year"),                # 2006
    F.month("data_de_nascimento").alias("5-month"),              # 12
    F.last_day("data_de_nascimento").alias("6-last_day"),        # 2006-12-31
    # Measured against the current date, so the value changes with the run date.
    F.months_between(F.current_date(), "data_de_nascimento").alias("7-months_between"),  # 219.03225806
    F.next_day("data_cadastro", "Mon").alias("8-next_day"),      # 2023-02-27
    F.make_date(F.lit(1993), F.lit(12), F.lit(9)).alias("9-make_date"),  # 1993-12-09
    F.to_date(F.lit("2008-01-01")).alias("10-to_date"),          # 2008-01-01
)

data_de_nascimento,data_cadastro,1-dayofmonth,2-dayofweek,3-weekofyear,4-year,5-month,6-last_day,7-months_between,8-next_day,9-make_date,10-to_date
2006-12-18,2023-02-26,18,2,51,2006,12,2006-12-31,219.03225806,2023-02-27,1993-12-09,2008-01-01
1992-06-17,2023-02-16,17,4,25,1992,6,1992-06-30,393.06451613,2023-02-20,1993-12-09,2008-01-01
1921-11-11,2023-01-02,11,6,45,1921,11,1921-11-30,1240.25806452,2023-01-09,1993-12-09,2008-01-01
2021-06-01,2023-01-08,1,3,22,2021,6,2021-06-30,45.58064516,2023-01-09,1993-12-09,2008-01-01
1969-10-28,2023-02-14,28,3,44,1969,10,1969-10-31,664.70967742,2023-02-20,1993-12-09,2008-01-01
1986-05-19,2023-03-07,19,2,21,1986,5,1986-05-31,466.0,2023-03-13,1993-12-09,2008-01-01
2018-04-20,2023-01-13,20,6,16,2018,4,2018-04-30,82.96774194,2023-01-16,1993-12-09,2008-01-01
1996-05-12,2023-02-04,12,1,19,1996,5,1996-05-31,346.22580645,2023-02-06,1993-12-09,2008-01-01
2021-10-05,2023-03-02,5,3,40,2021,10,2021-10-31,41.4516129,2023-03-06,1993-12-09,2008-01-01
1917-01-05,2023-02-21,5,6,1,1917,1,1917-01-31,1298.4516129,2023-02-27,1993-12-09,2008-01-01
