In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, isnull, lit, length, explode, count, upper, lower, regexp_replace, regexp_extract

In [4]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by every data-quality check below.
spark = (
    SparkSession.builder
    .appName("Data Quality Check clientes_opt")
    .getOrCreate()
)

In [5]:
# Load the raw customer opt-in file (JSON) into a DataFrame.
df = spark.read.format("json").load("/app/data/raw/clientes_opt.json")


                                                                                

In [6]:
# Print the schema of the loaded DataFrame (column names, types, nullability).
df.printSchema()

root
 |-- b_call: boolean (nullable = true)
 |-- b_email: boolean (nullable = true)
 |-- b_push: boolean (nullable = true)
 |-- b_sms: boolean (nullable = true)
 |-- v_id_cli: string (nullable = true)



In [7]:
# Preview the first 5 rows without truncating long values.
df.show(5, truncate=False)

                                                                                

+------+-------+------+-----+--------------------+
|b_call|b_email|b_push|b_sms|v_id_cli            |
+------+-------+------+-----+--------------------+
|true  |true   |true  |true |010FC87179A2C7940661|
|NULL  |true   |NULL  |true |04B2DB249A9623A2371B|
|true  |true   |false |true |0248D7A8AA7D752A2AF9|
|false |false  |false |false|02ABD1DDB59E705FCE49|
|NULL  |true   |NULL  |NULL |050C08C7789F8BF18F10|
+------+-------+------+-----+--------------------+
only showing top 5 rows



In [8]:
# Total number of rows in the dataset.
df.count()

76623

In [9]:
# List every column together with its data type as (name, type) pairs.
df.dtypes

[('b_call', 'boolean'),
 ('b_email', 'boolean'),
 ('b_push', 'boolean'),
 ('b_sms', 'boolean'),
 ('v_id_cli', 'string')]

In [10]:
# Count the NULL values in every column.
# `when` without `otherwise` yields NULL for rows that do not match, and `count`
# ignores NULLs, so only the rows where the column is NULL are tallied.
df.select(
    *(
        count(when(col(name).isNull(), lit(1))).alias(name)
        for name in df.columns
    )
).show()

+------+-------+------+-----+--------+
|b_call|b_email|b_push|b_sms|v_id_cli|
+------+-------+------+-----+--------+
| 43708|  18283| 41144|20099|       0|
+------+-------+------+-----+--------+



In [12]:
# Distribution of the string length of the customer id
(
    df.select(length("v_id_cli").alias("len"))
    .groupBy("len")
    .count()
    .show()
)

# Number of distinct customer ids (compare against the total row count)
df.select("v_id_cli").dropDuplicates().count()

                                                                                

+---+-----+
|len|count|
+---+-----+
| 20|76623|
+---+-----+



                                                                                

76623

In [13]:
import re

# Count the ids that are NOT exactly 20 hexadecimal characters (0-9, a-f, A-F).
# For a NULL id `rlike` evaluates to NULL, and `filter` drops rows whose predicate
# is NULL, so NULL ids are tested explicitly and counted as invalid as well.
df.filter(
    col("v_id_cli").isNull() | ~col("v_id_cli").rlike("^[a-fA-F0-9]{20}$")
).count()


0

In [14]:
from pyspark.sql.functions import col, regexp_extract

# Check whether the ids contain letters, digits, or both.
# regexp_extract returns the first matching character itself (not a boolean flag),
# so "tem_letra" holds the first letter and "tem_numero" the first digit of each id.
df.select(
    "v_id_cli",
    regexp_extract(col("v_id_cli"), "[a-zA-Z]", 0).alias("tem_letra"),
    regexp_extract(col("v_id_cli"), "[0-9]", 0).alias("tem_numero"),
).show(n=10)


+--------------------+---------+----------+
|            v_id_cli|tem_letra|tem_numero|
+--------------------+---------+----------+
|010FC87179A2C7940661|        F|         0|
|04B2DB249A9623A2371B|        B|         0|
|0248D7A8AA7D752A2AF9|        D|         0|
|02ABD1DDB59E705FCE49|        A|         0|
|050C08C7789F8BF18F10|        C|         0|
|044D8241C0BA9EB8A2BD|        D|         0|
|0236AE95865254140BEC|        A|         0|
|04E8FC308B3A1CC4CC90|        E|         0|
|027B23C4BD585D29DBA9|        B|         0|
|016BC87B3EFE0230E53A|        B|         0|
+--------------------+---------+----------+
only showing top 10 rows



In [None]:
#df.write.mode("overwrite").option("header", True).csv("/app/data/raw/clientes_opt.csv")


                                                                                

In [15]:
# Stop the Spark session at the end of the notebook.
spark.stop()
