In [2]:
from pyspark.sql import SparkSession
from dotenv import load_dotenv
import os
load_dotenv()

# Fail fast on missing connection settings: os.getenv() returns None for an
# unset variable, which would otherwise be interpolated into the JDBC URL as
# the literal text "None" and only surface later as a confusing connection error.
REQUIRED_ENV_VARS = ("DB_HOST", "DB_PORT", "DB_NAME", "DB_USER", "DB_PASSWORD")
missing_env_vars = [name for name in REQUIRED_ENV_VARS if os.getenv(name) is None]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_env_vars)
    )

# Location of the SQL Server JDBC driver jar. Can be overridden through the
# MSSQL_JDBC_JAR environment variable; the default is the original local path.
jdbc_jar_path = os.getenv("MSSQL_JDBC_JAR", "C:\\jars\\mssql-jdbc-12.10.0.jre11.jar")

# Local Spark session (4 worker threads, 4 shuffle partitions, 4 GB driver
# memory) with the SQL Server JDBC driver on the classpath.
spark = (
    SparkSession.builder
    .appName("SessionCentexETL")
    .master("local[4]")
    .config("spark.sql.shuffle.partitions", 4)
    .config("spark.driver.memory", "4g")
    .config("spark.jars", jdbc_jar_path)
    .getOrCreate()
)

# NOTE(review): encrypt=true together with trustServerCertificate=true encrypts
# the connection but skips validation of the server certificate. Confirm this
# is acceptable for the target environment.
jdbc_url = f"jdbc:sqlserver://{os.getenv('DB_HOST')}:{os.getenv('DB_PORT')};databaseName={os.getenv('DB_NAME')};encrypt=true;trustServerCertificate=true"

# Connection properties passed to spark.read.jdbc(); credentials come from the
# environment (loaded from .env by load_dotenv()) and are never hard-coded.
properties = {
    "user": os.getenv('DB_USER'),
    "password": os.getenv('DB_PASSWORD'),
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver"
}

In [15]:
from pyspark.sql import functions as F

# Pushdown subquery executed by SQL Server: distinct (AgendaNombre, ID_Agenda)
# pairs from Doc_Compra joined to Agenda, restricted to IDs whose LEN() is 8 or 11.
query_cl = "(select ag.AgendaNombre, dc.ID_Agenda from Doc_Compra dc inner join Agenda ag on ag.ID_Agenda = dc.ID_Agenda where len(dc.ID_Agenda) = 11 or len(dc.ID_Agenda) = 8 group by ag.AgendaNombre, dc.ID_Agenda) AS query_clean"

# Cache the result: Spark is lazy, so without caching each action below (one
# show and three counts) would re-run the JDBC query against the database, and
# the printed counts could come from different snapshots of the tables.
df_agenda_prueba = spark.read.jdbc(url=jdbc_url, table=query_cl, properties=properties).cache()

# Single definition of "valid ID": 8 or 11 characters long and digits only.
# coalesce(..., False) maps a NULL ID_Agenda (for which the predicate evaluates
# to NULL) to "not valid", so the two filters below always partition the input
# and valid + not valid == total.
es_id_valido = F.coalesce(
    F.length("ID_Agenda").isin([8, 11]) & F.col("ID_Agenda").rlike("^[0-9]+$"),
    F.lit(False),
)

df_validos = df_agenda_prueba.filter(es_id_valido)
df_no_validos = df_agenda_prueba.filter(~es_id_valido)

# Names that appear with more than one distinct valid ID.
df_duplicados = df_validos.groupBy("AgendaNombre").count().filter(F.col("count") > 1)

df_duplicados.show(truncate=False)

print("Total registros:", df_agenda_prueba.count())
print("Registros válidos:", df_validos.count())
print("Registros no válidos:", df_no_validos.count())


+-------------------------------+-----+
|AgendaNombre                   |count|
+-------------------------------+-----+
|CHAMBILLA PEREZ, YHONATAN      |2    |
|AZERRAD TENSERA, WILLIAM MOISES|2    |
|ARIMUYA MORALES, JULIO         |2    |
|HUAMANI CALSIN, JESSELA        |2    |
+-------------------------------+-----+

Total registros: 422
Registros válidos: 418
Registros no válidos: 4


In [13]:
import pandas as pd
from pathlib import Path

# Export both partitions to CSV files without a header row.
# The output directory is created if missing so the cell also works on a
# fresh checkout instead of failing inside to_csv().
output_dir = Path("DevueltosCSV")
output_dir.mkdir(parents=True, exist_ok=True)

# toPandas() already returns a pandas DataFrame; no extra wrapping is needed.
df_pd_val = df_validos.toPandas()
df_pd_noval = df_no_validos.toPandas()

# header=False is the documented way to omit the header row (the previous
# header=None only worked because None is falsy). utf-8-sig writes a BOM at
# the start of each file.
df_pd_val.to_csv(output_dir / "validos.csv", index=False, encoding='utf-8-sig', header=False)
df_pd_noval.to_csv(output_dir / "novalidos.csv", index=False, encoding='utf-8-sig', header=False)

In [8]:
# Display the rows whose ID_Agenda failed validation, without truncating
# column values. In the recorded output these are four IDs that start with a
# two-letter alphabetic prefix (DE / GB), so they fail the digits-only check.
df_no_validos.show(truncate=False)

+-------------------------------+-----------+
|AgendaNombre                   |ID_Agenda  |
+-------------------------------+-----------+
|BRUKER NANO GMBH               |DE200443246|
|BRUKER AXS SE                  |DE812037551|
|PEAK SCIENTIFIC INSTRUMENTS LTD|GB699501784|
|BRUKER DALTONIK GMBH           |DE114404287|
+-------------------------------+-----------+

