In [1]:
import os
import json
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import IntegerType, DecimalType
from pyspark.sql.functions import udf, current_date,  to_date, date_format, lit
from pyspark.sql.types import BinaryType
import uuid



In [2]:
# --- Notebook configuration ---------------------------------------------------
# Set JAVA_HOME for this process and prepend its bin directory to PATH.
os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jdk1.8.0_202'
os.environ['PATH'] = os.environ['JAVA_HOME'] + r'\bin;' + os.environ['PATH']

# Oracle JDBC driver jar and the database connection URL.
driver_path = "C:/ocDB/WINDOWS.X64_193000_db_home/jdbc/lib/ojdbc8.jar"
connection_url = "jdbc:oracle:thin:@//localhost:1521/orcl"

# Database credentials. The ORACLE_USER / ORACLE_PASSWORD environment variables
# take precedence so real secrets do not have to live in the notebook; the
# fallbacks keep the previous local-development values.
# NOTE(review): the fallback account connects as SYSDBA — confirm a dedicated,
# less privileged account cannot be used instead.
user = os.environ.get("ORACLE_USER", "sys as SYSDBA")
password = os.environ.get("ORACLE_PASSWORD", "root")

# Input directory and the file-name prefix/suffix a file must have to be picked up.
directory = "Volumes/dev/tiintegracao/team/cartoes/cext"
start_pattern = "CEXT_756"
end_extension = ".CCB"

In [3]:
# Create (or reuse) the Spark session with the Oracle JDBC jar on the driver
# classpath.
spark = (
    SparkSession.builder
    .appName("Exemplo de Spark JDBC com Oracle")
    .config("spark.driver.extraClassPath", driver_path)
    .getOrCreate()
)

In [4]:
def add_uuid(df, col_name):
    """Return ``df`` with a binary column ``col_name`` holding a random UUID per row.

    Each value is the 16-byte form of ``uuid.uuid4()``, produced by a Python UDF.

    The UDF is marked non-deterministic so Spark's optimizer does not treat it
    as a pure expression that may be duplicated or re-ordered. Spark still
    re-executes the UDF every time the resulting DataFrame is evaluated, so
    separate actions (e.g. two ``show()`` calls) see different UUIDs unless
    the DataFrame is persisted first.

    Args:
        df: DataFrame to extend.
        col_name: Name of the column to add.

    Returns:
        A new DataFrame with the extra column; ``df`` itself is not modified.
    """
    def uuid_bytes():
        return uuid.uuid4().bytes

    # Without asNondeterministic() Spark assumes the same input always gives
    # the same output, which is false for a random generator.
    udf_uuid_bytes = udf(uuid_bytes, BinaryType()).asNondeterministic()

    return df.withColumn(col_name, udf_uuid_bytes())

In [5]:
def queryData(query):
    """Run ``query`` against the Oracle database over JDBC and return a DataFrame.

    The connection settings come from the notebook-level ``connection_url``,
    ``user`` and ``password`` variables.
    """
    jdbc_options = (
        ("url", connection_url),
        ("driver", "oracle.jdbc.OracleDriver"),
        ("query", query),
        ("user", user),
        ("password", password),
    )

    reader = spark.read.format("jdbc")
    for option_name, option_value in jdbc_options:
        reader = reader.option(option_name, option_value)

    return reader.load()

In [6]:
def process_file(filepath, file_name):
    """Parse the lines between the first and last line of a file into dicts.

    The first and last lines of the file are skipped. Every remaining line
    becomes one dict with:

    * ``card_number`` - characters 6..24 of the line, stripped
    * ``date``        - characters 34..41 of the line, stripped
    * ``line_content``- the raw line exactly as read (line ending included)
    * ``file_name``   - the ``file_name`` argument
    * ``line_number`` - 1-based position among the parsed lines

    Returns:
        A list of dicts, one per parsed line (empty when the file has fewer
        than three lines).
    """
    with open(filepath, 'r') as handle:
        detail_lines = handle.readlines()[1:-1]

    return [
        {
            "card_number": raw_line[6:25].strip(),
            "date": raw_line[34:42].strip(),
            "line_content": raw_line,
            "file_name": file_name,
            "line_number": position,
        }
        for position, raw_line in enumerate(detail_lines, start=1)
    ]

In [7]:

def filter_files(directory, start_pattern, end_extension):
    """Return the names of the well-formed files found in ``directory``.

    A file qualifies when all of the following hold:

    * its name starts with ``start_pattern`` and ends with ``end_extension``;
    * its first line starts with ``"CEXT0"``;
    * its last line starts with ``"CEXT9"``.

    Empty files are skipped instead of raising ``IndexError``.

    Args:
        directory: Directory to scan (not recursive).
        start_pattern: Required file-name prefix.
        end_extension: Required file-name suffix.

    Returns:
        A list of file names (not full paths), in ``os.listdir`` order.
    """
    filtered_files = []

    for filename in os.listdir(directory):
        if not (filename.startswith(start_pattern) and filename.endswith(end_extension)):
            continue

        filepath = os.path.join(directory, filename)
        with open(filepath, 'r') as file:
            lines = file.readlines()

        # An empty file has no first/last line to inspect, so it cannot qualify.
        if lines and lines[0].startswith("CEXT0") and lines[-1].startswith("CEXT9"):
            filtered_files.append(filename)

    return filtered_files

In [8]:
# Names of the files in `directory` that match the name pattern and whose first
# and last lines carry the expected CEXT0 / CEXT9 prefixes.
files_list = filter_files(directory, start_pattern, end_extension)

In [9]:
# Show which files were selected for processing.
display(files_list)

['CEXT_7562011_20240125_0002504.CCB']

In [10]:
# Parse every selected file and gather the parsed lines of all files in one
# flat list of dicts.
all_rows = [
    row
    for file in files_list
    for row in process_file(directory + '/' + file, file)
]

In [11]:
# Write the parsed rows to data.json in the current working directory.
# NOTE(review): every row carries the card number and the raw line in clear
# text — confirm this dump is still needed.
with open("data.json", "w") as json_file:
    json_file.write(json.dumps(all_rows))

In [12]:
# Build a Spark DataFrame from the parsed rows, then cast the card number text
# to a 16-digit decimal with no fractional part.
df_transactions = spark.createDataFrame(all_rows)
df_transactions = df_transactions.withColumn(
    "card_number",
    df_transactions.card_number.cast(DecimalType(16, 0)),
)

In [13]:
# Preview the parsed lines after the card-number cast.
df_transactions.show()

+----------------+--------+--------------------+--------------------+-----------+
|     card_number|    date|           file_name|        line_content|line_number|
+----------------+--------+--------------------+--------------------+-----------+
|5151070044381239|20230612|CEXT_7562011_2024...|75600051510700443...|          1|
|5151070044381239|20230612|CEXT_7562011_2024...|75600051510700443...|          2|
|5151070044381239|20230612|CEXT_7562011_2024...|75600051510700443...|          3|
|5151070432093151|20230612|CEXT_7562011_2024...|75600051510704320...|          4|
|5151070432093151|20230612|CEXT_7562011_2024...|75600051510704320...|          5|
|5151070432093151|20230612|CEXT_7562011_2024...|75600051510704320...|          6|
|5151070432093151|20230612|CEXT_7562011_2024...|75600051510704320...|          7|
|5151940230409696|20230612|CEXT_7562011_2024...|75600051519402304...|          8|
|5151940230409696|20230612|CEXT_7562011_2024...|75600051519402304...|          9|
|515107004438701

In [14]:
# Load the full card table; it is joined on NRCARTAO further down to keep only
# lines whose card number is known.
# Also fetch IDARQUIVO_CONTROLE from the control table for NRARQUIVO = 2.
# NOTE(review): the value 2 is hard-coded — confirm it should not be derived
# from the file being processed.
df_cards = queryData('SELECT * FROM CARTOES.TB_CARTAO')
df_control = queryData("""
        SELECT IDARQUIVO_CONTROLE 
        FROM CARTOES.TB_ARQUIVO_CONTROLE 
        WHERE NRARQUIVO = 2
        """)

In [15]:
# Keep only the lines whose card number exists in the card table.
# A left semi join acts as a pure existence filter: it returns columns from
# df_transactions only and never multiplies a line when NRCARTAO matches more
# than one card row, which an inner join would do.
df_transactions = df_transactions.join(
    df_cards,
    df_transactions["card_number"] == df_cards["NRCARTAO"],
    "left_semi",
).select("date", "card_number", "file_name", "line_content", "line_number")

In [16]:
# Add the registration date and the identifiers needed by the target tables.
#   date_now   - today's date, rendered as dd-MM-yyyy text
#   date       - parsed from yyyyMMdd, re-rendered as dd-MM-yyyy text
#   IDARQUIVO  - one random 16-byte id generated once here and shared by all rows
#   CDSITUACAO - constant 1
# NOTE(review): when files_list holds several files they all receive the same
# IDARQUIVO — confirm that is intended.
df_transactions = (
    df_transactions
    .withColumn("date_now", date_format(current_date(), "dd-MM-yyyy"))
    .withColumn(
        "date",
        date_format(to_date(df_transactions["date"], "yyyyMMdd"), "dd-MM-yyyy"),
    )
    .withColumn('IDARQUIVO', lit(uuid.uuid4().bytes))
    .withColumn('CDSITUACAO', lit(1))
)

# A separate random 16-byte id for every line.
df_transactions = add_uuid(df_transactions, "IDARQUIVO_LINHA")

In [17]:
# Preview the enriched lines (formatted dates, ids and status code).
df_transactions.show()

+----------+----------------+--------------------+--------------------+-----------+----------+--------------------+----------+--------------------+
|      date|     card_number|           file_name|        line_content|line_number|  date_now|           IDARQUIVO|CDSITUACAO|     IDARQUIVO_LINHA|
+----------+----------------+--------------------+--------------------+-----------+----------+--------------------+----------+--------------------+
|12-06-2023|5151070044381239|CEXT_7562011_2024...|75600051510700443...|          1|06-06-2024|[51 5B 1D F1 53 3...|         1|[DE EC E0 B5 38 8...|
|12-06-2023|5151070044381239|CEXT_7562011_2024...|75600051510700443...|          2|06-06-2024|[51 5B 1D F1 53 3...|         1|[8D D2 F0 25 94 C...|
|12-06-2023|5151070044381239|CEXT_7562011_2024...|75600051510700443...|          3|06-06-2024|[51 5B 1D F1 53 3...|         1|[40 F2 1F 73 97 B...|
|12-06-2023|5151070044387015|CEXT_7562011_2024...|75600051510700443...|         10|06-06-2024|[51 5B 1D F1 53 3.

In [18]:
# File-level frame: keep the file name, file date, registration date and
# IDARQUIVO, renamed to the target column names. It still has one row per
# line at this point.
df_archive = (
    df_transactions
    .withColumnRenamed('file_name', 'NMARQUIVO')
    .withColumnRenamed('date', 'DTARQUIVO')
    .withColumnRenamed('date_now', 'DHREGISTRO')
    .drop("card_number", "line_content", "IDARQUIVO_LINHA", "line_number", "CDSITUACAO")
)

# Line-level frame: one row per line, keeping IDARQUIVO, CDSITUACAO and
# IDARQUIVO_LINHA alongside the renamed content, date and line number.
# Parentheses replace the backslash continuations; the original chain ended
# with a dangling backslash.
df_archive_line = (
    df_transactions
    .drop("card_number", "file_name")
    .withColumnRenamed('line_content', 'DSCONTEUDO')
    .withColumnRenamed('date', 'DTPROCESSO')
    .withColumnRenamed('line_number', 'NRLINHA')
    .withColumnRenamed('date_now', 'DHREGISTRO')
)

In [19]:
# Preview the line-level frame that will be appended to the line table.
df_archive_line.show()

+----------+--------------------+-------+----------+--------------------+----------+--------------------+
|DTPROCESSO|          DSCONTEUDO|NRLINHA|DHREGISTRO|           IDARQUIVO|CDSITUACAO|     IDARQUIVO_LINHA|
+----------+--------------------+-------+----------+--------------------+----------+--------------------+
|12-06-2023|75600051510700443...|      1|06-06-2024|[51 5B 1D F1 53 3...|         1|[DA ED D2 62 CD C...|
|12-06-2023|75600051510700443...|      2|06-06-2024|[51 5B 1D F1 53 3...|         1|[8A 2C B0 0C 20 7...|
|12-06-2023|75600051510700443...|      3|06-06-2024|[51 5B 1D F1 53 3...|         1|[04 34 B5 37 F0 2...|
|12-06-2023|75600051510700443...|     10|06-06-2024|[51 5B 1D F1 53 3...|         1|[0B 73 72 C0 C6 3...|
+----------+--------------------+-------+----------+--------------------+----------+--------------------+



In [20]:
# Attach IDARQUIVO_CONTROLE to every archive row. A cross join produces one
# output row per (archive row, control row) pair, so an empty df_control
# yields an empty df_archive and several control rows multiply the archive rows.
df_archive = df_archive.crossJoin(df_control)

In [21]:
# df_archive still has one row per line; collapse identical rows so only the
# distinct file-level combinations remain, then preview the result.
df_archive = df_archive.distinct()
df_archive.show()

+----------+--------------------+----------+--------------------+--------------------+
| DTARQUIVO|           NMARQUIVO|DHREGISTRO|           IDARQUIVO|  IDARQUIVO_CONTROLE|
+----------+--------------------+----------+--------------------+--------------------+
|12-06-2023|CEXT_7562011_2024...|06-06-2024|[51 5B 1D F1 53 3...|[FF 95 CE C4 CE 7...|
+----------+--------------------+----------+--------------------+--------------------+



In [22]:
# Append the file-level rows to CARTOES.TB_ARQUIVO.
# The driver class is the same one queryData uses for reads
# (oracle.jdbc.OracleDriver) rather than the older oracle.jdbc.driver alias.
df_archive.write.jdbc(
    url=connection_url,
    table="CARTOES.TB_ARQUIVO",
    mode="append",
    properties={"user": user, "password": password, "driver": "oracle.jdbc.OracleDriver"},
)

In [23]:
# Append the line-level rows to CARTOES.TB_ARQUIVO_LINHA.
# The driver class is the same one queryData uses for reads
# (oracle.jdbc.OracleDriver) rather than the older oracle.jdbc.driver alias.
# NOTE(review): IDARQUIVO_LINHA comes from a random UDF that is re-evaluated on
# every action, so the ids written here differ from those shown by earlier
# show() calls — persist the DataFrame first if they must match.
df_archive_line.write.jdbc(
    url=connection_url,
    table="CARTOES.TB_ARQUIVO_LINHA",
    mode="append",
    properties={"user": user, "password": password, "driver": "oracle.jdbc.OracleDriver"},
)