# Silver Layer

In [0]:
from pyspark.sql.functions import col, when, current_date

## Chargement des secrets

In [0]:
# Read the Azure storage credentials from the Databricks secret scope so that
# no account name or key is hard-coded in the notebook.
SECRET_SCOPE = "azure-storage"

storage_account_name = dbutils.secrets.get(
    scope=SECRET_SCOPE,
    key="storage-account-name",
)
storage_account_key = dbutils.secrets.get(
    scope=SECRET_SCOPE,
    key="storage-account-key",
)

## Chargement des données brutes

In [0]:
# Load the raw (bronze) transactions table and preview it.
BRONZE_TABLE = "bronze_bank_transactions"

df_bronze = spark.table(BRONZE_TABLE)
display(df_bronze)

TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP_Address,MerchantID,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate
TX000001,AC00128,14.09,2023-04-11T16:29:14Z,Debit,San Diego,D000380,162.198.218.92,M015,ATM,70,Doctor,81,1,5112.21,2024-11-04T08:08:08Z
TX000002,AC00455,376.24,2023-06-27T16:44:19Z,Debit,Houston,D000051,13.149.61.4,M052,ATM,68,Doctor,141,1,13758.91,2024-11-04T08:09:35Z
TX000003,AC00019,126.29,2023-07-10T18:16:08Z,Debit,Mesa,D000235,215.97.143.157,M009,Online,19,Student,56,1,1122.35,2024-11-04T08:07:04Z
TX000004,AC00070,184.5,2023-05-05T16:32:11Z,Debit,Raleigh,D000187,200.13.225.150,M002,Online,26,Student,25,1,8569.06,2024-11-04T08:09:06Z
TX000005,AC00411,13.45,2023-10-16T17:51:24Z,Credit,Atlanta,D000308,65.164.3.100,M091,Online,26,Student,198,1,7429.4,2024-11-04T08:06:39Z
TX000006,AC00393,92.15,2023-04-03T17:15:01Z,Debit,Oklahoma City,D000579,117.67.192.211,M054,ATM,18,Student,172,1,781.68,2024-11-04T08:06:36Z
TX000007,AC00199,7.08,2023-02-15T16:36:48Z,Credit,Seattle,D000241,140.212.253.222,M019,ATM,37,Doctor,139,1,13316.71,2024-11-04T08:10:09Z
TX000008,AC00069,171.42,2023-05-08T17:47:59Z,Credit,Indianapolis,D000500,92.214.76.157,M020,Branch,67,Retired,291,1,2796.24,2024-11-04T08:10:55Z
TX000009,AC00135,106.23,2023-03-21T16:59:46Z,Credit,Detroit,D000690,24.148.92.177,M035,Branch,51,Engineer,86,1,9095.14,2024-11-04T08:11:14Z
TX000010,AC00385,815.96,2023-03-31T16:06:57Z,Debit,Nashville,D000199,32.169.88.41,M007,ATM,55,Doctor,120,1,1021.88,2024-11-04T08:06:32Z


Chargement de la table de correspondance ville/État (États-Unis)

In [0]:
# Load the City -> State lookup table used later to enrich transactions.
CITY_STATE_TABLE = "city_state_mapping"

df_city_state = spark.table(CITY_STATE_TABLE)
display(df_city_state)

City,State
Austin,Texas
Oklahoma City,Oklahoma
Memphis,Tennessee
Fort Worth,Texas
Detroit,Michigan
Jacksonville,Florida
Tucson,Arizona
Colorado Springs,Colorado
San Jose,California
Los Angeles,California


## Traitement des données

Gestion des valeurs nulles et manquantes

In [0]:
# Default values substituted for nulls, keyed by column name.
null_defaults = {
    "TransactionAmount": 0,
    "Location": "Unknown",
    "TransactionType": "Unknown",
}

# Start the silver frame from bronze with nulls replaced in the columns above;
# every other column is left untouched.
df_silver = df_bronze.na.fill(null_defaults)

Suppression des lignes dupliquées

In [0]:
# Remove rows that are exact duplicates across every column.
df_silver = df_silver.distinct()

Filtre sur les valeurs aberrantes

In [0]:
# Outlier filter, step 1: keep only strictly positive amounts.
# Rows whose amount is 0 (including nulls previously replaced by 0) are removed.
df_silver = df_silver.filter(col("TransactionAmount") > 0)

# Outlier filter, step 2: discard transactions dated after today.
# The comparison is made at day granularity. Comparing a timestamp directly with
# current_date() compares against today's midnight, which would drop every
# transaction that happened earlier today.
# Rows with a null TransactionDate do not satisfy the predicate and are dropped.
# NOTE(review): assumes TransactionDate is a timestamp or an ISO-8601 string —
# confirm against the bronze table schema.
df_silver = df_silver.filter(col("TransactionDate").cast("date") <= current_date())

Ajout d'une colonne de catégorie de transaction (« High Value » si le montant dépasse 1000, sinon « Regular ») pour des analyses futures

In [0]:
# Amount above which a transaction is labelled "High Value".
HIGH_VALUE_THRESHOLD = 1000

# Label each transaction by amount: strictly above the threshold -> "High Value",
# everything else -> "Regular".
is_high_value = col("TransactionAmount") > HIGH_VALUE_THRESHOLD
category_expr = when(is_high_value, "High Value").otherwise("Regular")

df_silver = df_silver.withColumn("transaction_category", category_expr)
display(df_silver)

TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP_Address,MerchantID,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate,transaction_category
TX000414,AC00400,135.03,2023-02-24T17:00:26Z,Debit,San Francisco,D000029,74.58.165.67,M097,ATM,23,Student,180,1,1278.0,2024-11-04T08:08:18Z,Regular
TX000474,AC00464,35.84,2023-03-28T17:23:54Z,Credit,Colorado Springs,D000422,35.73.67.98,M013,Online,54,Retired,172,1,4795.65,2024-11-04T08:10:57Z,Regular
TX001016,AC00416,285.18,2023-03-08T16:39:14Z,Credit,Sacramento,D000515,110.247.153.70,M084,Branch,55,Doctor,150,1,14847.97,2024-11-04T08:09:38Z,Regular
TX001209,AC00267,220.2,2023-09-11T16:25:04Z,Debit,Boston,D000288,23.255.214.209,M047,Online,28,Student,102,1,1075.15,2024-11-04T08:11:18Z,Regular
TX001552,AC00310,757.15,2023-01-17T16:53:08Z,Debit,Raleigh,D000463,200.13.225.150,M085,Online,53,Engineer,38,1,4702.86,2024-11-04T08:08:06Z,Regular
TX001710,AC00215,180.94,2023-07-24T16:41:18Z,Debit,Raleigh,D000456,45.124.239.44,M028,Branch,75,Retired,57,1,7659.99,2024-11-04T08:10:34Z,Regular
TX000026,AC00041,7.49,2023-10-20T17:53:13Z,Credit,Houston,D000671,81.248.235.147,M043,Online,65,Retired,276,1,3634.67,2024-11-04T08:09:44Z,Regular
TX000090,AC00437,392.96,2023-11-20T16:28:09Z,Debit,Kansas City,D000141,186.54.68.246,M022,Online,24,Student,100,1,791.65,2024-11-04T08:08:38Z,Regular
TX000358,AC00459,374.5,2023-07-24T16:45:31Z,Credit,San Francisco,D000045,9.230.41.239,M002,Branch,26,Student,178,1,1671.51,2024-11-04T08:08:37Z,Regular
TX000488,AC00128,710.14,2023-03-08T16:20:43Z,Debit,El Paso,D000667,83.207.115.164,M059,ATM,26,Student,227,1,465.63,2024-11-04T08:12:20Z,Regular


Ajout d'une colonne State

In [0]:
# Collapse the lookup to exactly one State per City before joining.
# City names are not guaranteed to be unique in the mapping; without this step
# a duplicated City would make the left join emit several rows for a single
# transaction. Taking the minimum State keeps the choice deterministic.
# NOTE(review): if the mapping really holds several states for one city name,
# the lookup key should be made unambiguous upstream.
city_to_state = (
    df_city_state
    .groupBy("City")
    .agg({"State": "min"})
    .toDF("City", "State")  # positional rename of the aggregated column
)

# Left join keeps every transaction; locations absent from the mapping
# (including the "Unknown" placeholder) get a null State.
df_silver = df_silver.join(
    city_to_state,
    df_silver["Location"] == city_to_state["City"],
    "left",
).drop("City")
display(df_silver)

TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP_Address,MerchantID,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate,transaction_category,State
TX000414,AC00400,135.03,2023-02-24T17:00:26Z,Debit,San Francisco,D000029,74.58.165.67,M097,ATM,23,Student,180,1,1278.0,2024-11-04T08:08:18Z,Regular,California
TX000474,AC00464,35.84,2023-03-28T17:23:54Z,Credit,Colorado Springs,D000422,35.73.67.98,M013,Online,54,Retired,172,1,4795.65,2024-11-04T08:10:57Z,Regular,Colorado
TX001016,AC00416,285.18,2023-03-08T16:39:14Z,Credit,Sacramento,D000515,110.247.153.70,M084,Branch,55,Doctor,150,1,14847.97,2024-11-04T08:09:38Z,Regular,California
TX001209,AC00267,220.2,2023-09-11T16:25:04Z,Debit,Boston,D000288,23.255.214.209,M047,Online,28,Student,102,1,1075.15,2024-11-04T08:11:18Z,Regular,Massachusetts
TX001552,AC00310,757.15,2023-01-17T16:53:08Z,Debit,Raleigh,D000463,200.13.225.150,M085,Online,53,Engineer,38,1,4702.86,2024-11-04T08:08:06Z,Regular,North Carolina
TX001710,AC00215,180.94,2023-07-24T16:41:18Z,Debit,Raleigh,D000456,45.124.239.44,M028,Branch,75,Retired,57,1,7659.99,2024-11-04T08:10:34Z,Regular,North Carolina
TX000026,AC00041,7.49,2023-10-20T17:53:13Z,Credit,Houston,D000671,81.248.235.147,M043,Online,65,Retired,276,1,3634.67,2024-11-04T08:09:44Z,Regular,Texas
TX000090,AC00437,392.96,2023-11-20T16:28:09Z,Debit,Kansas City,D000141,186.54.68.246,M022,Online,24,Student,100,1,791.65,2024-11-04T08:08:38Z,Regular,Missouri
TX000358,AC00459,374.5,2023-07-24T16:45:31Z,Credit,San Francisco,D000045,9.230.41.239,M002,Branch,26,Student,178,1,1671.51,2024-11-04T08:08:37Z,Regular,California
TX000488,AC00128,710.14,2023-03-08T16:20:43Z,Debit,El Paso,D000667,83.207.115.164,M059,ATM,26,Student,227,1,465.63,2024-11-04T08:12:20Z,Regular,Texas


Déplacement de la colonne State juste après Location pour plus de lisibilité

In [0]:
# Rebuild the column order so that "State" sits immediately after "Location".
columns = list(df_silver.columns)
columns.remove("State")  # take "State" out of its current position

location_index = columns.index("Location")
split_at = location_index + 1
columns = columns[:split_at] + ["State"] + columns[split_at:]

# Project the frame in the new column order and preview the result.
df_silver = df_silver.select(*columns)
display(df_silver)

TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,State,DeviceID,IP_Address,MerchantID,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate,transaction_category
TX000414,AC00400,135.03,2023-02-24T17:00:26Z,Debit,San Francisco,California,D000029,74.58.165.67,M097,ATM,23,Student,180,1,1278.0,2024-11-04T08:08:18Z,Regular
TX000474,AC00464,35.84,2023-03-28T17:23:54Z,Credit,Colorado Springs,Colorado,D000422,35.73.67.98,M013,Online,54,Retired,172,1,4795.65,2024-11-04T08:10:57Z,Regular
TX001016,AC00416,285.18,2023-03-08T16:39:14Z,Credit,Sacramento,California,D000515,110.247.153.70,M084,Branch,55,Doctor,150,1,14847.97,2024-11-04T08:09:38Z,Regular
TX001209,AC00267,220.2,2023-09-11T16:25:04Z,Debit,Boston,Massachusetts,D000288,23.255.214.209,M047,Online,28,Student,102,1,1075.15,2024-11-04T08:11:18Z,Regular
TX001552,AC00310,757.15,2023-01-17T16:53:08Z,Debit,Raleigh,North Carolina,D000463,200.13.225.150,M085,Online,53,Engineer,38,1,4702.86,2024-11-04T08:08:06Z,Regular
TX001710,AC00215,180.94,2023-07-24T16:41:18Z,Debit,Raleigh,North Carolina,D000456,45.124.239.44,M028,Branch,75,Retired,57,1,7659.99,2024-11-04T08:10:34Z,Regular
TX000026,AC00041,7.49,2023-10-20T17:53:13Z,Credit,Houston,Texas,D000671,81.248.235.147,M043,Online,65,Retired,276,1,3634.67,2024-11-04T08:09:44Z,Regular
TX000090,AC00437,392.96,2023-11-20T16:28:09Z,Debit,Kansas City,Missouri,D000141,186.54.68.246,M022,Online,24,Student,100,1,791.65,2024-11-04T08:08:38Z,Regular
TX000358,AC00459,374.5,2023-07-24T16:45:31Z,Credit,San Francisco,California,D000045,9.230.41.239,M002,Branch,26,Student,178,1,1671.51,2024-11-04T08:08:37Z,Regular
TX000488,AC00128,710.14,2023-03-08T16:20:43Z,Debit,El Paso,Texas,D000667,83.207.115.164,M059,ATM,26,Student,227,1,465.63,2024-11-04T08:12:20Z,Regular


## Sauvegarde des données dans une table silver

In [0]:
# Persist the silver frame as the Delta table "silver_bank_transactions",
# replacing whatever the table held before.
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("silver_bank_transactions")
)

## Sauvegarde des données dans un conteneur Azure

In [0]:
# Register the storage account key in the Spark session configuration so that
# wasbs:// paths on this account can be accessed.
account_key_setting = f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net"
spark.conf.set(account_key_setting, storage_account_key)

In [0]:
# Write the silver frame in Delta format to the "silver-data" blob container,
# overwriting any data already stored at that path.
silver_container_path = f"wasbs://silver-data@{storage_account_name}.blob.core.windows.net/silver"
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .save(silver_container_path)
)