In [0]:
#Step 1: Set Up Required Parameters
NUM_CUSTOMER_ACCOUNTS = 500
NUM_ATMS = 100
CUSTOMER_ID_START = 10000000
CUSTOMER_ACCOUNT_ID_START = 100000000
ATM_CASH_ACCOUNT_ID_START = 200000000

In [0]:
#Step 2: Import Libraries
from pyspark.sql.functions import *
from pyspark.sql.types import *
import random
import uuid
from datetime import datetime, timedelta

In [0]:
#Create USERS DataFrame
users_data = [
    (
        i,
        f"user_{i}",
        f"Full Name {i}",
        f"user{i}@bank.com",
        random.choice(["Admin", "Manager", "Clerk"]),
        True,
        datetime.now(),
        datetime.now()
    )
    for i in range(1, 21)
]

users_schema = StructType([
    StructField("user_id", LongType(), False),
    StructField("username", StringType(), False),
    StructField("full_name", StringType(), False),
    StructField("email", StringType(), True),
    StructField("role", StringType(), True),
    StructField("is_active", BooleanType(), False),
    StructField("created_date", TimestampType(), False),
    StructField("last_updated_date", TimestampType(), False),
])

users_df = spark.createDataFrame(users_data, schema=users_schema)

In [0]:
#Create CUSTOMERS DataFrame
customers_data = [
    (
        CUSTOMER_ID_START + i,
        str(uuid.uuid4()),
        f"First{i}",
        f"Last{i}",
        datetime(1980, 1, 1) + timedelta(days=random.randint(0, 15000)),
        random.choice(["Male", "Female"]),
        f"+8801{random.randint(100000000, 999999999)}",
        f"customer{i}@bank.com",
        f"Street {i}",
        "Dhaka",
        "Bangladesh",
        datetime.now(),
        datetime.now()
    )
    for i in range(NUM_CUSTOMER_ACCOUNTS)
]

customers_schema = StructType([
    StructField("customer_id", LongType(), False),
    StructField("customer_unique_id", StringType(), True),
    StructField("first_name", StringType(), False),
    StructField("last_name", StringType(), False),
    StructField("date_of_birth", DateType(), True),
    StructField("gender", StringType(), True),
    StructField("contact_number", StringType(), True),
    StructField("email_address", StringType(), True),
    StructField("address_line1", StringType(), True),
    StructField("city", StringType(), True),
    StructField("country", StringType(), True),
    StructField("created_date", TimestampType(), False),
    StructField("last_updated_date", TimestampType(), False),
])

customers_df = spark.createDataFrame(customers_data, schema=customers_schema)

In [0]:
#Create BRANCHES DataFrame
from pyspark.sql.functions import col
import random
from datetime import datetime, timedelta
from pyspark.sql.types import (
    StructType, StructField, StringType, LongType, DateType, BooleanType, TimestampType
)

# Create BRANCHES DataFrame
users_list = [row.user_id for row in users_df.select("user_id").collect()]

branches_data = [
    (
        f"B{i:03d}",
        f"Branch {i}",
        f"Address Line {i}",
        "Dhaka",
        "Bangladesh",
        f"+8802{random.randint(1000000, 9999999)}",
        random.choice(users_list),
        datetime(2000, 1, 1) + timedelta(days=random.randint(0, 8000)),
        None,
        True,
        datetime.now(),
        datetime.now()
    )
    for i in range(1, 11)
]

branches_schema = StructType([
    StructField("branch_id", StringType(), False),
    StructField("branch_name", StringType(), False),
    StructField("address", StringType(), True),
    StructField("city", StringType(), True),
    StructField("country", StringType(), True),
    StructField("contact_number", StringType(), True),
    StructField("manager_user_id", LongType(), True),
    StructField("opening_date", DateType(), True),
    StructField("closing_date", DateType(), True),
    StructField("is_active", BooleanType(), False),
    StructField("created_date", TimestampType(), False),
    StructField("last_updated_date", TimestampType(), False),
])

branches_df = spark.createDataFrame(branches_data, schema=branches_schema)


In [0]:
# Setup
currencies = ["BDT", "USD"]
account_types = ["Savings", "Checking"]
account_statuses = ["Active", "Dormant", "Closed"]

# GL Code Mapping
gl_code_map = {
    "Savings": "101001",
    "Checking": "101002",
    "ATM": "102001"  # ATM-specific GL account code
}

# Parameters for customers and ATMs
NUM_CUSTOMER_ACCOUNTS = 500
CUSTOMER_ID_START = 10000000
CUSTOMER_ACCOUNT_ID_START = 100000000

NUM_ATMS = 100
ATM_CASH_ACCOUNT_ID_START = 200000000

# Generate customer IDs
customer_id_list = list(range(CUSTOMER_ID_START, CUSTOMER_ID_START + NUM_CUSTOMER_ACCOUNTS))

# Generate Customer Accounts
customer_accounts_data = [
    (
        CUSTOMER_ACCOUNT_ID_START + i,
        f"ACCT{i:08d}",
        customer_id_list[i % len(customer_id_list)],
        account_type := random.choice(account_types),
        currency := random.choice(currencies),
        balance := random.randint(500, 100000),
        open_date := datetime(2015, 1, 1) + timedelta(days=random.randint(0, 3000)),
        close_date := None,
        status := random.choice(account_statuses),
        gl_account_code := gl_code_map[account_type],
        created := datetime.now(),
        created
    )
    for i in range(NUM_CUSTOMER_ACCOUNTS)
]

# Generate ATM Accounts
atm_accounts_data = [
    (
        ATM_CASH_ACCOUNT_ID_START + i,
        f"ATM_ACCT{i:08d}",
        None,  # No customer_id for ATM
        "ATM",
        "BDT",  # Assuming ATM cash account uses BDT
        balance := random.randint(100000, 1000000),
        open_date := datetime(2015, 1, 1) + timedelta(days=random.randint(0, 3000)),
        close_date := None,
        "Active",
        gl_code_map["ATM"],
        created := datetime.now(),
        created
    )
    for i in range(NUM_ATMS)
]

# Merge both datasets
accounts_data = customer_accounts_data + atm_accounts_data

# Define schema
accounts_schema = StructType([
    StructField("account_id", LongType(), False),
    StructField("account_number", StringType(), False),
    StructField("customer_id", LongType(), True),
    StructField("account_type", StringType(), False),
    StructField("currency_code", StringType(), False),
    StructField("current_balance", IntegerType(), False),
    StructField("opening_date", DateType(), False),
    StructField("closing_date", DateType(), True),
    StructField("account_status", StringType(), False),
    StructField("gl_account_code", StringType(), False),
    StructField("created_date", TimestampType(), False),
    StructField("last_updated_date", TimestampType(), False),
])

# Create DataFrame
accounts_df = spark.createDataFrame(accounts_data, schema=accounts_schema)

In [0]:
# Create ACCOUNTS DataFrame
currencies = ["BDT", "USD"]
account_types = ["Savings", "Checking"]
account_statuses = ["Active", "Dormant", "Closed"]
gl_codes = ["GL100", "GL200", "GL300"]

# Let's assume you already have customer_id_list, else create dummy
customer_id_list = list(range(10000001, 10000001 + 500))  # Example 500 customer IDs

accounts_data = [
    (
        CUSTOMER_ACCOUNT_ID_START + i,
        f"ACCT{i:08d}",
        customer_id_list[i % len(customer_id_list)],
        random.choice(account_types),
        random.choice(currencies),
        random.randint(500, 100000),
        datetime(2015, 1, 1) + timedelta(days=random.randint(0, 3000)),
        None,
        random.choice(account_statuses),
        random.choice(gl_codes),
        datetime.now(),
        datetime.now()
    )
    for i in range(NUM_CUSTOMER_ACCOUNTS)
]

accounts_schema = StructType([
    StructField("account_id", LongType(), False),
    StructField("account_number", StringType(), False),
    StructField("customer_id", LongType(), True),
    StructField("account_type", StringType(), False),
    StructField("currency_code", StringType(), False),
    StructField("current_balance", IntegerType(), False),
    StructField("opening_date", DateType(), False),
    StructField("closing_date", DateType(), True),
    StructField("account_status", StringType(), False),
    StructField("gl_account_code", StringType(), False),
    StructField("created_date", TimestampType(), False),
    StructField("last_updated_date", TimestampType(), False),
])

accounts_df = spark.createDataFrame(accounts_data, schema=accounts_schema)


In [0]:
atm_statuses = ["Operational", "Maintenance"]
atm_types = ["Indoor", "Outdoor"]

# Assuming branches_df is already defined and available
branch_ids = [row['branch_id'] for row in branches_df.select("branch_id").collect()]

NUM_ATMS = 100  # Define the number of ATMs
ATM_CASH_ACCOUNT_ID_START = 200000000  # Define the starting cash account ID

atm_data = [
    Row(
        atm_id=f"ATM{i:03d}",
        location_name=f"ATM Location {i}",
        address=f"ATM Street {i}",
        city="Dhaka",
        country="Bangladesh",
        atm_type=random.choice(atm_types),
        atm_status=random.choice(atm_statuses),
        cash_gl_account_id=ATM_CASH_ACCOUNT_ID_START + i,
        branch_id=random.choice(branch_ids),
        last_maintenance_date=datetime.now() - timedelta(days=random.randint(10, 500)),
        created_date=datetime.now(),
        last_updated_date=datetime.now()
    )
    for i in range(NUM_ATMS)
]

atm_schema = StructType([
    StructField("atm_id", StringType(), False),
    StructField("location_name", StringType(), False),
    StructField("address", StringType(), True),
    StructField("city", StringType(), True),
    StructField("country", StringType(), True),
    StructField("atm_type", StringType(), True),
    StructField("atm_status", StringType(), False),
    StructField("cash_gl_account_id", LongType(), False),
    StructField("branch_id", StringType(), True),
    StructField("last_maintenance_date", TimestampType(), True),
    StructField("created_date", TimestampType(), False),
    StructField("last_updated_date", TimestampType(), False),
])

atms_df = spark.createDataFrame(atm_data, schema=atm_schema)


In [0]:
display(users_df.limit(5))
display(customers_df.limit(5))
display(branches_df.limit(5))
display(accounts_df.limit(5))
display(atms_df.limit(5))

user_id,username,full_name,email,role,is_active,created_date,last_updated_date
1,user_1,Full Name 1,user1@bank.com,Manager,True,2025-07-23T16:30:13.271Z,2025-07-23T16:30:13.271Z
2,user_2,Full Name 2,user2@bank.com,Clerk,True,2025-07-23T16:30:13.271Z,2025-07-23T16:30:13.271Z
3,user_3,Full Name 3,user3@bank.com,Clerk,True,2025-07-23T16:30:13.271Z,2025-07-23T16:30:13.271Z
4,user_4,Full Name 4,user4@bank.com,Clerk,True,2025-07-23T16:30:13.271Z,2025-07-23T16:30:13.271Z
5,user_5,Full Name 5,user5@bank.com,Clerk,True,2025-07-23T16:30:13.271Z,2025-07-23T16:30:13.271Z


customer_id,customer_unique_id,first_name,last_name,date_of_birth,gender,contact_number,email_address,address_line1,city,country,created_date,last_updated_date
10000000,6705a116-90ab-4aea-bdfb-e37e48adc2d0,First0,Last0,2012-02-25,Male,8801667113320,customer0@bank.com,Street 0,Dhaka,Bangladesh,2025-07-23T16:30:18.307Z,2025-07-23T16:30:18.307Z
10000001,b593cb11-9caf-4bae-ad36-82ca3a199fdc,First1,Last1,1985-01-27,Male,8801480404228,customer1@bank.com,Street 1,Dhaka,Bangladesh,2025-07-23T16:30:18.307Z,2025-07-23T16:30:18.307Z
10000002,03dbbfbf-6cbf-4d4d-88e5-bd9fd3d1152e,First2,Last2,1980-02-23,Male,8801785758969,customer2@bank.com,Street 2,Dhaka,Bangladesh,2025-07-23T16:30:18.307Z,2025-07-23T16:30:18.307Z
10000003,cf1dc2d9-dfb6-4bfe-a2cd-6a7425c64be1,First3,Last3,2010-02-20,Female,8801374799494,customer3@bank.com,Street 3,Dhaka,Bangladesh,2025-07-23T16:30:18.307Z,2025-07-23T16:30:18.307Z
10000004,8bca6f40-f8d8-4673-9d14-227702af708f,First4,Last4,1991-12-28,Male,8801801245974,customer4@bank.com,Street 4,Dhaka,Bangladesh,2025-07-23T16:30:18.307Z,2025-07-23T16:30:18.307Z


branch_id,branch_name,address,city,country,contact_number,manager_user_id,opening_date,closing_date,is_active,created_date,last_updated_date
B001,Branch 1,Address Line 1,Dhaka,Bangladesh,88022046523,13,2021-05-10,,True,2025-07-23T16:30:45.087Z,2025-07-23T16:30:45.087Z
B002,Branch 2,Address Line 2,Dhaka,Bangladesh,88026419769,10,2009-07-02,,True,2025-07-23T16:30:45.087Z,2025-07-23T16:30:45.087Z
B003,Branch 3,Address Line 3,Dhaka,Bangladesh,88024007428,19,2014-10-14,,True,2025-07-23T16:30:45.087Z,2025-07-23T16:30:45.087Z
B004,Branch 4,Address Line 4,Dhaka,Bangladesh,88025205974,9,2001-04-18,,True,2025-07-23T16:30:45.087Z,2025-07-23T16:30:45.087Z
B005,Branch 5,Address Line 5,Dhaka,Bangladesh,88023988158,6,2003-02-20,,True,2025-07-23T16:30:45.087Z,2025-07-23T16:30:45.087Z


account_id,account_number,customer_id,account_type,currency_code,current_balance,opening_date,closing_date,account_status,gl_account_code,created_date,last_updated_date
100000000,ACCT00000000,10000001,Savings,BDT,53004,2017-01-28,,Active,GL200,2025-07-23T16:31:40.825Z,2025-07-23T16:31:40.825Z
100000001,ACCT00000001,10000002,Checking,USD,88282,2019-09-12,,Closed,GL100,2025-07-23T16:31:40.825Z,2025-07-23T16:31:40.825Z
100000002,ACCT00000002,10000003,Checking,BDT,44311,2019-08-07,,Active,GL200,2025-07-23T16:31:40.825Z,2025-07-23T16:31:40.825Z
100000003,ACCT00000003,10000004,Savings,BDT,33887,2020-12-13,,Closed,GL200,2025-07-23T16:31:40.825Z,2025-07-23T16:31:40.825Z
100000004,ACCT00000004,10000005,Savings,USD,98866,2019-03-22,,Closed,GL300,2025-07-23T16:31:40.825Z,2025-07-23T16:31:40.825Z


atm_id,location_name,address,city,country,atm_type,atm_status,cash_gl_account_id,branch_id,last_maintenance_date,created_date,last_updated_date
ATM000,ATM Location 0,ATM Street 0,Dhaka,Bangladesh,Indoor,Maintenance,200000000,B008,2024-08-25T16:31:47.388Z,2025-07-23T16:31:47.388Z,2025-07-23T16:31:47.388Z
ATM001,ATM Location 1,ATM Street 1,Dhaka,Bangladesh,Indoor,Operational,200000001,B008,2024-06-20T16:31:47.388Z,2025-07-23T16:31:47.388Z,2025-07-23T16:31:47.388Z
ATM002,ATM Location 2,ATM Street 2,Dhaka,Bangladesh,Outdoor,Operational,200000002,B008,2024-07-16T16:31:47.388Z,2025-07-23T16:31:47.388Z,2025-07-23T16:31:47.388Z
ATM003,ATM Location 3,ATM Street 3,Dhaka,Bangladesh,Indoor,Maintenance,200000003,B005,2024-04-20T16:31:47.388Z,2025-07-23T16:31:47.388Z,2025-07-23T16:31:47.388Z
ATM004,ATM Location 4,ATM Street 4,Dhaka,Bangladesh,Indoor,Operational,200000004,B003,2024-04-30T16:31:47.388Z,2025-07-23T16:31:47.388Z,2025-07-23T16:31:47.388Z


In [0]:
users_df.write.mode("overwrite").format("delta").saveAsTable("bank_cbs.bronze.users")
customers_df.write.mode("overwrite").format("delta").saveAsTable("bank_cbs.bronze.customers")
branches_df.write.mode("overwrite").format("delta").saveAsTable("bank_cbs.bronze.branches")
accounts_df.write.mode("overwrite").format("delta").saveAsTable("bank_cbs.bronze.accounts")
atms_df.write.mode("overwrite").format("delta").saveAsTable("bank_cbs.bronze.atms")