# Setup for experiments

In [1]:
# Imports
import io
import boto3
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
from cryptography.fernet import Fernet
from io import BytesIO
from botocore.exceptions import NoCredentialsError, ClientError
from dotenv import load_dotenv
import snowflake.connector
import os

In [2]:
# Load environment variables
load_dotenv()

True

In [3]:
BUCKET_NAME = "rher-s3-test-bucket"
RAW_DATA_FILE_NAME = "employees.parquet"
SALARY_ENCRYPTED_FILE_NAME = "sample_sensitive_data_encrypted.parquet"
COLUMN_TO_ENCRYPT1 = "Salary" 
COLUMN_TO_ENCRYPT2 = "Password"
FERNET_KEY = os.getenv("FERNET_KEY")
ADMIN_KEY = os.getenv("ADMIN_KEY")

## Utility Functions

In [None]:
def list_s3_objects(bucket_name):
    try:
        s3 = boto3.client("s3")
        response = s3.list_objects_v2(Bucket=bucket_name)
        if 'Contents' not in response:
            print(f"No objects found in bucket: {bucket_name}")
            return []
        print(f"Objects in bucket '{bucket_name}':")
        for obj in response['Contents']:
            print(f" - {obj['Key']} (LastModified: {obj['LastModified']}, Size: {obj['Size']} bytes)")
        return [obj['Key'] for obj in response['Contents']]

    except NoCredentialsError:
        print("AWS credentials not found. Please configure them first.")
        return []
    except ClientError as e:
        print(f" AWS Client Error: {e}")
        return []
    except Exception as e:
        print(f" Unexpected error: {e}")
        return []

def retrieve_data(bucket_name, file_name):
    """Read a Parquet file from S3"""
    s3 = boto3.client("s3")
    response = s3.get_object(Bucket=bucket_name, Key=file_name)
    data = BytesIO(response['Body'].read())
    df = pd.read_parquet(data, engine="pyarrow")
    print(f"\n successfully loaded '{file_name}' into DataFrame.")
    return df


def encrypt_column_in_parquet(df, column_to_encrypt, fernet_key):
    """Encrypt a specific column in a Parquet file and upload the result to S3."""
    fernet = Fernet(fernet_key)
    print(f"Encrypting column: {column_to_encrypt}")
    if column_to_encrypt in df.columns:
        df[column_to_encrypt] = df[column_to_encrypt].astype(str).apply(
            lambda x: fernet.encrypt(x.encode()).decode()
        )
    else:
        raise ValueError(f"Column '{column_to_encrypt}' not found in Parquet file")
    return df

    
def upload_encrypted_parquet_to_s3(bucket_name, output_file_location, df):
    """Upload the encrypted DataFrame as a Parquet file to S3."""
    s3 = boto3.client("s3")
    buffer = io.BytesIO()
    table = pa.Table.from_pandas(df)
    pq.write_table(table, buffer)
    buffer.seek(0)
    s3.put_object(Bucket=bucket_name, Key=output_file_location, Body=buffer.getvalue())
    print(f"Encrypted Parquet file uploaded to s3://{bucket_name}/{output_file_location}")


## Encrypt salary column in file

In [5]:
df = retrieve_data(BUCKET_NAME, RAW_DATA_FILE_NAME)
df = encrypt_column_in_parquet(df, COLUMN_TO_ENCRYPT1, FERNET_KEY)
df = encrypt_column_in_parquet(df, COLUMN_TO_ENCRYPT2, ADMIN_KEY)
upload_encrypted_parquet_to_s3(BUCKET_NAME, SALARY_ENCRYPTED_FILE_NAME, df)


 successfully loaded 'employees.parquet' into DataFrame.
Encrypting column: Salary
Encrypting column: Password
Encrypted Parquet file uploaded to s3://rher-s3-test-bucket/sample_sensitive_data_encrypted.parquet


# Experiment 1 – Show Encrypted Data Cannot Be Read

**Goal:** Demonstrate that encrypted data stored in an AWS S3 bucket cannot be read
without the appropriate decryption key / permissions.

We will:

1. List objects in the test S3 bucket.
2. Attempt to read a Parquet file with and *encrypted* column
3. Attempt to read the same encrypted data via Snowflake external stage.


## List all .parquet fiels in bucket

In [6]:
all_keys = list_s3_objects(BUCKET_NAME)

all_parquet_files = [k for k in all_keys if k.endswith(".parquet")]
print("\nParquet objects found:")
for k in all_parquet_files:
    print(" -", k)

encrypted_candidates = [k for k in all_parquet_files if "encrypted" in k.lower()]

print("\nEncrypted candidates:")
for k in encrypted_candidates:
    print(" -", k)


Objects in bucket 'rher-s3-test-bucket':
 - employees.parquet (LastModified: 2025-11-20 14:17:33+00:00, Size: 4796 bytes)
 - sample_sensitive_data.parquet (LastModified: 2025-11-03 11:16:22+00:00, Size: 2152 bytes)
 - sample_sensitive_data_encrypted.parquet (LastModified: 2025-11-25 10:25:17+00:00, Size: 7880 bytes)

Parquet objects found:
 - employees.parquet
 - sample_sensitive_data.parquet
 - sample_sensitive_data_encrypted.parquet

Encrypted candidates:
 - sample_sensitive_data_encrypted.parquet


In [7]:
raw_data_df = retrieve_data(BUCKET_NAME, all_parquet_files[0])
encrypted_data_df = retrieve_data(BUCKET_NAME, encrypted_candidates[0])
print(f"DataFrame shape: {raw_data_df.shape}")
print(f"DataFrame shape: {encrypted_data_df.shape}")


 successfully loaded 'employees.parquet' into DataFrame.

 successfully loaded 'sample_sensitive_data_encrypted.parquet' into DataFrame.
DataFrame shape: (10, 6)
DataFrame shape: (10, 6)


In [8]:
raw_data_df.head()

Unnamed: 0,ID,Name,Email,Department,Salary,Password
0,1,Alice,alice@example.com,HR,55000,2o1e*OBZWW^7Kd
1,2,Bob,bob@example.com,Engineering,72000,GSxZX$zD5Qlf8^
2,3,Charlie,charlie@example.com,Marketing,63000,02QChWYlypvP#i
3,4,David,david@example.com,Finance,80000,m5AKDGR88&T*A8
4,5,Eva,eva@example.com,Engineering,75000,Yt%hUAUT2RX5$A


In [9]:
encrypted_data_df.head()

Unnamed: 0,ID,Name,Email,Department,Salary,Password
0,1,Alice,alice@example.com,HR,gAAAAABpJYQMWuZVcz-2L9J5mFFa17-hOB0XlpiblNur_b...,gAAAAABpJYQM62ZaUt4UNCaYUJIDT_S5Dfv4EIlfWkbQ-T...
1,2,Bob,bob@example.com,Engineering,gAAAAABpJYQMXftaVhfgBaCGPb0vmvjyH5XEGtUh62WZGj...,gAAAAABpJYQMw20EDl_2t5ZvPg7GJnMcs7l1LDBKgP82YM...
2,3,Charlie,charlie@example.com,Marketing,gAAAAABpJYQMWoxHKRJIuARtkAwyYUk46TOOOGOVqBIEer...,gAAAAABpJYQMxVTmf0h7sSIYGzKjLnpg1eWBOIE242GgAt...
3,4,David,david@example.com,Finance,gAAAAABpJYQMpMukaf4SbqijwKfAbILawjhwkMMe9_sApb...,gAAAAABpJYQMF0RYS_YklX4XwIh56ApJzltF0wreEfUT5z...
4,5,Eva,eva@example.com,Engineering,gAAAAABpJYQMp7xA0Q1_VkL2ODSIpl2PCcPYE4ljF252rF...,gAAAAABpJYQMGwRJc6EvrmC0eHypTPozNZffh0H2thafET...


# Experiment 2 – Simulate Key Management Service + RBAC

In [10]:

ENGINEERING = "RBAC_IN_DATA_LAKES_ROLE_READ_ONLY"
HR = "RBAC_IN_DATA_LAKES_ROLE_SECURE_ANALYST"
ADMIN   = "RBAC_IN_DATA_LAKES_ROLE_ADMIN"

def decrypt_salary(encrypted_column):
    """Real decrypt (KMS primitive). Only this function knows the key."""
    decrypted_bytes = Fernet(FERNET_KEY).decrypt(encrypted_column.encode())
    return int(decrypted_bytes.decode())

def get_key(role):
    if role == HR:
        return {
            "fernet_key": FERNET_KEY
        } # or a wrapped key
    elif role == ADMIN:
        return {
            "fernet_key": FERNET_KEY,
            "admin_key": ADMIN_KEY
        }
    elif role == ENGINEERING:
        return None  
    else:
        return None
    
def retrieve_data(bucket_name, file_name):
    """Read a Parquet file from S3"""
    s3 = boto3.client("s3")
    response = s3.get_object(Bucket=bucket_name, Key=file_name)
    data = BytesIO(response['Body'].read())
    df = pd.read_parquet(data, engine="pyarrow")
    print(f"\n successfully loaded '{file_name}' into DataFrame.")
    return df

def decrypt_salary_with_key(key, file, encrypted_column):
    if key is None:
        print("No key provided, cannot decrypt.")
        return file
    fernet = Fernet(key)
    df = file.copy()
    df[encrypted_column] = df[encrypted_column].apply(
        lambda s: int(fernet.decrypt(s.encode()).decode())
    )
    return df

def decrypt_password_with_key(key, file, encrypted_column):
    if key is None:
        print("No key provided, cannot decrypt.")
        return file
    fernet = Fernet(key)
    df = file.copy()
    df[encrypted_column] = df[encrypted_column].apply(
        lambda s: str(fernet.decrypt(s.encode()).decode())
    )
    return df


def simulate_kms(df, encrypted_column, role):
    if role == HR:
        df = df.copy()
        df[encrypted_column] = df[encrypted_column].apply(
            lambda s: decrypt_salary(s)
        )
        return df
    else:
        return df
    
    

## Test with Read-only Role

In [11]:
key = get_key(HR)
print("Retrieved key for HR:", key)
file = retrieve_data(BUCKET_NAME, SALARY_ENCRYPTED_FILE_NAME)
decrypt_salary_with_key(key["fernet_key"], file, "Salary")


Retrieved key for HR: {'fernet_key': 'U1eIY6p4bKjOaMycX1VyMshD0tRmfWqC7xJ0MMT8oO0='}

 successfully loaded 'sample_sensitive_data_encrypted.parquet' into DataFrame.


Unnamed: 0,ID,Name,Email,Department,Salary,Password
0,1,Alice,alice@example.com,HR,55000,gAAAAABpJYQM62ZaUt4UNCaYUJIDT_S5Dfv4EIlfWkbQ-T...
1,2,Bob,bob@example.com,Engineering,72000,gAAAAABpJYQMw20EDl_2t5ZvPg7GJnMcs7l1LDBKgP82YM...
2,3,Charlie,charlie@example.com,Marketing,63000,gAAAAABpJYQMxVTmf0h7sSIYGzKjLnpg1eWBOIE242GgAt...
3,4,David,david@example.com,Finance,80000,gAAAAABpJYQMF0RYS_YklX4XwIh56ApJzltF0wreEfUT5z...
4,5,Eva,eva@example.com,Engineering,75000,gAAAAABpJYQMGwRJc6EvrmC0eHypTPozNZffh0H2thafET...
5,6,Frank,frank@example.com,HR,50679,gAAAAABpJYQMWF8maWhi-xyDhwFGAhl91semUruYJ1FGGG...
6,7,Grace,grace@example.com,Sales,98115,gAAAAABpJYQM8ZcDPvRPXQ02yoJRaUzkYOspLOO9lm77os...
7,8,Hannah,hannah@example.com,Finance,87984,gAAAAABpJYQMpiPdBtYB0F-WBB1wmh7wGJdQX-O1_doWAf...
8,9,Ian,ian@example.com,Marketing,118777,gAAAAABpJYQMvHPUiWQQMs3Zf8_fIoDEkkoWJcbXAY00Yu...
9,10,Julia,julia@example.com,Sales,69355,gAAAAABpJYQMFJRhnRSAT6Ch9PKnJJhRPtoTnD13miAJFn...


## Test with Admin Role

In [12]:
key = get_key(ADMIN)
print("Retrieved key for HR:", key)
file = retrieve_data(BUCKET_NAME, SALARY_ENCRYPTED_FILE_NAME)
df = decrypt_salary_with_key(key["fernet_key"], file, "Salary")
decrypt_password_with_key(key["admin_key"], df, "Password")

Retrieved key for HR: {'fernet_key': 'U1eIY6p4bKjOaMycX1VyMshD0tRmfWqC7xJ0MMT8oO0=', 'admin_key': 'RFlaW9OgOasL0R_aXLSvAB6fuWk2ko6Cgl55UDM1UxQ='}

 successfully loaded 'sample_sensitive_data_encrypted.parquet' into DataFrame.


Unnamed: 0,ID,Name,Email,Department,Salary,Password
0,1,Alice,alice@example.com,HR,55000,2o1e*OBZWW^7Kd
1,2,Bob,bob@example.com,Engineering,72000,GSxZX$zD5Qlf8^
2,3,Charlie,charlie@example.com,Marketing,63000,02QChWYlypvP#i
3,4,David,david@example.com,Finance,80000,m5AKDGR88&T*A8
4,5,Eva,eva@example.com,Engineering,75000,Yt%hUAUT2RX5$A
5,6,Frank,frank@example.com,HR,50679,m@v!lQRysTxUex
6,7,Grace,grace@example.com,Sales,98115,GInHrHc@DFbAfu
7,8,Hannah,hannah@example.com,Finance,87984,pwLBZ6k6n7Pqpd
8,9,Ian,ian@example.com,Marketing,118777,jJZ0Lhlx#q!dK4
9,10,Julia,julia@example.com,Sales,69355,Xn#hz%AMIiwf2b


# Experiment 3 – Simulate Data Acces Layer (DAL)

Experiment simulating Data Acess Layer by:

- Creating a dummy class accessed by: `dal://...`
- The class should:
  - Uniformly enforces RBAC + Key Management
  - Decrypts Parquet columns based on role
  - Returns the parquet files with columns either decrypted/encrypted

We simulate what a DAL *would* look like in Python,
and then show why that doesn't qualify as a real, shared DAL.

        #Token = get_token
        #client.get_data(PATH, TOKEN)
        #role = data_access_layer.authenticate_user(TOKEN)
        #key = kms.getKey(role)
        #data = retrieve_data(PATH)
        #decryptdata = decryptParquet(key, data)
        #dal.return to client

In [13]:
class DataAccessLayerClient:
    """
    Hypothetical DAL client

    In the ideal world, both Snowflake and Python would effectively
    call this logic whenever they read data from S3.
    """
    def __init__(self, base_url):
        self.base_url = base_url
        
    def authenticate_user(self, token):
        print("Authenticating user with token:", token)
        """
        Simulates user authentication.
        In a real DAL, this would verify the token and return user info.
        """
        if not token:
            raise Exception("Authentication failed: No token provided")
        elif token == "Engineer_token":
            return {"user_id": "engineer_user", "roles": ["RBAC_IN_DATA_LAKES_ROLE_READ_ONLY"]}
        elif token == "HR_token":
            return {"user_id": "hr_user", "roles": ["RBAC_IN_DATA_LAKES_ROLE_SECURE_ANALYST"]}
        elif token == "Admin_token":
            return {"user_id": "admin_user", "roles": ["RBAC_IN_DATA_LAKES_ROLE_ADMIN"]}
        else:
            raise Exception("Authentication failed: Invalid token")

    
    def get_data(self, bucket_path, file_name, Token):
        """
        Simulates reading a Parquet file with DAL logic. That is, it simulates the above behaviour:
        - authenticate the user
        - check RBAC/KMS policy
        - fetch Parquet from underlying bucket and file name
        - decrypt protected columns if allowed
        - return a DataFrame or bytes of a Parquet file
        """
        ### AUTHENTICATION GOES HERE - NOT IMPLEMENTED ###
        #get_data(PATH, TOKEN)
        role = self.authenticate_user(Token)["roles"][0]
        
        #getKey returns key or None
        key = get_key(role)
        
        #data = retrive data file
        df = retrieve_data(bucket_path, file_name)
        #data = decryptParquet(key, data)
        if key is not None:
            df = decrypt_salary_with_key(key["fernet_key"], df, "Salary")
        if role == ADMIN:
            df = decrypt_password_with_key(key["admin_key"], df, "Password")
    
        return df
        

In [14]:
dal = DataAccessLayerClient(base_url="dal://rbac-sensitive/")

In [15]:
print("Raw dataframe read directly from S3:")
display(encrypted_data_df.head())

Raw dataframe read directly from S3:


Unnamed: 0,ID,Name,Email,Department,Salary,Password
0,1,Alice,alice@example.com,HR,gAAAAABpJYQMWuZVcz-2L9J5mFFa17-hOB0XlpiblNur_b...,gAAAAABpJYQM62ZaUt4UNCaYUJIDT_S5Dfv4EIlfWkbQ-T...
1,2,Bob,bob@example.com,Engineering,gAAAAABpJYQMXftaVhfgBaCGPb0vmvjyH5XEGtUh62WZGj...,gAAAAABpJYQMw20EDl_2t5ZvPg7GJnMcs7l1LDBKgP82YM...
2,3,Charlie,charlie@example.com,Marketing,gAAAAABpJYQMWoxHKRJIuARtkAwyYUk46TOOOGOVqBIEer...,gAAAAABpJYQMxVTmf0h7sSIYGzKjLnpg1eWBOIE242GgAt...
3,4,David,david@example.com,Finance,gAAAAABpJYQMpMukaf4SbqijwKfAbILawjhwkMMe9_sApb...,gAAAAABpJYQMF0RYS_YklX4XwIh56ApJzltF0wreEfUT5z...
4,5,Eva,eva@example.com,Engineering,gAAAAABpJYQMp7xA0Q1_VkL2ODSIpl2PCcPYE4ljF252rF...,gAAAAABpJYQMGwRJc6EvrmC0eHypTPozNZffh0H2thafET...


In [16]:
Token = "Engineer_token"
data = dal.get_data(bucket_path = BUCKET_NAME, file_name = SALARY_ENCRYPTED_FILE_NAME, Token=Token)
display(data.head())

Authenticating user with token: Engineer_token

 successfully loaded 'sample_sensitive_data_encrypted.parquet' into DataFrame.


Unnamed: 0,ID,Name,Email,Department,Salary,Password
0,1,Alice,alice@example.com,HR,gAAAAABpJYQMWuZVcz-2L9J5mFFa17-hOB0XlpiblNur_b...,gAAAAABpJYQM62ZaUt4UNCaYUJIDT_S5Dfv4EIlfWkbQ-T...
1,2,Bob,bob@example.com,Engineering,gAAAAABpJYQMXftaVhfgBaCGPb0vmvjyH5XEGtUh62WZGj...,gAAAAABpJYQMw20EDl_2t5ZvPg7GJnMcs7l1LDBKgP82YM...
2,3,Charlie,charlie@example.com,Marketing,gAAAAABpJYQMWoxHKRJIuARtkAwyYUk46TOOOGOVqBIEer...,gAAAAABpJYQMxVTmf0h7sSIYGzKjLnpg1eWBOIE242GgAt...
3,4,David,david@example.com,Finance,gAAAAABpJYQMpMukaf4SbqijwKfAbILawjhwkMMe9_sApb...,gAAAAABpJYQMF0RYS_YklX4XwIh56ApJzltF0wreEfUT5z...
4,5,Eva,eva@example.com,Engineering,gAAAAABpJYQMp7xA0Q1_VkL2ODSIpl2PCcPYE4ljF252rF...,gAAAAABpJYQMGwRJc6EvrmC0eHypTPozNZffh0H2thafET...


In [17]:
Token = "HR_token"
data = dal.get_data(bucket_path = BUCKET_NAME, file_name = SALARY_ENCRYPTED_FILE_NAME, Token=Token)
display(data.head())


Authenticating user with token: HR_token

 successfully loaded 'sample_sensitive_data_encrypted.parquet' into DataFrame.


Unnamed: 0,ID,Name,Email,Department,Salary,Password
0,1,Alice,alice@example.com,HR,55000,gAAAAABpJYQM62ZaUt4UNCaYUJIDT_S5Dfv4EIlfWkbQ-T...
1,2,Bob,bob@example.com,Engineering,72000,gAAAAABpJYQMw20EDl_2t5ZvPg7GJnMcs7l1LDBKgP82YM...
2,3,Charlie,charlie@example.com,Marketing,63000,gAAAAABpJYQMxVTmf0h7sSIYGzKjLnpg1eWBOIE242GgAt...
3,4,David,david@example.com,Finance,80000,gAAAAABpJYQMF0RYS_YklX4XwIh56ApJzltF0wreEfUT5z...
4,5,Eva,eva@example.com,Engineering,75000,gAAAAABpJYQMGwRJc6EvrmC0eHypTPozNZffh0H2thafET...


In [18]:
Token = "Admin_token"
data = dal.get_data(bucket_path = BUCKET_NAME, file_name = SALARY_ENCRYPTED_FILE_NAME, Token=Token)
display(data.head())

Authenticating user with token: Admin_token

 successfully loaded 'sample_sensitive_data_encrypted.parquet' into DataFrame.


Unnamed: 0,ID,Name,Email,Department,Salary,Password
0,1,Alice,alice@example.com,HR,55000,2o1e*OBZWW^7Kd
1,2,Bob,bob@example.com,Engineering,72000,GSxZX$zD5Qlf8^
2,3,Charlie,charlie@example.com,Marketing,63000,02QChWYlypvP#i
3,4,David,david@example.com,Finance,80000,m5AKDGR88&T*A8
4,5,Eva,eva@example.com,Engineering,75000,Yt%hUAUT2RX5$A
