# Experiment 1 – Show Encrypted Data Cannot Be Read

**Goal:** Demonstrate that encrypted data stored in an AWS S3 bucket cannot be read
without the appropriate decryption key / permissions.

We will:

1. List objects in the test S3 bucket.
2. Attempt to read a Parquet file with and *encrypted* column
3. Attempt to read the same encrypted data via Snowflake external stage.


In [1]:
# Imports
import io
import boto3
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
from cryptography.fernet import Fernet
from io import BytesIO
from botocore.exceptions import NoCredentialsError, ClientError
from dotenv import load_dotenv
import snowflake.connector
import os

In [2]:
# Load environment variables
load_dotenv()

True

In [3]:
BUCKET_NAME = "rher-s3-test-bucket"
INPUT_FILE = "sample_sensitive_data.parquet"
OUTPUT_FILE_LOCATION = "sample_sensitive_data_encrypted.parquet"
COLUMN_TO_ENCRYPT = "salary" 
FERNET_KEY = os.getenv("FERNET_KEY")

## Utility Functions

In [4]:
def list_s3_objects(bucket_name):
    try:
        s3 = boto3.client("s3")
        response = s3.list_objects_v2(Bucket=bucket_name)
        if 'Contents' not in response:
            print(f"No objects found in bucket: {bucket_name}")
            return []
        print(f"Objects in bucket '{bucket_name}':")
        for obj in response['Contents']:
            print(f" - {obj['Key']} (LastModified: {obj['LastModified']}, Size: {obj['Size']} bytes)")
        return [obj['Key'] for obj in response['Contents']]

    except NoCredentialsError:
        print("AWS credentials not found. Please configure them first.")
        return []
    except ClientError as e:
        print(f" AWS Client Error: {e}")
        return []
    except Exception as e:
        print(f" Unexpected error: {e}")
        return []

def read_s3_parquet(bucket_name, file_name):
    """Read a Parquet file from S3"""
    s3 = boto3.client("s3")
    response = s3.get_object(Bucket=bucket_name, Key=file_name)
    data = BytesIO(response['Body'].read())
    df = pd.read_parquet(data, engine="pyarrow")
    print(f"\n successfully loaded '{file_name}' into DataFrame.")
    return df

def encrypt_column_in_parquet(bucket_name, input_file, output_file_location, column_to_encrypt, fernet_key):
    """Encrypt a specific column in a Parquet file and upload the result to S3."""
    fernet = Fernet(fernet_key)
    s3 = boto3.client("s3")
    buffer = io.BytesIO()
    s3.download_fileobj(bucket_name, input_file, buffer)
    buffer.seek(0)
    table = pq.read_table(buffer)
    df = table.to_pandas()
    print(f"Encrypting column: {column_to_encrypt}")
    if column_to_encrypt in df.columns:
        df[column_to_encrypt] = df[column_to_encrypt].astype(str).apply(
            lambda x: fernet.encrypt(x.encode()).decode()
        )
    else:
        raise ValueError(f"Column '{column_to_encrypt}' not found in Parquet file")

    output_buffer = io.BytesIO()
    pq.write_table(pa.Table.from_pandas(df), output_buffer)
    output_buffer.seek(0)
    print("Uploading encrypted Parquet file to S3...")
    s3.upload_fileobj(output_buffer, bucket_name, output_file_location)

    print("Done!")
    print(f"Encrypted file uploaded to s3://{bucket_name}/{output_file_location}")


## Encrypt salary column in file

In [5]:

encrypt_column_in_parquet(BUCKET_NAME, INPUT_FILE, OUTPUT_FILE_LOCATION, COLUMN_TO_ENCRYPT, FERNET_KEY)

Encrypting column: salary
Uploading encrypted Parquet file to S3...
Done!
Encrypted file uploaded to s3://rher-s3-test-bucket/sample_sensitive_data_encrypted.parquet


## List all .parquet fiels in bucket

In [6]:
all_keys = list_s3_objects(BUCKET_NAME)

all_parquet_files = [k for k in all_keys if k.endswith(".parquet")]
print("\nParquet objects found:")
for k in all_parquet_files:
    print(" -", k)

encrypted_candidates = [k for k in all_parquet_files if "encrypted" in k.lower()]

print("\nEncrypted candidates:")
for k in encrypted_candidates:
    print(" -", k)


Objects in bucket 'rher-s3-test-bucket':
 - sample_sensitive_data.parquet (LastModified: 2025-11-03 11:16:22+00:00, Size: 2152 bytes)
 - sample_sensitive_data_encrypted.parquet (LastModified: 2025-11-18 12:07:52+00:00, Size: 5303 bytes)

Parquet objects found:
 - sample_sensitive_data.parquet
 - sample_sensitive_data_encrypted.parquet

Encrypted candidates:
 - sample_sensitive_data_encrypted.parquet


In [7]:
raw_data_df = read_s3_parquet(BUCKET_NAME, all_parquet_files[0])
encrypted_data_df = read_s3_parquet(BUCKET_NAME, encrypted_candidates[0])
print(f"DataFrame shape: {raw_data_df.shape}")
print(f"DataFrame shape: {encrypted_data_df.shape}")


 successfully loaded 'sample_sensitive_data.parquet' into DataFrame.

 successfully loaded 'sample_sensitive_data_encrypted.parquet' into DataFrame.
DataFrame shape: (10, 5)
DataFrame shape: (10, 5)


In [8]:
raw_data_df.head()

Unnamed: 0,id,name,email,department,salary
0,1,Alice,alice@example.com,HR,55000
1,2,Bob,bob@example.com,Engineering,72000
2,3,Charlie,charlie@example.com,Marketing,63000
3,4,David,david@example.com,Finance,80000
4,5,Eva,eva@example.com,Engineering,75000


In [9]:
encrypted_data_df.head()

Unnamed: 0,id,name,email,department,salary
0,1,Alice,alice@example.com,HR,gAAAAABpHGGVvpc810HZWYivK2JSp_Y7vUgT4EHx9W9dga...
1,2,Bob,bob@example.com,Engineering,gAAAAABpHGGVDM3bsX8rI3kknYwOXiKEkyrJK0IuxdyYey...
2,3,Charlie,charlie@example.com,Marketing,gAAAAABpHGGVqPZyFggXWj72qJOkNulmy9CfzxT9LCRg_3...
3,4,David,david@example.com,Finance,gAAAAABpHGGVwBlrssRuExSqHJkU9fICKWvsepaQTlquRc...
4,5,Eva,eva@example.com,Engineering,gAAAAABpHGGV7YrpwWkie3y7zVZo15xpUwQidKjYubl7tT...


## Try to run with snowflake integration

In [10]:
SNOWFLAKE_ACCOUNT   = os.getenv("SNOWFLAKE_ACCOUNT")
SNOWFLAKE_USER      = os.getenv("SNOWFLAKE_USER")
SNOWFLAKE_PASSWORD  = os.getenv("SNOWFLAKE_PASSWORD")
SNOWFLAKE_ROLE      = os.getenv("SNOWFLAKE_ROLE")
SNOWFLAKE_WAREHOUSE = os.getenv("SNOWFLAKE_WAREHOUSE")
SNOWFLAKE_DATABASE  = os.getenv("SNOWFLAKE_DATABASE")
SNOWFLAKE_SCHEMA    = os.getenv("SNOWFLAKE_SCHEMA")

STAGE_NAME = "s3_test_stage"          
EXT_TABLE  = "EMPLOYEES_SALARY_ENCRYPTED" 

snowflake_conn = snowflake.connector.connect(
    account=SNOWFLAKE_ACCOUNT,
    user=SNOWFLAKE_USER,
    password=SNOWFLAKE_PASSWORD,
    passcode=863976, ## UPDATE THIS!!
    role=SNOWFLAKE_ROLE,
    warehouse=SNOWFLAKE_WAREHOUSE,
    database=SNOWFLAKE_DATABASE,
    schema=SNOWFLAKE_SCHEMA,
)

DatabaseError: 250001 (08001): Failed to connect to DB: XBB95219.snowflakecomputing.com:443. TOTP Invalid. You have 4 attempts remaining before the user will be blocked from using authenticator apps for a few minutes.

In [11]:
print("=== Experiment 1 (Snowflake) – Attempt to query encrypted data ===")

try:
    with snowflake_conn.cursor() as cur:
        sql = f"SELECT * FROM {EXT_TABLE} LIMIT 10"
        print("Running:", sql)
        cur.execute(sql)
        rows = cur.fetchall()
        print("\nQuery returned rows:")
        for r in rows:
            print(r)

        if not rows:
            print("\nRESULT: No rows returned. Check if the external table could not "
                  "access or parse the encrypted S3 objects.")
        else:
            print("\nRESULT: Rows returned. Inspect if data is readable vs encrypted.")
except Exception as e:
    print("\nSnowflake query failed – this is expected if encryption / KMS permissions prevent access to the S3 objects.")
    print("Error:", e)
finally:
    snowflake_conn.close()


=== Experiment 1 (Snowflake) – Attempt to query encrypted data ===

Snowflake query failed – this is expected if encryption / KMS permissions prevent access to the S3 objects.
Error: name 'snowflake_conn' is not defined


NameError: name 'snowflake_conn' is not defined

# Experiment 2 – Simulate Key Management Service + RBAC

In [19]:
READ_ONLY_ROLE = "RBAC_IN_DATA_LAKES_ROLE_READ_ONLY"
SECURE_ROLE    = "RBAC_IN_DATA_LAKES_ROLE_SECURE_ANALYST"

def decrypt_salary(encrypted_column):
    """Real decrypt (KMS primitive). Only this function knows the key."""
    decrypted_bytes = Fernet(FERNET_KEY).decrypt(encrypted_column.encode())
    return int(decrypted_bytes.decode())

def simulate_kms(df, encrypted_column, role):
    if role == SECURE_ROLE:
        df = df.copy()
        df[encrypted_column] = df[encrypted_column].apply(
            lambda s: decrypt_salary(s)
        )
        return df
    else:
        return df
    

## Test with Read-only Role

In [20]:
simulate_kms(encrypted_data_df, "salary", READ_ONLY_ROLE).head()

Unnamed: 0,id,name,email,department,salary
0,1,Alice,alice@example.com,HR,gAAAAABpHGGVvpc810HZWYivK2JSp_Y7vUgT4EHx9W9dga...
1,2,Bob,bob@example.com,Engineering,gAAAAABpHGGVDM3bsX8rI3kknYwOXiKEkyrJK0IuxdyYey...
2,3,Charlie,charlie@example.com,Marketing,gAAAAABpHGGVqPZyFggXWj72qJOkNulmy9CfzxT9LCRg_3...
3,4,David,david@example.com,Finance,gAAAAABpHGGVwBlrssRuExSqHJkU9fICKWvsepaQTlquRc...
4,5,Eva,eva@example.com,Engineering,gAAAAABpHGGV7YrpwWkie3y7zVZo15xpUwQidKjYubl7tT...


## Test with Secure Analyst Role

In [21]:
simulate_kms(encrypted_data_df, "salary", SECURE_ROLE).head()

Unnamed: 0,id,name,email,department,salary
0,1,Alice,alice@example.com,HR,55000
1,2,Bob,bob@example.com,Engineering,72000
2,3,Charlie,charlie@example.com,Marketing,63000
3,4,David,david@example.com,Finance,80000
4,5,Eva,eva@example.com,Engineering,75000


# Experiment 3 – Simulate Data Acces Layer (DAL)

Experiment simulating Data Acess Layer by:

- Creating a dummy class accessed by: `dal://...`
- The class should:
  - Uniformly enforces RBAC + Key Management
  - Decrypts Parquet columns based on role
  - Returns the parquet files with columns either decrypted/encrypted

We simulate what a DAL *would* look like in Python,
and then show why that doesn't qualify as a real, shared DAL.


In [28]:
class DataAccessLayerClient:
    """
    Hypothetical DAL client

    In the ideal world, both Snowflake and Python would effectively
    call this logic whenever they read data from S3.
    """
    def __init__(self, base_url):
        self.base_url = base_url

    def read_parquet_dal(self, bucket_name, file_name, column, role):
        """
        In our dream world, this would:
        - authenticate the user
        - check RBAC/KMS policy
        - fetch Parquet from underlying S3
        - decrypt protected columns if allowed
        - return a DataFrame or bytes of a Parquet file
        """
        raise NotImplementedError("There is no DAL service backing 'dal://' yet")
    
    def read_parquet_simulate(self, bucket_name, file_name, column, role):
        """
        Simulates reading a Parquet file with DAL logic. That is, it simulates the above behaviour:
        - authenticate the user
        - check RBAC/KMS policy
        - fetch Parquet from underlying bucket and file name
        - decrypt protected columns if allowed
        - return a DataFrame or bytes of a Parquet file
        """
        ### AUTHENTICATION GOES HERE - NOT IMPLEMENTED ###
        df = read_s3_parquet(bucket_name, file_name)
        return simulate_kms(df, column, role)

In [29]:
dal = DataAccessLayerClient(base_url="dal://rbac-sensitive/")
dal.read_parquet_dal(bucket_name = BUCKET_NAME, file_name = OUTPUT_FILE_LOCATION, column="salary", role=SECURE_ROLE)

NotImplementedError: There is no DAL service backing 'dal://' yet

In [31]:
print("Raw dataframe read directly from S3:")
display(encrypted_data_df.head())

print("\nAs read_only (no decryption):")
df_ro = dal.read_parquet_simulate(bucket_name = BUCKET_NAME, file_name = OUTPUT_FILE_LOCATION, column="salary", role=READ_ONLY_ROLE)
display(df_ro.head())

print("\nAs secure_analyst (salary decrypted):")
df_sa = dal.read_parquet_simulate(bucket_name = BUCKET_NAME, file_name = OUTPUT_FILE_LOCATION, column="salary", role=SECURE_ROLE)
display(df_sa.head())


Raw dataframe read directly from S3:


Unnamed: 0,id,name,email,department,salary
0,1,Alice,alice@example.com,HR,gAAAAABpHGGVvpc810HZWYivK2JSp_Y7vUgT4EHx9W9dga...
1,2,Bob,bob@example.com,Engineering,gAAAAABpHGGVDM3bsX8rI3kknYwOXiKEkyrJK0IuxdyYey...
2,3,Charlie,charlie@example.com,Marketing,gAAAAABpHGGVqPZyFggXWj72qJOkNulmy9CfzxT9LCRg_3...
3,4,David,david@example.com,Finance,gAAAAABpHGGVwBlrssRuExSqHJkU9fICKWvsepaQTlquRc...
4,5,Eva,eva@example.com,Engineering,gAAAAABpHGGV7YrpwWkie3y7zVZo15xpUwQidKjYubl7tT...



As read_only (no decryption):

 successfully loaded 'sample_sensitive_data_encrypted.parquet' into DataFrame.


Unnamed: 0,id,name,email,department,salary
0,1,Alice,alice@example.com,HR,gAAAAABpHGGVvpc810HZWYivK2JSp_Y7vUgT4EHx9W9dga...
1,2,Bob,bob@example.com,Engineering,gAAAAABpHGGVDM3bsX8rI3kknYwOXiKEkyrJK0IuxdyYey...
2,3,Charlie,charlie@example.com,Marketing,gAAAAABpHGGVqPZyFggXWj72qJOkNulmy9CfzxT9LCRg_3...
3,4,David,david@example.com,Finance,gAAAAABpHGGVwBlrssRuExSqHJkU9fICKWvsepaQTlquRc...
4,5,Eva,eva@example.com,Engineering,gAAAAABpHGGV7YrpwWkie3y7zVZo15xpUwQidKjYubl7tT...



As secure_analyst (salary decrypted):

 successfully loaded 'sample_sensitive_data_encrypted.parquet' into DataFrame.


Unnamed: 0,id,name,email,department,salary
0,1,Alice,alice@example.com,HR,55000
1,2,Bob,bob@example.com,Engineering,72000
2,3,Charlie,charlie@example.com,Marketing,63000
3,4,David,david@example.com,Finance,80000
4,5,Eva,eva@example.com,Engineering,75000
