In [12]:
# HTML is imported from the public IPython.display module rather than the
# internal IPython.core.display path.
from IPython.display import HTML

# Notebook-wide CSS override: render every element in a 12px monospace font.
# The HTML object must stay the last expression of the cell so it is displayed.
HTML(r"""
<style>
    * {
        font-family: monospace;
        font-size: 12px;
        line-height: normal;
    }
</style>
""")

# Imports

In [13]:
import base64
import os
from datetime import timedelta

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.parquet.encryption as pe
import requests
from pyarrow.tests.parquet.encryption import InMemoryKmsClient

In [14]:
# Sample data: three rows of (fake) personal details, first as a pandas
# DataFrame and then as the Arrow table that gets written to Parquet.
data = dict(
    name=["john", "bob", "alice"],
    birth_day=["1990-01-01"] * 3,
    favorite_color=["red", "purple", "navy"],
)
df = pd.DataFrame(data)
print(df.head())

# Arrow representation of the same rows.
table = pa.Table.from_pandas(df)

    name   birth_day favorite_color
0   john  1990-01-01            red
1    bob  1990-01-01         purple
2  alice  1990-01-01           navy


# KMS Client

## Mock KMS Client

In [15]:
# Secret 128-bit (16-byte) AES master keys used by the mock KMS.
footer_key = b"1234567890123450"
pii1_key = b"2345678901234501"
pii2_key = b"3456789012345012"

# Connection settings for the mock KMS. custom_kms_conf maps each master key
# identifier to its key material as text (presumably looked up by
# InMemoryKmsClient when wrapping/unwrapping keys; verify against its source).
kms_connection_config = pe.KmsConnectionConfig(
    custom_kms_conf={
        "parquet_footer_key": str(footer_key, "UTF-8"),
        "Confidential,Encrypt,PII1": str(pii1_key, "UTF-8"),
        "Confidential,Encrypt,PII2": str(pii2_key, "UTF-8"),
    },
    key_access_token="MyToken",
    kms_instance_url="URL1",
    kms_instance_id="python_client",
)

def kms_factory(kms_connection_configuration):
    """Build the in-memory mock KMS client for the given connection config.

    Parameters
    ----------
    kms_connection_configuration : KmsConnectionConfig
        connection settings handed to InMemoryKmsClient

    Returns
    -------
    InMemoryKmsClient
    """
    client = InMemoryKmsClient(kms_connection_configuration)
    return client

## KMS Implementation

In [10]:
class VaultClient(pe.KmsClient):
    """An example of a KmsClient implementation with master keys
    managed by Hashicorp Vault KMS.

    Keys are wrapped and unwrapped by POSTing to the ``encrypt/`` and
    ``decrypt/`` endpoints under ``/v1/transit/`` of the configured Vault URL.
    See Vault documentation: https://www.vaultproject.io/api/secret/transit
    Not for production use!
    """
    JSON_MEDIA_TYPE = "application/json; charset=utf-8"
    DEFAULT_TRANSIT_ENGINE = "/v1/transit/"
    WRAP_ENDPOINT = "encrypt/"
    UNWRAP_ENDPOINT = "decrypt/"
    TOKEN_HEADER = "X-Vault-Token"
    # Seconds to wait for Vault before giving up; requests has no default
    # timeout, so without this an unreachable server blocks the kernel forever.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self, kms_connection_config):
        """Create a VaultClient instance.

        Parameters
        ----------
        kms_connection_config : KmsConnectionConfig
           configuration parameters to connect to vault,
           e.g. URL and access token
        """
        pe.KmsClient.__init__(self)
        # DEFAULT_TRANSIT_ENGINE already starts with "/", so strip any
        # trailing slash from the base URL to avoid building "...//v1/transit/".
        self.kms_url = kms_connection_config.kms_instance_url.rstrip("/") + \
            VaultClient.DEFAULT_TRANSIT_ENGINE
        self.kms_connection_config = kms_connection_config

    def _post(self, endpoint, master_key_identifier, payload):
        """POST payload to a transit endpoint and return the response 'data'.

        Parameters
        ----------
        endpoint : str
            WRAP_ENDPOINT or UNWRAP_ENDPOINT
        master_key_identifier : str
            name of the master key, appended to the endpoint URL
        payload : dict
            request fields, sent through requests' ``data=`` parameter

        Returns
        -------
        dict
            the 'data' member of the JSON response

        Raises
        ------
        requests.exceptions.HTTPError
            if Vault answers with an error status
        requests.exceptions.Timeout
            if Vault does not answer within REQUEST_TIMEOUT_SECONDS
        """
        headers = {VaultClient.TOKEN_HEADER:
                   self.kms_connection_config.key_access_token}
        response = requests.post(
            self.kms_url + endpoint + master_key_identifier,
            headers=headers,
            data=payload,
            timeout=VaultClient.REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        return response.json()['data']

    def wrap_key(self, key_bytes, master_key_identifier):
        """Call Vault to wrap key key_bytes with key
        identified by master_key_identifier.

        Returns the 'ciphertext' value from Vault's response.
        """
        # The key bytes are sent base64-encoded in the 'plaintext' field.
        response_data = self._post(
            VaultClient.WRAP_ENDPOINT, master_key_identifier,
            {'plaintext': base64.b64encode(key_bytes)})
        return response_data['ciphertext']

    def unwrap_key(self, wrapped_key, master_key_identifier):
        """Call Vault to unwrap wrapped_key with key
        identified by master_key_identifier.

        Returns the unwrapped key as bytes.
        """
        response_data = self._post(
            VaultClient.UNWRAP_ENDPOINT, master_key_identifier,
            {'ciphertext': wrapped_key})
        # The 'plaintext' field comes back base64-encoded; decode to raw bytes.
        return base64.b64decode(response_data['plaintext'])

In [None]:
# Connection settings for the real Vault-backed KMS.
# The access token is read from the VAULT_TOKEN environment variable instead
# of being hardcoded, so the credential never ends up in the saved notebook.
# A KeyError here means VAULT_TOKEN is not set in the kernel's environment.
# NOTE(review): the token that was previously committed in this cell should be
# treated as leaked and rotated.
# NOTE: running this cell rebinds kms_connection_config, replacing the mock
# KMS configuration defined earlier in the notebook.
kms_connection_config = pe.KmsConnectionConfig(
    kms_instance_url="https://datainfra-dp-masking-api.zalopay.vn/",
    key_access_token=os.environ["VAULT_TOKEN"],
)

def kms_factory(kms_connection_configuration):
    """Build a Vault-backed KMS client for the given connection config.

    Note: this rebinds the name kms_factory, replacing the mock-KMS factory
    defined earlier in the notebook once this cell is run.

    Parameters
    ----------
    kms_connection_configuration : KmsConnectionConfig
        connection settings handed to VaultClient

    Returns
    -------
    VaultClient
    """
    client = VaultClient(kms_connection_configuration)
    return client

# Encryption

In [16]:
# Encryption settings: the footer is protected with "parquet_footer_key" and
# left in plaintext, while "name" and "birth_day" are each encrypted under
# their own master key. "favorite_color" is not listed under any column key.
encryption_config = pe.EncryptionConfiguration(
    footer_key="parquet_footer_key",
    column_keys={
        "Confidential,Encrypt,PII1": ["name"],
        "Confidential,Encrypt,PII2": ["birth_day"],
    },
    encryption_algorithm="AES_GCM_CTR_V1",
    data_key_length_bits=128,
    double_wrapping=True,
    plaintext_footer=True,
    cache_lifetime=timedelta(minutes=10.0),
)

In [17]:
# The crypto factory is given kms_factory so it can obtain a KMS client.
crypto_factory = pe.CryptoFactory(kms_factory)

# Combine the KMS connection settings with the encryption settings into the
# properties object that the Parquet writer accepts.
file_encryption_properties = (
    crypto_factory.file_encryption_properties(
        kms_connection_config,
        encryption_config,
    )
)

In [18]:
# Write the sample table to an encrypted Parquet file; the context manager
# closes the writer when the block exits.
with pq.ParquetWriter(
    where="./data/enrypted_data1.parquet",
    schema=table.schema,
    encryption_properties=file_encryption_properties,
) as writer:
    writer.write_table(table)

In [36]:
# Parquet file with plain text footer
# Inspect the file written by the encryption cell above
# ("./data/enrypted_data1.parquet"). It is opened here without any decryption
# properties; the encryption config sets plaintext_footer=True.
# The file is opened once and reused for both printouts.
plain_footer_file = pq.ParquetFile(source="./data/enrypted_data1.parquet")
print(plain_footer_file.metadata)
print(plain_footer_file.schema)

<pyarrow._parquet.FileMetaData object at 0x16a729800>
  created_by: parquet-cpp-arrow version 16.0.0
  num_columns: 3
  num_rows: 3
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 3543
<pyarrow._parquet.ParquetSchema object at 0x111965580>
required group field_id=-1 schema {
  optional binary field_id=-1 name (String);
  optional binary field_id=-1 birth_day (String);
  optional binary field_id=-1 favorite_color (String);
}



In [37]:
# Parquet file with encrypted footer
# Opening this file without decryption properties is expected to fail. The
# error is caught and printed so the failure is shown as the demonstration
# without aborting a Restart & Run All of the notebook.
try:
    encrypted_footer_file = pq.ParquetFile(source="./enrypted_data2.parquet")
    print(encrypted_footer_file.metadata)
    print(encrypted_footer_file.schema)
except OSError as err:
    print(f"{type(err).__name__}: {err}")

OSError: Could not read encrypted metadata, no decryption found in reader's properties

# Decryption

In [40]:
# Decryption settings; only the cache lifetime (5 minutes) is configured.
decryption_config = pe.DecryptionConfiguration(
    cache_lifetime=timedelta(minutes=5.0),
)

# Combine the KMS connection settings with the decryption settings into the
# properties object that the Parquet reader accepts.
file_decryption_properties = (
    crypto_factory.file_decryption_properties(
        kms_connection_config,
        decryption_config,
    )
)

In [45]:
# Open the encrypted-footer file, this time supplying decryption properties.
file_path = "./enrypted_data2.parquet"
result = pq.ParquetFile(
    file_path,
    decryption_properties=file_decryption_properties,
)

In [51]:
# Display the schema of the file opened with decryption properties.
result.schema

<pyarrow._parquet.ParquetSchema object at 0x173008680>
required group field_id=-1 schema {
  optional binary field_id=-1 name (String);
  optional binary field_id=-1 birth_day (String);
  optional binary field_id=-1 favorite_color (String);
}

In [52]:
# Read the file's contents and convert them to a pandas DataFrame for display.
result.read().to_pandas()

Unnamed: 0,name,birth_day,favorite_color
0,john,1990-01-01,red
1,bob,1990-01-01,purple
2,alice,1990-01-01,navy
