In [5]:
import numpy as np
import pandas as pd

class Provenance:
    """Store provenance captured for data-frame operations.

    Two dictionaries are maintained, both keyed by operation name:
    ``provenance_data`` holds record-level tensors and
    ``attribute_provenance`` holds column-level bitsets.
    """

    def __init__(self):
        # Record-level provenance, keyed by operation name.
        self.provenance_data = {}
        # Column-level provenance, keyed by operation name.
        self.attribute_provenance = {}

    def capture_vertical_reduction(self, din, dout, retained_columns):
        """Record provenance for an operation that drops columns.

        Parameters
        ----------
        din : pandas.DataFrame
            Frame before the reduction.
        dout : pandas.DataFrame
            Frame after the reduction.
        retained_columns : container
            Column labels kept in ``dout``; membership is tested with ``in``.

        Returns
        -------
        numpy.ndarray
            Integer array of shape ``(len(dout), len(din))`` with ones on the
            main diagonal, i.e. output row ``i`` is linked to input row ``i``.

        Both results are stored under the key ``"vertical_reduction"``,
        replacing anything captured earlier under that key.
        """
        rows_in = len(din)
        rows_out = len(dout)

        # One flag per input column, in input column order: 1 = kept, 0 = dropped.
        column_flags = []
        for column in din.columns:
            column_flags.append(int(column in retained_columns))

        row_tensor = np.eye(rows_out, rows_in, dtype=int)

        self.provenance_data["vertical_reduction"] = row_tensor
        self.attribute_provenance["vertical_reduction"] = column_flags

        return row_tensor

    def get_provenance(self, operation_key):
        """Return the record-level provenance for ``operation_key``, or None."""
        return self.provenance_data.get(operation_key)

    def get_attribute_provenance(self, operation_key):
        """Return the column-level provenance for ``operation_key``, or None."""
        return self.attribute_provenance.get(operation_key)

    def __str__(self):
        """Render the record-level provenance dictionary as text."""
        return f"{self.provenance_data}"


# Example usage: capture provenance for dropping column 'B'

prov = Provenance()

# Input frame with three columns
df_din = pd.DataFrame(
    {'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}
)

# Vertical reduction: the output keeps only columns 'A' and 'C'
df_dout = df_din[['A', 'C']]

# Record which rows and columns of the input survive in the output
prov.capture_vertical_reduction(df_din, df_dout, retained_columns=['A', 'C'])

# Show the stored record tensor and column bitset
print(
    "Provenance Tensor for Vertical Reduction:",
    prov.get_provenance("vertical_reduction"),
    sep="\n",
)

print(
    "\nAttribute Provenance for Vertical Reduction:",
    prov.get_attribute_provenance("vertical_reduction"),
    sep="\n",
)


Provenance Tensor for Vertical Reduction:
[[1 0 0]
 [0 1 0]
 [0 0 1]]

Attribute Provenance for Vertical Reduction:
[1, 0, 1]


In [6]:
import numpy as np
import pandas as pd

class Provenance:
    """Store provenance captured for a column-adding (vertical augmentation) step.

    NOTE(review): this cell defines ``Provenance`` again, so instances created
    after it runs only offer the methods defined here.
    """

    def __init__(self):
        # Captured provenance, keyed by operation name.
        self.provenance_data = {}

    def capture_vertical_augmentation(self, din, dout):
        """Record provenance for an operation that adds columns.

        Parameters
        ----------
        din : pandas.DataFrame
            Frame before the augmentation.
        dout : pandas.DataFrame
            Frame after the augmentation.

        Returns
        -------
        tuple of numpy.ndarray
            ``(record_tensor, output_bitset)``. ``record_tensor`` has shape
            ``(len(dout), len(din))`` with ones on the main diagonal.
            ``output_bitset`` has one integer per column of ``dout``:
            1 when the label is absent from ``din``, otherwise 0.

        The pair is also stored under ``provenance_data['vertical_augmentation']``.
        """
        # Flag every output column whose label does not occur in the input.
        is_new_column = ~dout.columns.isin(din.columns)
        output_bitset = is_new_column.astype(int)

        # Output row i is linked to input row i.
        provenance_tensor = np.eye(len(dout), len(din), dtype=int)

        self.provenance_data['vertical_augmentation'] = dict(
            record_tensor=provenance_tensor,
            output_bitset=output_bitset,
        )

        return provenance_tensor, output_bitset

In [7]:
# Source frame
din = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})

# Vertical augmentation: derive two new columns, leaving din untouched
dout = din.assign(
    D=din['A'] + din['C'],              # 'D' is the sum of 'A' and 'C'
    E=din['A'] + din['C'] + din['B'],   # 'E' is the sum of 'A', 'C' and 'B'
)

prov = Provenance()
provenance_tensor, new_column_bitset = prov.capture_vertical_augmentation(din, dout)

print("Provenance Tensor:\n", provenance_tensor)
print("Bitset for New Columns:", new_column_bitset)

Provenance Tensor:
 [[1 0 0]
 [0 1 0]
 [0 0 1]]
Bitset for New Columns: [0 0 0 1 1]


In [8]:
import pandas as pd
import numpy as np

class Provenance:
    """Capture record- and attribute-level provenance for a two-input join.

    Rows of the join result are traced back to rows of the inputs by comparing
    row hashes, so a result row is linked to the input rows whose values equal
    its projection onto that input's columns.
    """

    def __init__(self):
        # Captured provenance, keyed by operation name (e.g. 'join').
        self.provenance_data = {}

    def hash_row(self, row):
        """Return the hash of the row's values, taken in order."""
        return hash(tuple(row))

    def _row_hashes(self, df):
        """Return the hash of every row of ``df`` as a list, in row order."""
        # itertuples yields nothing for an empty frame and does not coerce the
        # values of a row to a common dtype, unlike a row-wise ``apply``.
        return [
            self.hash_row(values)
            for values in df.itertuples(index=False, name=None)
        ]

    def _positions_by_hash(self, df):
        """Map each row hash of ``df`` to the positions of the rows having it."""
        positions = {}
        for position, row_hash in enumerate(self._row_hashes(df)):
            positions.setdefault(row_hash, []).append(position)
        return positions

    def generate_hashed_df(self, df):
        """Return a copy of ``df`` with a ``'hash'`` column of per-row hashes."""
        hashed_df = df.copy()
        hashed_df['hash'] = self._row_hashes(df)
        return hashed_df

    def capture_join(self, dl, dr, do, join_condition):
        """Record provenance for a join of ``dl`` and ``dr`` that produced ``do``.

        Parameters
        ----------
        dl, dr : pandas.DataFrame
            Left and right inputs of the join.
        do : pandas.DataFrame
            Join result. It must contain every column label of ``dl`` and of
            ``dr`` (a result with suffixed overlapping columns does not).
        join_condition : object
            Not used by the computation; kept so existing callers keep working.

        Returns
        -------
        tuple of numpy.ndarray
            ``(tensor_prov, bitset_dl, bitset_dr)``. ``tensor_prov`` has shape
            ``(len(do), len(dl), len(dr))``; entry ``[i, l, r]`` is 1 when
            result row ``i`` is attributed to left row ``l`` and right row
            ``r`` (positions, not index labels). Each bitset has one integer
            per column of ``do``: 1 when that label is a column of the
            respective input.

        Raises
        ------
        KeyError
            If ``do`` lacks a column of an input, or if a result row matches no
            row of an input (for example rows padded with missing values by an
            outer join — presumably; confirm against the intended join types).

        Notes
        -----
        The tensor is dense, so its size grows with the product of the three
        row counts. The triple is also stored under ``provenance_data['join']``.
        """
        n_result, n_left, n_right = len(do), len(dl), len(dr)

        # Positions of every distinct row of each input, keyed by row hash.
        left_positions = self._positions_by_hash(dl)
        right_positions = self._positions_by_hash(dr)

        # Hashes of each result row projected onto each input's columns.
        left_hashes = self._row_hashes(do[dl.columns])
        right_hashes = self._row_hashes(do[dr.columns])

        tensor_prov = np.zeros((n_result, n_left, n_right), dtype=int)

        # How many result rows with a given (left, right) projection were seen.
        seen = {}
        for i, (left_hash, right_hash) in enumerate(zip(left_hashes, right_hashes)):
            if left_hash not in left_positions:
                raise KeyError(
                    f"result row at position {i} matches no row of the left input"
                )
            if right_hash not in right_positions:
                raise KeyError(
                    f"result row at position {i} matches no row of the right input"
                )
            lefts = left_positions[left_hash]
            rights = right_positions[right_hash]

            # Duplicate input rows yield indistinguishable result rows; give the
            # k-th such row the k-th candidate (left, right) pair so that every
            # duplicate input row is credited instead of only the first one.
            key = (left_hash, right_hash)
            k = seen.get(key, 0)
            seen[key] = k + 1
            left_slot, right_slot = divmod(
                k % (len(lefts) * len(rights)), len(rights)
            )
            tensor_prov[i, lefts[left_slot], rights[right_slot]] = 1

        # Column-level provenance: which result columns exist in each input.
        bitset_dl = do.columns.isin(dl.columns).astype(int)
        bitset_dr = do.columns.isin(dr.columns).astype(int)

        self.provenance_data['join'] = {
            'record_tensor': tensor_prov,
            'bitset_dl': bitset_dl,
            'bitset_dr': bitset_dr
        }

        return tensor_prov, bitset_dl, bitset_dr

In [9]:
# Example usage: inner join of two small frames on 'ID'
df_left = pd.DataFrame({'ID': [1, 2, 3], 'Value_Left': ['A', 'B', 'C']})
df_right = pd.DataFrame({'ID': [2, 3, 4], 'Value_Right': ['D', 'E', 'F']})

# Only IDs 2 and 3 occur on both sides
df_result = df_left.merge(df_right, on='ID')

# Instantiate the Provenance object and capture the join provenance
prov = Provenance()
tensor_prov, bitset_dl, bitset_dr = prov.capture_join(df_left, df_right, df_result, 'ID')

# Display the results
print("Provenance Tensor:", tensor_prov, sep="\n")
print("\nBitset for Left DataFrame:", bitset_dl, sep="\n")
print("\nBitset for Right DataFrame:", bitset_dr, sep="\n")

Provenance Tensor:
[[[0 0 0]
  [1 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 1 0]]]

Bitset for Left DataFrame:
[1 1 0]

Bitset for Right DataFrame:
[1 0 1]
