# Statistische Geheimhaltung: Cell Key Methode

Joshua Simon, Otto-Friedrich-University Bamberg, joshua-guenter.simon@stud.uni-bamberg.de

## Python Packages

In [57]:
import math
import numpy as np
import pandas as pd
from IPython.display import display

## 1 Test-Daten

### 1.1 Daten Generieren

In [58]:
def generate_data(n, seed):
    """ 
    Generates some random data from sample attributes. 
    Each row gets a random uniformly distributed record key
    between 0 and 1. 
    """
    np.random.seed(seed)
    universities = ["Bamberg", "Wuerzburg", "Muenchen", "Eichstaett"]
    sex = ["m", "w"]

    uni_data = np.random.choice(universities, size=n, replace=True, p=[0.15, 0.3, 0.5, 0.05])
    sex_data = np.random.choice(sex, size=n, replace=True, p=[0.5, 0.5])
    record_key_data = np.random.uniform(low=0.0, high=1.0, size=n)

    return pd.DataFrame(
        list(zip(uni_data, sex_data, record_key_data)),
        columns =['university', 'sex', 'record_key']
    )

In [59]:
data = generate_data(n=10, seed=42)
data.head(n=10)

Unnamed: 0,university,sex,record_key
0,Wuerzburg,m,0.611853
1,Eichstaett,w,0.139494
2,Muenchen,w,0.292145
3,Muenchen,m,0.366362
4,Wuerzburg,m,0.45607
5,Wuerzburg,m,0.785176
6,Bamberg,m,0.199674
7,Muenchen,w,0.514234
8,Muenchen,m,0.592415
9,Muenchen,m,0.04645


### 1.2 Daten tabellieren

In [60]:
def tabulate_data(data, rollout=False):
    """
    Generates the grouped frequency table with summed record keys.
    If the rollout option is true, all of the grouped sums are
    calculated as well.
    """
    grouped_data = data.groupby(["university", "sex"]).agg(["count", "sum"])
    grouped_data.columns = ["count", "record_key_sum"]
    grouped_data.reset_index(inplace=True)

    if rollout:
        rollout_data = data.loc[:, data.columns != "sex"].groupby(["university"]).agg(["count", "sum"])
        rollout_data.columns = ["count", "record_key_sum"]
        rollout_data.reset_index(inplace=True)
        rollout_data["sex"] = "i"
        rollout_data = rollout_data.iloc[:, [0,3,1,2]]

        sum_col = pd.DataFrame({
            "university": ["sum"],
            "sex": ["i"],
            "count": [grouped_data["count"].sum()],
            "record_key_sum": [grouped_data["record_key_sum"].sum()]
        })

        grouped_data = grouped_data.append([rollout_data, sum_col], ignore_index=True)
        grouped_data = grouped_data.sort_values(by=["university", "sex"])

    return grouped_data

In [61]:
table_data = tabulate_data(data)
table_data

Unnamed: 0,university,sex,count,record_key_sum
0,Bamberg,m,1,0.199674
1,Eichstaett,w,1,0.139494
2,Muenchen,m,3,1.005227
3,Muenchen,w,2,0.806379
4,Wuerzburg,m,3,1.853099


## 2 Geheimhaltung mittels CKM

### 2.1 Überlagerungsmatrix

Die Werte der Überlagerungsmatrix (im Code `OVERLAY_MATRIX` und `CHANGE_VECTOR`) wurden von "Die Cell-Key-Methode – ein Geheimhaltungsverfahren" von Jörg Höhne und Julia Höninger übernommen.

In [62]:
OVERLAY_MATRIX = np.matrix([
    [0, 0, 0, 0, 1, 1, 1, 1, 1],
    [0, 0, 0, 0.6875, 0.6875, 0.6875, 0.9375, 1, 1],
    [0, 0, 0.3533, 0.3533, 0.3533, 0.9440, 0.9970, 0.9990, 1],
    [0, 0.1620, 0.1620, 0.1620, 0.6620, 0.8560, 0.9970, 0.9990, 1],
    [0.0870, 0.0870, 0.0870, 0.1920, 0.6920, 0.8590, 0.9970, 0.9990, 1],
    [0, 0, 0.1450, 0.3270, 0.8270, 0.8590, 0.8930, 0.9490, 1],
    [0, 0.0400, 0.1500, 0.2850, 0.7850, 0.8600, 0.9200, 0.9600, 1],
    [0.0200, 0.0600, 0.1450, 0.2500, 0.7500, 0.8550, 0.9400, 0.9800, 1]
])

CHANGE_VECTOR = [-4, -3, -2, -1, 0, 1, 2, 3, 4]

### 2.2 Implementierung der CKM

Mittels der Überlagerungsmatrix werden die Überlagerungswerte für die eigentlichen Tabellenwerten bestimmt.

In [63]:
def get_cell_key(value: float) -> float:
    """ 
    Returns the decimal part of a floating point number. 
    """
    return value - int(value)


def get_len_of_int(value: int) -> int:
    """
    Returns the length (= number of digits) of an positive integer.
    """
    return int(math.log10(value)) + 1


def get_overlay_matrix_value(matrix, vector, values, record_key_sums, seed, p0=1) -> list:
    """
    Returns the overlay value given by the overlay matrix and vector
    for a value-record_key_sum-pair.
    The overlay value is determined by the value ifself and the floating
    point digits of the record_key_sum value. The value is used as a 
    row-index to find the row in the overlay matrix. If the value and 
    therefore the row-index is out of range, the last row of the matrix
    is used. In the selected row, the index of the column, where the 
    record_key_sum is bigger than the column value is then used as in index
    for the overlay vector. The selected value of this vector is the
    overlay value which is to add to the original table value. The probability
    p0 determines the chance, that the overlay value is actually used. Values
    equal to 0 are not changed.
    """
    np.random.seed(seed)
    overlay_col = []
    num_rows, _ = matrix.shape

    for value, record_key_sum in zip(values, record_key_sums):
        if value == 0:
            overlay_col.append(value)
            continue
        elif value < num_rows:
            cell_keys = matrix[value, :]
        else:
            cell_keys = matrix[num_rows - 1, :]

        for index, key in enumerate(cell_keys.tolist()[0]):
            if key > get_cell_key(record_key_sum):
                overlay_value = vector[index]
                break
        else:
            overlay_value = vector[-1]

        if p0 is not None:
            overlay_value = np.random.choice([overlay_value, 0], size=1, p=[1 - p0, p0])[0]
        overlay_col.append(overlay_value)

    return overlay_col

### 2.3 Anwendung auf das Test-Daten Beispiel

Zunächst mit der Bleibewahrscheinlichkeit $p_0 = 0$, d.h. alle Werte werde überlagert. Die Spalte `overlay_value` zeigt den aus der Überlagerungsmatrix abgeleiteten Wert, der auf die Originalwerte zu addieren ist.

In [64]:
overlay_col = get_overlay_matrix_value(
    OVERLAY_MATRIX, CHANGE_VECTOR, 
    table_data["count"], table_data["record_key_sum"], 
    seed=42, p0=0
)

result = table_data.copy()
result["overlay_value"] = overlay_col
result

Unnamed: 0,university,sex,count,record_key_sum,overlay_value
0,Bamberg,m,1,0.199674,-1
1,Eichstaett,w,1,0.139494,-1
2,Muenchen,m,3,1.005227,-3
3,Muenchen,w,2,0.806379,1
4,Wuerzburg,m,3,1.853099,1


Führt man obigen Code mit dem Parameter $p_0 = 1$ werden keine Werte überlagert. Der Überlagerungsvektor ist entsprechend der Null-Vektor.

In [65]:
overlay_col = get_overlay_matrix_value(
    OVERLAY_MATRIX, CHANGE_VECTOR, 
    table_data["count"], table_data["record_key_sum"], 
    seed=42, p0=1
)

result = table_data.copy()
result["overlay_value"] = overlay_col
result

Unnamed: 0,university,sex,count,record_key_sum,overlay_value
0,Bamberg,m,1,0.199674,0
1,Eichstaett,w,1,0.139494,0
2,Muenchen,m,3,1.005227,0
3,Muenchen,w,2,0.806379,0
4,Wuerzburg,m,3,1.853099,0


Neben diesen beiden Extremfällen sind natürlich beliebige Zwischenwerte für $p_0$ möglich, z.B. $p_0 = 0.45$. Es sei an dieser Stelle zu beachten, dass natürlich auch die Überlagungsmatrix den Wert $0$ liefern kann. Bei $p_0 = 0.45$ müssen also nicht zwangsläufig $45$% der Werte überlagert bzw. letztlich verändert werden.

In [66]:
overlay_col = get_overlay_matrix_value(
    OVERLAY_MATRIX, CHANGE_VECTOR, 
    table_data["count"], table_data["record_key_sum"], 
    seed=42, p0=0.45
)

result = table_data.copy()
result["overlay_value"] = overlay_col
result

Unnamed: 0,university,sex,count,record_key_sum,overlay_value
0,Bamberg,m,1,0.199674,-1
1,Eichstaett,w,1,0.139494,0
2,Muenchen,m,3,1.005227,0
3,Muenchen,w,2,0.806379,0
4,Wuerzburg,m,3,1.853099,1


Um das CKM Verfahren abzuschließen, müssen nur noch die Überlagerungswerte auf die Originalwerte addiert werden.

In [67]:
overlay_col = get_overlay_matrix_value(
    OVERLAY_MATRIX, CHANGE_VECTOR, 
    table_data["count"], table_data["record_key_sum"], 
    seed=42, p0=0
)

result = table_data.copy()
result["overlay_value"] = overlay_col
result["new_value"] = overlay_col + result["count"]
result

Unnamed: 0,university,sex,count,record_key_sum,overlay_value,new_value
0,Bamberg,m,1,0.199674,-1,0
1,Eichstaett,w,1,0.139494,-1,0
2,Muenchen,m,3,1.005227,-3,0
3,Muenchen,w,2,0.806379,1,3
4,Wuerzburg,m,3,1.853099,1,4


Auch dies lässt sich in einer Funktion wrappen.

In [68]:
def apply_ckm(data, matrix, vector, value_col_names, record_key_names, seed, p) -> pd.DataFrame:
    """
    Applys the Cell Key Method to the named columns of a data set.
    Therefore the overlay value is calculated and added to the named
    columns.
    Returns a DataFrame with the overlayed data.
    """
    output_data = data.copy()
    for col_name, record_key_name, p0 in zip(value_col_names, record_key_names, p):
        output_data[col_name] = data[col_name] + get_overlay_matrix_value(matrix, vector, data[col_name], data[record_key_name], seed, p0)
    return output_data

In [69]:
overlayed_data = apply_ckm(
    table_data, OVERLAY_MATRIX, CHANGE_VECTOR,
    ["count"], ["record_key_sum"], seed=42, p=[0.45]
)
overlayed_data

Unnamed: 0,university,sex,count,record_key_sum
0,Bamberg,m,0,0.199674
1,Eichstaett,w,1,0.139494
2,Muenchen,m,3,1.005227
3,Muenchen,w,2,0.806379
4,Wuerzburg,m,4,1.853099


## 3 Probleme bei der CKM


In [70]:
# Generate a bigger sample.
data = generate_data(1001, 42)
table_data = tabulate_data(data)
sum_table_data = table_data["count"].sum()

# Add sum column to dataframe.
df_col = pd.DataFrame({
    "university": ["sum"],
    "sex": ["i"],
    "count": [table_data["count"].sum()],
    "record_key_sum": [table_data["record_key_sum"].sum()]
})
table_data = table_data.append(df_col, ignore_index=True)

# Apply CKM.
overlayed_data = apply_ckm(
    table_data, OVERLAY_MATRIX, CHANGE_VECTOR,
    ["count"], ["record_key_sum"], seed=42, p=[0.45]
)
sum_overlayed_data = overlayed_data["count"][:-1].sum()

# Print results.
display(table_data)
print(f"Summe der unveränderten Daten = {sum_table_data}.")
display(overlayed_data)
print(f"Summe der veränderten Daten = {sum_overlayed_data}.")

Unnamed: 0,university,sex,count,record_key_sum
0,Bamberg,m,75,37.445005
1,Bamberg,w,91,44.41989
2,Eichstaett,m,17,7.962167
3,Eichstaett,w,29,12.958318
4,Muenchen,m,258,132.824413
5,Muenchen,w,235,119.962881
6,Wuerzburg,m,135,64.781587
7,Wuerzburg,w,161,83.262634
8,sum,i,1001,503.616894


Summe der unveränderten Daten = 1001.


Unnamed: 0,university,sex,count,record_key_sum
0,Bamberg,m,75,37.445005
1,Bamberg,w,91,44.41989
2,Eichstaett,m,17,7.962167
3,Eichstaett,w,29,12.958318
4,Muenchen,m,259,132.824413
5,Muenchen,w,238,119.962881
6,Wuerzburg,m,136,64.781587
7,Wuerzburg,w,161,83.262634
8,sum,i,1001,503.616894


Summe der veränderten Daten = 1006.


In [71]:
table_data2 = tabulate_data(data, rollout=True)
table_data2

Unnamed: 0,university,sex,count,record_key_sum
8,Bamberg,i,166,81.864894
0,Bamberg,m,75,37.445005
1,Bamberg,w,91,44.41989
9,Eichstaett,i,46,20.920485
2,Eichstaett,m,17,7.962167
3,Eichstaett,w,29,12.958318
10,Muenchen,i,493,252.787294
4,Muenchen,m,258,132.824413
5,Muenchen,w,235,119.962881
11,Wuerzburg,i,296,148.044221


In [72]:
overlayed_data2 = apply_ckm(
    table_data2, OVERLAY_MATRIX, CHANGE_VECTOR,
    ["count"], ["record_key_sum"], seed=42, p=[0]
)
overlayed_data2

Unnamed: 0,university,sex,count,record_key_sum
8,Bamberg,i,168,81.864894
0,Bamberg,m,75,37.445005
1,Bamberg,w,91,44.41989
9,Eichstaett,i,48,20.920485
2,Eichstaett,m,20,7.962167
3,Eichstaett,w,32,12.958318
10,Muenchen,i,494,252.787294
4,Muenchen,m,259,132.824413
5,Muenchen,w,238,119.962881
11,Wuerzburg,i,293,148.044221


In [78]:
overlayed_data2["original"] = table_data2["count"]
overlayed_data2.drop(columns=("record_key_sum"))

Unnamed: 0,university,sex,count,original
8,Bamberg,i,168,166
0,Bamberg,m,75,75
1,Bamberg,w,91,91
9,Eichstaett,i,48,46
2,Eichstaett,m,20,17
3,Eichstaett,w,32,29
10,Muenchen,i,494,493
4,Muenchen,m,259,258
5,Muenchen,w,238,235
11,Wuerzburg,i,293,296


In [73]:
sum_data = overlayed_data2["count"][overlayed_data2["sex"] == "i"]
print(f"Summe der einzelnen Insgesamt-Spalten = {sum_data.iloc[:-1].sum()}.")

Summe der einzelnen Insgesamt-Spalten = 1003.


In [74]:
print("Insgesamt-Werte aus überlagerten Insgesamt-Zeilen.")
display(table_data2[overlayed_data2["sex"] == "i"].iloc[:-1, [0, 2]])
print("Berechnete Insgesamt-Werte aus überlagerten Einzelpositionen.")
display(overlayed_data2[overlayed_data2["sex"] != "i"].groupby(["university"]).agg("sum").iloc[:, :-1])

Insgesamt-Werte aus überlagerten Insgesamt-Zeilen.


Unnamed: 0,university,count
8,Bamberg,166
9,Eichstaett,46
10,Muenchen,493
11,Wuerzburg,296


Berechnete Insgesamt-Werte aus überlagerten Einzelpositionen.


Unnamed: 0_level_0,count
university,Unnamed: 1_level_1
Bamberg,166
Eichstaett,52
Muenchen,497
Wuerzburg,297
