### Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import math
import time
import os
import sys

### Importing the dataset

In [2]:
# Column labels for the dataset; the last one ("Rings") is the prediction target
# used later in the notebook.
names = (
    "Sex",
    "Length",
    "Diameter",
    "Height",
    "Whole-weight",
    "Shucked-weight",
    "Viscera-weight",
    "Shell-weight",
    "Rings"
)
# header=None: the first line of the file is treated as data, so the column
# labels are supplied through names instead.
original_table = pd.read_csv("abalone.data", header=None, names=names)
# Preview the first rows as the cell output.
original_table.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole-weight,Shucked-weight,Viscera-weight,Shell-weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [3]:
def reScale(rings):
    """
    Map a ring count to a coarse bucket label.

    Buckets are five rings wide: values up to 5 give 0, up to 10 give 1,
    and so on until values up to 30 give 5.

    Returns the bucket number (0-5), or None when rings is above 30
    (or does not compare as <= to any threshold).
    """
    # Inclusive upper bound of each bucket; the position is the bucket label.
    upper_bounds = (5, 10, 15, 20, 25, 30)
    for bucket, bound in enumerate(upper_bounds):
        if rings <= bound:
            return bucket
    return None
# Replace the last column (Rings) with its reScale bucket label.
# NOTE(review): this overwrites original_table in place, so the cell is not
# idempotent - running it a second time feeds the labels 0-5 back into reScale,
# which maps all of them to 0. Re-run the loading cell first.
original_table.iloc[:,-1] = original_table.iloc[:,-1].apply(reScale)

In [4]:
# Sanity check: list the distinct bucket labels present after rescaling.
np.unique(original_table["Rings"].values)

array([0, 1, 2, 3, 4, 5], dtype=int64)

### Components: Mondrian partitioning with k-anonymity, l-diversity and t-closeness checks

In [5]:
def init(data):
    """
    Rebuild the per-attribute lookup tables (module-level globals) from data.

    data: object exposing .values as an iterable of records (a DataFrame in
          this notebook); record[i] is the value of column i.

    For every column index i in QI_INDEX this sets:
      QI_ORDER[i]: the distinct values of the column, sorted ascending
      QI_DICT[i]:  mapping value -> position in QI_ORDER[i]
      QI_RANGE[i]: largest value minus smallest value
    Slots for columns that are not in QI_INDEX keep an empty list / empty dict / 0.

    Raises IndexError if a QI column has no values (empty data).
    """
    global QI_DICT, QI_RANGE, QI_ORDER
    column_count = len(names)
    # Build one fresh container per column. `[list()] * n` would make every slot
    # refer to the same list object, so mutating one slot would change them all.
    QI_DICT = [dict() for _ in range(column_count)]
    QI_ORDER = [[] for _ in range(column_count)]
    QI_RANGE = [0] * column_count

    # Collect the distinct values of every quasi-identifier column.
    att_values = [set() for _ in range(column_count)]
    for record in data.values:
        for i in QI_INDEX:
            att_values[i].add(record[i])

    for i in QI_INDEX:
        value_list = sorted(att_values[i])
        QI_RANGE[i] = value_list[-1] - value_list[0]
        QI_ORDER[i] = value_list
        QI_DICT[i] = {qi_value: index for index, qi_value in enumerate(value_list)}

In [6]:
class Partition(object):

    """
    A group of records (equivalence class) together with its bounds.

    self.member: the records held by the group
    self.low: per-attribute lower bound, kept as an index rather than a raw value
    self.high: per-attribute upper bound, kept as an index rather than a raw value
    self.allow: per-attribute flag, 1 while the group may still be split on it
    """

    def __init__(self, data, low, high):
        """
        Keep data as the member container (not copied) and store private
        copies of the low and high bound sequences.
        """
        self.member = data
        self.allow = [1 for _ in range(len(names))]
        self.low = [*low]
        self.high = [*high]

    def add_record(self, record, dim):
        """
        Append a single record to the group; dim is accepted but not used.
        """
        self.member.append(record)

    def add_multiple_record(self, records, dim):
        """
        Append every record of an iterable to the group, one at a time.
        """
        for item in records:
            self.add_record(item, dim)

    def __len__(self):
        """
        Number of records currently held.
        """
        return len(self.member)

In [7]:
def get_normalized_width(partition, dim):
    """
    Width of the partition on attribute dim, as a fraction of the attribute's
    full range (QI_RANGE[dim]); similar to NCP.

    partition.low[dim] and partition.high[dim] are positions in QI_ORDER[dim].
    Returns 1 when the partition spans the whole range. That comparison is done
    before dividing, so a zero range yields 1 rather than a division by zero.
    """
    ordered_values = QI_ORDER[dim]
    upper = ordered_values[partition.high[dim]]
    lower = ordered_values[partition.low[dim]]
    span = upper - lower
    full_range = QI_RANGE[dim]
    return 1 if span == full_range else span * 1.0 / full_range

def choose_dimension(partition):
    """
    Pick the attribute on which the partition is widest.

    Only attributes in QI_INDEX whose partition.allow flag is non-zero are
    considered. Widths come from get_normalized_width. Ties keep the attribute
    that appears first in QI_INDEX.

    Returns the chosen column index, or -1 when no attribute qualifies.
    """
    best_dim, best_width = -1, -1
    for dim in QI_INDEX:
        if partition.allow[dim] != 0:
            candidate = get_normalized_width(partition, dim)
            if candidate > best_width:
                best_dim, best_width = dim, candidate
    return best_dim

In [8]:
def frequency_set(partition, dim):
    """
    Count how often each value of attribute dim occurs in partition.member.

    Returns a dict mapping value -> number of records carrying that value,
    with keys in order of first appearance.
    """
    counts = {}
    for row in partition.member:
        key = row[dim]
        counts[key] = counts.get(key, 0) + 1
    return counts
def find_median(partition, dim, k=None):
    """
    Locate the median split point of the partition on attribute dim.

    partition: object whose .member holds the records
    dim:       column index to split on
    k:         minimum group size; a split is only proposed when half of the
               records (total // 2) is at least k. Defaults to the global
               INPUT_K, which is what this function used before the parameter
               existed. Callers anonymizing with their own k should pass it.

    Returns (split_val, next_val, lowest, highest):
      split_val: value at which the running frequency first reaches total // 2
      next_val:  the next larger distinct value, or split_val itself when
                 split_val is already the largest value
      lowest / highest: smallest and largest value present in the partition
    When no split is possible (too few records, or at most one distinct value)
    split_val and next_val are ''. For an empty partition all four are ''.
    """
    if k is None:
        k = INPUT_K

    # The frequency set gives the distinct values and their counts in one pass.
    frequency = frequency_set(partition, dim)
    value_list = sorted(frequency)
    total = sum(frequency.values())
    middle = total // 2

    if middle < k or len(value_list) <= 1:
        if not value_list:
            return '', '', '', ''
        return '', '', value_list[0], value_list[-1]

    split_val = ''
    split_index = 0
    running_total = 0
    for i, qi_value in enumerate(value_list):
        running_total += frequency[qi_value]
        if running_total >= middle:
            split_val = qi_value
            split_index = i
            break
    else:
        print("Error: cannot find split_val")

    if split_index + 1 < len(value_list):
        next_val = value_list[split_index + 1]
    else:
        # The median falls on the largest value, e.g. [1, 2, 3, 4, 4, 4, 4]:
        # there is nothing to the right, so signal it with next_val == split_val.
        next_val = split_val
    return (split_val, next_val, value_list[0], value_list[-1])

In [9]:
def lDiversityChecker(partition, l):
    """
    Entropy-style l-diversity test on the sensitive attribute.

    Takes the SA_INDEX value of every record in partition.member, computes the
    base-10 entropy of the resulting value distribution, and compares it with
    log10(l).

    Returns True when the entropy is at least log10(l), else False.
    """
    sensitive = np.array([row[SA_INDEX] for row in partition.member])
    _, occurrences = np.unique(sensitive, return_counts=True)
    share = occurrences / np.sum(occurrences)
    entropy = -np.sum(share * np.log10(share))
    return bool(entropy >= np.log10(l))

def tClosenessChecker(partition, t):
    """
    t-closeness style test on the sensitive attribute.

    The distinct SA values found in the partition are stretched over the global
    class list SA_CLASS: each run of part_size consecutive global classes is
    paired with one local value. The absolute differences of those pairs are
    summed, and the sum is scaled by 1 / (SA_CLASS_NUM * (SA_CLASS_NUM - 1)).

    Returns True when the scaled sum is at most t, else False.
    Raises ZeroDivisionError for a partition without records.
    """
    sensitive = np.array([row[SA_INDEX] for row in partition.member])
    local_values = np.unique(sensitive)

    # part_size: how many global SA classes are assigned to each distinct SA
    # value of this partition (global class count divided by local distinct
    # count, rounded up).
    part_size = math.ceil(SA_CLASS_NUM/local_values.size)

    distance = sum(
        abs(global_value - local_values[math.floor(position/part_size)])
        for position, global_value in enumerate(SA_CLASS)
    )
    scaled = (1/SA_CLASS_NUM) * (distance/(SA_CLASS_NUM-1))
    return bool(scaled <= t)

In [10]:
def _meets_privacy_constraints(group, k, l, t):
    """True when group holds at least k records and passes the l-diversity and t-closeness checkers."""
    return (
        len(group) >= k
        and lDiversityChecker(partition=group, l=l)
        and tClosenessChecker(partition=group, t=t)
    )


def anonymize(partition, k, l, t):
    """
    Recursively split partition at the median until no further split is allowed.

    partition: Partition to process; its low/high/allow lists are updated in place
    k:         minimum number of records per resulting group
    l:         threshold forwarded to lDiversityChecker
    t:         threshold forwarded to tClosenessChecker

    Every partition that cannot be split any further is appended to the global
    RESULT list. Nothing is returned.
    """
    allow_count = sum(partition.allow[i] for i in QI_INDEX)
    if allow_count == 0:
        RESULT.append(partition)
        return

    # Each pass either splits (and returns) or rules out one attribute, so
    # allow_count passes are enough to try every attribute still allowed.
    for _ in range(allow_count):
        # choose the widest attribute that may still be split
        dim = choose_dimension(partition)
        if dim == -1:
            # No usable attribute. Carrying on with dim=-1 would silently index
            # the last column instead of a quasi-identifier, so stop splitting
            # and emit the partition as it is.
            print("Error: dim=-1")
            break

        (split_val, next_val, low, high) = find_median(partition, dim)
        # Tighten the parent's bounds to the values actually present.
        if low != '':
            partition.low[dim] = QI_DICT[dim][low]
            partition.high[dim] = QI_DICT[dim][high]
        if split_val == '' or split_val == next_val:
            # cannot split on this attribute
            partition.allow[dim] = 0
            continue

        # Split the group at the median:
        # lhs covers low .. split_val, rhs covers next_val .. high
        # (bounds are QI_DICT positions, not raw values).
        mid_index = QI_DICT[dim][split_val]
        lhs_high = partition.high[:]
        rhs_low = partition.low[:]
        lhs_high[dim] = mid_index
        rhs_low[dim] = QI_DICT[dim][next_val]
        lhs = Partition([], partition.low, lhs_high)
        rhs = Partition([], rhs_low, partition.high)

        # Distribute the records by their QI_DICT position on dim.
        for record in partition.member:
            pos = QI_DICT[dim][record[dim]]
            if pos <= mid_index:
                lhs.add_record(record, dim)
            else:
                rhs.add_record(record, dim)

        # Both halves must satisfy k-anonymity, l-diversity and t-closeness;
        # otherwise this attribute is ruled out for the partition.
        if not (_meets_privacy_constraints(lhs, k, l, t)
                and _meets_privacy_constraints(rhs, k, l, t)):
            partition.allow[dim] = 0
            continue

        # anonymize the sub-partitions
        anonymize(lhs, k, l, t)
        anonymize(rhs, k, l, t)
        return
    RESULT.append(partition)
def merge_qi_value(left_value, right_value, connect_str = "~"):
    """
    Render a generalized value for the closed range left_value..right_value.

    Equal bounds collapse to the plain value. Otherwise the result is
    "[left<connect_str>right]", e.g. "[15~17]".
    """
    if left_value == right_value:
        return f"{left_value}"
    return f"[{left_value}{connect_str}{right_value}]"

In [11]:
def mondrian(data, k, l, t):
    """
    Anonymize data with Mondrian partitioning and generalize its QI columns.

    data: DataFrame-like object; data.values supplies the records
    k:    minimum group size
    l:    l-diversity threshold (see lDiversityChecker)
    t:    t-closeness threshold (see tClosenessChecker)

    Returns a list of records in which every column listed in QI_INDEX has been
    replaced by the string produced by merge_qi_value for the record's group
    (e.g. "[15~17]"). Records are grouped by partition, so the original row
    order is not kept. The records are the rows of data.values and are modified
    in place.

    Prints the parameters and the elapsed time in milliseconds.
    """
    global RESULT
    # A monotonic clock is the right tool for measuring a duration.
    startTime = time.perf_counter()
    # Start from a clean slate so that a second call does not also return the
    # partitions left over from a previous run.
    RESULT = []
    init(data)

    # low : position of the first value in each QI_ORDER entry
    # high : position of the last value in each QI_ORDER entry
    low = [0] * len(QI_ORDER)
    high = [(len(v) - 1) for v in QI_ORDER]
    whole_partition = Partition(data.values, low, high)

    # begin mondrian
    anonymize(whole_partition, k, l, t)
    result = []
    for partition in RESULT:
        # The generalized labels depend only on the partition's bounds, so they
        # are built once per partition instead of once per record.
        generalized = {
            index: merge_qi_value(QI_ORDER[index][partition.low[index]],
                                  QI_ORDER[index][partition.high[index]])
            for index in QI_INDEX
        }
        for record in partition.member:
            for index, label in generalized.items():
                record[index] = label
            result.append(record)
    # end mondrian
    totalTime = (time.perf_counter() - startTime) * 1000
    print(f"@Parameter : k={k}, l={l}, t={t}")
    print(f"The consumption of time : {round(totalTime)}ms")
    return result

In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

def encoder_and_scaler(table):
    """
    Turn a table into a model-ready feature matrix and label vector.

    All columns except the last are one-hot encoded with pd.get_dummies and
    then standardized with a freshly fitted StandardScaler. The last column is
    returned untouched as the labels.

    Returns (features, labels): the scaled feature array and the label values.
    """
    features = pd.get_dummies(table.iloc[:, :-1])
    labels = table.iloc[:, -1].values
    scaled_features = StandardScaler().fit_transform(features.values)
    return scaled_features, labels

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score
# Import models from sklearn
from sklearn.svm import SVC
from xgboost import XGBClassifier

def print_metrics(y_true, preds):
    """Print the accuracy of preds measured against y_true."""
    accuracy = accuracy_score(y_true, preds)
    print('Accuracy score: ', format(accuracy))

def train_and_predict(X,y):
    """
    Train an SVC and an XGBClassifier on a 75/25 split and print their accuracy.

    X: feature matrix
    y: labels

    The split uses random_state=5. Accuracy is printed first for the SVC, then
    for the XGBClassifier, followed by a separator line. Nothing is returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=5)

    # The two models under comparison, in reporting order.
    classifiers = [SVC(), XGBClassifier(verbosity = 0, use_label_encoder=False)]

    # Fit every model before predicting, and predict before printing anything.
    fitted = [classifier.fit(X_train, y_train) for classifier in classifiers]
    predictions = [classifier.predict(X_test) for classifier in fitted]

    for y_pred in predictions:
        print_metrics(y_test, y_pred)
    print("======================================")

### Main: configuration, anonymization and model comparison

In [13]:
# --- Sensitive attribute (SA) ---------------------------------------------
SA_NAME = "Rings"
SA_INDEX = 8  # column position of the sensitive attribute within a record
# Global list of SA classes that tClosenessChecker compares each partition against.
# NOTE(review): these are the values 1..29, but the Rings column has been
# rescaled to the labels 0..5 earlier in the notebook - confirm this list is
# still the intended reference.
SA_CLASS = list(range(1, 30))
SA_CLASS_NUM = len(SA_CLASS)

# --- Quasi-identifiers (QI) -----------------------------------------------
QI_NAME = ["Length", "Diameter", "Height", "Whole-weight", "Shucked-weight", "Viscera-weight", "Shell-weight"]
QI_INDEX = list(range(1, 8))  # column positions 1..7
QI_LEN = len(QI_INDEX)

# --- Privacy parameters (passed to mondrian as k, l and t) ------------------
INPUT_K = 1000
# NOTE(review): lDiversityChecker tests entropy >= log10(l). log10(0.8) is
# negative while the entropy is never negative, so with this value the
# l-diversity test cannot fail - confirm that is intended.
INPUT_L = 0.8
INPUT_T = 0.8

# --- Working state shared by the anonymization functions --------------------
"""
    RESULT : the collection of all partition
    QI_RANGE : attribute's maximum value - attribute's minimum value
    QI_DICT : To find out sorted value's index. ex : data=[3,7,8] -> QI_DICT={3:0, 7:1, 8:2}.
    QI_ORDER : the sorted data without repeat
"""
RESULT, QI_RANGE, QI_DICT, QI_ORDER = [], [], [], []

# Baseline: train and score both classifiers on the table before anonymization.
print("original table result : \n")
original_x, original_y = encoder_and_scaler(original_table)
train_and_predict(original_x, original_y)
            


# Anonymize the whole table (.loc[:,:] selects every row and column); mondrian
# returns a list of records whose QI columns hold generalized range strings.
result = mondrian(original_table.loc[:,:],INPUT_K, INPUT_L, INPUT_T)
# Rebuild a DataFrame from the records and repeat the same training/evaluation
# so the accuracy can be compared with the baseline printed above.
released_table = pd.DataFrame(result, columns=names)
released_x, released_y = encoder_and_scaler(released_table)
train_and_predict(released_x, released_y)

original table result : 

Accuracy score:  0.7119617224880382
Accuracy score:  0.6813397129186602
@Parameter : k=1000, l=0.8, t=0.8
The consumption of time : 169ms
Accuracy score:  0.6421052631578947
Accuracy score:  0.6421052631578947


In [14]:
# Rebuild the released table from the mondrian result (same construction as in
# the previous cell) and preview its first rows.
released_table = pd.DataFrame(result, columns=names)
released_table.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole-weight,Shucked-weight,Viscera-weight,Shell-weight,Rings
0,M,[0.075~0.515],[0.055~0.35],[0.0~0.18],[0.002~0.9225],[0.001~0.495],[0.0005~0.2075],[0.0015~0.3505],1
1,I,[0.075~0.515],[0.055~0.35],[0.0~0.18],[0.002~0.9225],[0.001~0.495],[0.0005~0.2075],[0.0015~0.3505],1
2,I,[0.075~0.515],[0.055~0.35],[0.0~0.18],[0.002~0.9225],[0.001~0.495],[0.0005~0.2075],[0.0015~0.3505],1
3,M,[0.075~0.515],[0.055~0.35],[0.0~0.18],[0.002~0.9225],[0.001~0.495],[0.0005~0.2075],[0.0015~0.3505],1
4,I,[0.075~0.515],[0.055~0.35],[0.0~0.18],[0.002~0.9225],[0.001~0.495],[0.0005~0.2075],[0.0015~0.3505],1


In [15]:
# Write the anonymized table to disk without the DataFrame index column.
# NOTE(review): the file name spells "anonmity"; it is left unchanged here so
# anything that already reads this file keeps working.
released_table.to_csv("k-anonmity+L-Diversity+T-closeness_abalone_data.csv", index=False)

In [16]:
# Side-by-side rerun: original features first, anonymized features second.
# Each call prints the SVC accuracy followed by the XGBClassifier accuracy.
train_and_predict(original_x, original_y)
train_and_predict(released_x, released_y)

Accuracy score:  0.7119617224880382
Accuracy score:  0.6813397129186602
Accuracy score:  0.6421052631578947
Accuracy score:  0.6421052631578947
