In [128]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import shutil

from imblearn.under_sampling import RandomUnderSampler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, cross_validate
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

from configparser import ConfigParser, BasicInterpolation, ExtendedInterpolation


def read_config():
	"""Load ``config.ini`` and publish its settings as module-level globals.

	Sets the path constants from the ``PATHS`` section, the boolean switches
	from the ``BOOLEANS`` section, ``QID_LIST`` (via ``get_qid``) and
	``POPULATION_DF`` (the ``train.csv`` file inside the input dataset folder).
	"""
	global OUTPUT_BASE_PATH, EXPERIMENT_STATS_PATH, HIERARCHIES_BASE_PATH, TRAIN_DF_PATH
	global MLBALANCE, PRIVACY_METRICS
	global QID_LIST, POPULATION_DF

	parser = ConfigParser(interpolation=ExtendedInterpolation())
	parser.read('config.ini')

	# PATHS
	path_options = parser['PATHS']
	OUTPUT_BASE_PATH = Path(path_options['output_base_path']).resolve()
	EXPERIMENT_STATS_PATH = Path(path_options['experiment_stats_path']).resolve()
	HIERARCHIES_BASE_PATH = Path(path_options['hierarchies_base_path']).resolve()
	input_dataset_dir = Path(path_options['input_dataset_path']).resolve()
	TRAIN_DF_PATH = input_dataset_dir / 'train.csv'

	# BOOLEANS
	MLBALANCE = parser.getboolean('BOOLEANS', 'mlbalance')
	PRIVACY_METRICS = parser.getboolean('BOOLEANS', 'privacy_metrics')

	# OTHER
	QID_LIST = get_qid(EXPERIMENT_STATS_PATH)
	# Every column is read as str: the generalisation step relies on string values.
	POPULATION_DF = pd.read_csv(TRAIN_DF_PATH, delimiter=';', decimal=',', dtype=str)



def certainty(population_df: pd.DataFrame, sample_df: pd.DataFrame, attributes: list) -> pd.Series:
	"""Return, per sample record, the sample/population size ratio of its equivalence class.

	Equivalence classes are the distinct value combinations of ``attributes``.
	Classes that do not occur in ``population_df`` get a ratio of 0. The
	result follows the row order of ``sample_df`` and is the merged column
	named ``0``.
	"""
	sizes_in_population = population_df.groupby(attributes).size()
	sizes_in_sample = sample_df.groupby(attributes).size()

	# Index-aligned division; a class missing on either side becomes NaN, then 0.
	ratio_per_class = (sizes_in_sample / sizes_in_population).fillna(0)
	class_lookup = ratio_per_class.reset_index(level=attributes)

	per_record = sample_df.merge(class_lookup, how='left', on=attributes)
	return per_record[0]


def journalist_risk(population_df: pd.DataFrame, sample_df: pd.DataFrame, attributes: list) -> pd.Series:
	"""Return, per sample record, ``1 / size`` of its equivalence class in the population.

	Equivalence classes are the distinct value combinations of ``attributes``.
	Records whose class does not occur in ``population_df`` get a risk of 0,
	as do fully suppressed records (every attribute equal to ``'*'``).

	The result follows the row order of ``sample_df``, carries a fresh
	0..n-1 index (from the merge) and is named ``0``.
	"""
	# Vectorised reciprocal of the class sizes; the unnamed Series becomes column 0.
	population_risk = (1 / population_df.groupby(attributes).size()).reset_index(level=attributes)

	merged = sample_df.merge(population_risk, on=attributes, how='left')
	# The risk column is the only non-key column of the right frame, hence the last one.
	risk = merged[merged.columns[-1]].fillna(0)

	# Positional boolean mask: the merge keeps the left row order but resets the
	# index, so the sample's index labels must not be used to address rows.
	suppressed = (sample_df[attributes] == '*').all(axis=1).to_numpy()
	return risk.mask(suppressed, 0)


def read_attributes(settingsPath: str, attributeType: str) -> list:
	"""Return the attribute names stored under ``attributeType`` in a settings CSV.

	The first row of column ``attributeType`` in the ``;``-delimited file is
	expected to hold a bracketed, comma-separated list such as ``[A, B, C]``.
	Whitespace around names is ignored and an empty list (``[]``) yields
	``[]`` rather than ``['']``.
	"""
	settings = pd.read_csv(settingsPath, delimiter=';')
	raw_list = settings[attributeType].iloc[0]
	# Drop the surrounding brackets, then split and discard empty fragments.
	inner = raw_list.strip()[1:-1]
	return [name.strip() for name in inner.split(',') if name.strip()]

def get_qid(experiment_stats_path) -> list:
	"""Return the quasi-identifier names from the ``QID`` column of the experiment stats CSV.

	The first row of the ``QID`` column in the ``;``-delimited file is
	expected to hold a bracketed, comma-separated list such as ``[A, B, C]``.
	Whitespace around names is ignored and an empty list (``[]``) yields
	``[]`` rather than ``['']``.
	"""
	experiment_stats = pd.read_csv(experiment_stats_path, delimiter=';')
	raw_list = experiment_stats["QID"].iloc[0]
	# Drop the surrounding brackets, then split and discard empty fragments.
	inner = raw_list.strip()[1:-1]
	return [name.strip() for name in inner.split(',') if name.strip()]

def drop_suppressed(df, qid):
	"""Drop fully suppressed records from ``df`` in place and return it with inferred dtypes.

	A record counts as suppressed when every column listed in ``qid`` equals
	``'*'``. Nothing is dropped when ``qid`` is ``None`` or empty, or when no
	record is suppressed.

	``df`` itself is mutated (rows removed); the returned frame is the result
	of ``df.infer_objects()``.
	"""
	if qid is not None and len(qid) > 0:
		# A boolean mask avoids the KeyError that a group lookup raises when
		# the all-'*' group does not exist.
		suppressed_mask = (df[qid] == '*').all(axis=1)
		df.drop(df.index[suppressed_mask], inplace=True)
	return df.infer_objects()

def get_generalised(dataset: pd.DataFrame, stats_file: Path) -> pd.DataFrame:
	"""Return a copy of ``dataset`` with every QID column mapped to its generalisation level.

	The levels are parsed from the first row of the ``node`` column of
	``stats_file`` (a bracketed, ``', '``-separated list of integers) and
	paired in order with ``QID_LIST``. For each pair, ``<qi>.csv`` under
	``HIERARCHIES_BASE_PATH`` is loaded without a header; the column at the
	given level, keyed by the file's first column, is used as value mapping.
	"""
	generalised = dataset.copy()
	stats = pd.read_csv(stats_file, sep=';', decimal=',')
	node_text = stats['node'].tolist()[0]
	levels = list(map(int, node_text[1:-1].split(', ')))

	for qi, level in zip(QID_LIST, levels):
		hierarchy_path = HIERARCHIES_BASE_PATH / f'{qi}.csv'
		# Hierarchy values are read as strings (dtype matters for the mapping).
		hierarchy = pd.read_csv(hierarchy_path, sep=';', decimal=',', header=None, dtype=str)
		keyed_hierarchy = hierarchy.set_index(hierarchy.columns[0], drop=False)
		generalised[qi] = generalised[qi].map(keyed_hierarchy[level])
	return generalised


def calculate_privacy_metrics(sample_strategy):
	"""Write certainty and journalist-risk distributions for every sample of a strategy.

	Expects the layout
	``OUTPUT_BASE_PATH / sample_strategy / <k> / <sample> / <sample>_sample.csv``.
	For each ``<k>`` folder the population is generalised once using
	``OUTPUT_BASE_PATH / 'kAnon' / <k> / 'stats.csv'``; next to each sample a
	``privacystats`` folder is recreated from scratch and filled with
	``certainty.csv`` and ``journalistRisk.csv``.
	"""
	sample_base_path = OUTPUT_BASE_PATH / sample_strategy
	# POPULATION_DF is read with dtype=str, so the QID columns of the sample
	# must be strings too; otherwise numeric-looking values never match.
	qid_dtypes = {qi: str for qi in QID_LIST}

	# Only directories are k-levels / sample folders; stray files are skipped.
	k_directories = sorted(d for d in sample_base_path.iterdir() if d.is_dir())

	for k_dir in k_directories:
		stats_file_path = OUTPUT_BASE_PATH / 'kAnon' / k_dir.name / 'stats.csv'
		sample_paths = [
			sample_dir / f'{sample_dir.name}_sample.csv'
			for sample_dir in sorted(k_dir.iterdir())
			if sample_dir.is_dir()
		]

		# Generalise once per k-level and reuse it for all samples of that level.
		population_df_generalized = get_generalised(POPULATION_DF, stats_file_path)
		for sample_path in sample_paths:
			privacystats_dir = sample_path.parent / 'privacystats'
			if privacystats_dir.exists():
				shutil.rmtree(privacystats_dir)
			privacystats_dir.mkdir(parents=True)

			sample_df = pd.read_csv(sample_path, delimiter=';', dtype=qid_dtypes)
			certainty_distribution = certainty(population_df_generalized, sample_df, QID_LIST)
			certainty_distribution.to_csv(privacystats_dir / 'certainty.csv', sep=';')

			journalist_risk_distribution = journalist_risk(population_df_generalized, sample_df, QID_LIST)
			journalist_risk_distribution.to_csv(privacystats_dir / 'journalistRisk.csv', sep=';')
	




In [129]:
def certainty(population_df: pd.DataFrame, sample_df: pd.DataFrame, attributes: list) -> pd.Series:
	"""Per sample record: size of its equivalence class in the sample divided by its size in the population.

	Redefines the ``certainty`` from the first cell with the same logic.
	Classes over ``attributes`` that are missing from ``population_df`` get 0;
	the returned Series is the merged column named ``0`` in ``sample_df`` row order.
	"""
	population_counts = population_df.groupby(attributes).size()
	sample_counts = sample_df.groupby(attributes).size()

	shares = sample_counts.div(population_counts)
	shares = shares.fillna(0)
	shares_table = shares.reset_index(level=attributes)

	annotated = pd.merge(sample_df, shares_table, on=attributes, how='left')
	return annotated[0]

In [130]:
def get_generalised(dataset: pd.DataFrame, stats_file: Path) -> pd.DataFrame:
	"""Debug variant of ``get_generalised``: prints the parsed levels and ``QID_LIST`` first.

	Returns a copy of ``dataset`` in which each QID column is mapped through
	the hierarchy column selected by the matching level from the ``node``
	entry of ``stats_file``.
	"""
	mapped = dataset.copy()
	node_text = pd.read_csv(stats_file, sep=';', decimal=',')['node'].tolist()[0]
	gen_levels = [int(token) for token in node_text[1:-1].split(', ')]
	# Debug output: levels and QIDs are paired positionally below.
	print(gen_levels)
	print(QID_LIST)

	for level, qi in zip(gen_levels, QID_LIST):
		# dtype=str: hierarchy values must be strings for the lookup
		hierarchy = pd.read_csv(HIERARCHIES_BASE_PATH/f'{qi}.csv', sep=';', decimal=',', header=None, dtype=str)
		lookup = hierarchy.set_index(hierarchy.columns[0], drop=False)[level]
		mapped[qi] = mapped[qi].map(lookup)
	return mapped

In [131]:
# Populate the module-level settings (paths, flags, QID_LIST, POPULATION_DF) from config.ini.
read_config()

In [132]:
# Generalise the population with the k5 stats file and load one k5 sample for comparison.
# NOTE(review): absolute, machine-specific paths. They follow the
# <base>/kAnon/<k>/stats.csv and <base>/<strategy>/<k>/<sample>/<sample>_sample.csv layout
# that calculate_privacy_metrics builds from OUTPUT_BASE_PATH — presumably they can be
# derived from OUTPUT_BASE_PATH; confirm against config.ini.
population_df_generalized = get_generalised(POPULATION_DF, r'C:\Users\tibol\Desktop\FIIW Tibo Laperre\fase 5 - thesis\thesis-projectV2\data\results\ACSIncome_USA_2018_binned_imbalanced_16645\kAnon\k5\stats.csv')
sample_df = pd.read_csv(r'C:\Users\tibol\Desktop\FIIW Tibo Laperre\fase 5 - thesis\thesis-projectV2\data\results\ACSIncome_USA_2018_binned_imbalanced_16645\SSample\k5\B(0.5)\B(0.5)_sample.csv', delimiter=';')
# Show the second generalised population record.
population_df_generalized.iloc[1]

[4, 1, 3, 0, 3, 1, 0, 1]
['AGEP', 'COW', 'SCHL', 'MAR', 'OCCP', 'POBP', 'SEX', 'RAC1P']


AGEP             [40, 80[
COW        Non-Government
SCHL     Higher education
MAR               Married
OCCP         Nontechnical
POBP        North America
RELP     Reference person
WKHP                 70.0
SEX                Female
RAC1P               White
PINCP      [20000-100000[
Name: 1, dtype: object

In [151]:
# Display the loaded sample; fully suppressed records show '*' in the QID columns.
sample_df

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,*,*,*,*,*,*,Reference person,16.0,*,*,[20000-100000[
1,"[40, 80[",Non-Government,Higher education,Married,Nontechnical,North America,Reference person,70.0,Female,White,[20000-100000[
2,"[40, 80[",Non-Government,Higher education,Married,Nontechnical,North America,Reference person,60.0,Male,White,[100000-inf[
3,"[40, 80[",Non-Government,Higher education,Married,Nontechnical,North America,Reference person,40.0,Male,White,[20000-100000[
4,"[0, 40[",Non-Government,Secondary education,Married,Other,North America,Son-in-law or daughter-in-law,20.0,Male,White,[0-20000[
...,...,...,...,...,...,...,...,...,...,...,...
6653,*,*,*,*,*,*,Husband/wife,40.0,*,*,[20000-100000[
6654,"[40, 80[",Non-Government,Secondary education,Married,Nontechnical,North America,Reference person,50.0,Male,White,[20000-100000[
6655,"[0, 40[",Non-Government,Higher education,Never married or under 15 years old,Nontechnical,North America,Noninstitutionalized group quarters population,20.0,Female,White,[0-20000[
6656,"[40, 80[",Non-Government,Higher education,Married,Nontechnical,Asia,Husband/wife,45.0,Female,Asian-Pac-Islander,[20000-100000[


In [134]:
# Equivalence-class sizes of the generalised population over the QIDs, dumped to CSV for inspection.
population_eq = population_df_generalized.groupby(QID_LIST).size()
population_eq.to_csv('population_eq.csv', sep=';')

In [135]:
# Equivalence-class sizes of the k5 output file over the QIDs, dumped to CSV for inspection.
# NOTE(review): absolute, machine-specific path — presumably derivable from
# OUTPUT_BASE_PATH / 'kAnon' / 'k5'; confirm against config.ini.
masked_df = pd.read_csv(r'C:\Users\tibol\Desktop\FIIW Tibo Laperre\fase 5 - thesis\thesis-projectV2\data\results\ACSIncome_USA_2018_binned_imbalanced_16645\kAnon\k5\output_sample.csv', delimiter=';')
masked_eq = masked_df.groupby(QID_LIST).size()
masked_eq.to_csv('masked_eq.csv', sep=';')


In [136]:
# Equivalence-class sizes of the sample over the QIDs, dumped to CSV for inspection.
sample_eq = sample_df.groupby(QID_LIST).size()
sample_eq.to_csv('sample_eq.csv', sep=';')

In [137]:
# Same steps as certainty(): index-aligned ratio of sample to population class sizes.
# Classes missing on either side are NaN after the division and become 0 via fillna.
result = sample_eq.div(population_eq).fillna(0).reset_index(level=QID_LIST)
result

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,SEX,RAC1P,0
0,*,*,*,*,*,*,*,*,0.000000
1,"[0, 40[",Government,Higher education,Divorced,Nontechnical,North America,Female,Black,0.000000
2,"[0, 40[",Government,Higher education,Divorced,Nontechnical,North America,Female,Other,0.000000
3,"[0, 40[",Government,Higher education,Divorced,Nontechnical,North America,Female,White,0.461538
4,"[0, 40[",Government,Higher education,Divorced,Nontechnical,North America,Male,White,0.400000
...,...,...,...,...,...,...,...,...,...
1035,"[80, 100[",Non-Government,Secondary education,Never married or under 15 years old,Nontechnical,North America,Male,White,0.000000
1036,"[80, 100[",Non-Government,Secondary education,Widowed,Nontechnical,North America,Female,Black,0.000000
1037,"[80, 100[",Non-Government,Secondary education,Widowed,Nontechnical,North America,Female,White,0.000000
1038,"[80, 100[",Non-Government,Secondary education,Widowed,Nontechnical,North America,Male,White,0.000000


In [153]:
# Attach the per-class ratio (column 0) to every sample record.
result2 = sample_df.merge(result, on=QID_LIST, how='left')
result2

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP,0
0,*,*,*,*,*,*,Reference person,16.0,*,*,[20000-100000[,0.000000
1,"[40, 80[",Non-Government,Higher education,Married,Nontechnical,North America,Reference person,70.0,Female,White,[20000-100000[,0.509950
2,"[40, 80[",Non-Government,Higher education,Married,Nontechnical,North America,Reference person,60.0,Male,White,[100000-inf[,0.509554
3,"[40, 80[",Non-Government,Higher education,Married,Nontechnical,North America,Reference person,40.0,Male,White,[20000-100000[,0.509554
4,"[0, 40[",Non-Government,Secondary education,Married,Other,North America,Son-in-law or daughter-in-law,20.0,Male,White,[0-20000[,0.507937
...,...,...,...,...,...,...,...,...,...,...,...,...
6653,*,*,*,*,*,*,Husband/wife,40.0,*,*,[20000-100000[,0.000000
6654,"[40, 80[",Non-Government,Secondary education,Married,Nontechnical,North America,Reference person,50.0,Male,White,[20000-100000[,0.508108
6655,"[0, 40[",Non-Government,Higher education,Never married or under 15 years old,Nontechnical,North America,Noninstitutionalized group quarters population,20.0,Female,White,[0-20000[,0.508475
6656,"[40, 80[",Non-Government,Higher education,Married,Nontechnical,Asia,Husband/wife,45.0,Female,Asian-Pac-Islander,[20000-100000[,0.500000


In [158]:
# Attach the per-class ratio (column 0) to every generalised population record
# and dump that column to CSV for comparison with the sample-side result.
result3 = population_df_generalized.merge(result, on=QID_LIST, how='left')
result3[0].to_csv('cert_2.csv',sep=';', index=False)
result3


Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP,0
0,"[40, 80[",Non-Government,Higher education,Widowed,Other,South America,Reference person,16.0,Female,White,[20000-100000[,0.000000
1,"[40, 80[",Non-Government,Higher education,Married,Nontechnical,North America,Reference person,70.0,Female,White,[20000-100000[,0.509950
2,"[40, 80[",Non-Government,Higher education,Married,Nontechnical,North America,Reference person,60.0,Male,White,[100000-inf[,0.509554
3,"[40, 80[",Non-Government,Higher education,Married,Nontechnical,North America,Reference person,40.0,Male,White,[20000-100000[,0.509554
4,"[0, 40[",Non-Government,Secondary education,Married,Other,North America,Son-in-law or daughter-in-law,20.0,Male,White,[0-20000[,0.507937
...,...,...,...,...,...,...,...,...,...,...,...,...
13311,"[40, 80[",Non-Government,Higher education,Married,Nontechnical,Asia,Husband/wife,45.0,Female,Asian-Pac-Islander,[20000-100000[,0.500000
13312,"[40, 80[",Government,Higher education,Never married or under 15 years old,Nontechnical,North America,Reference person,40.0,Female,White,[20000-100000[,0.500000
13313,"[40, 80[",Non-Government,Higher education,Married,Nontechnical,North America,Reference person,50.0,Male,White,[100000-inf[,0.509554
13314,"[40, 80[",Non-Government,Higher education,Married,Technical,North America,Husband/wife,40.0,Male,White,[20000-100000[,0.507042


In [165]:
# Count the records with a non-zero ratio in the sample-side and population-side merges.
print(f'non-zero values for result2:  {(result2[0] != 0.0).sum()}')
print(f'non-zero values for result3:  {(result3[0] != 0.0).sum()}')

non-zero values for result2:  6058
non-zero values for result3:  12139


In [162]:
# Persist only the QID columns of the generalised population, without the index.
population_df_generalized.to_csv('population_df_generalized.csv', sep=';', index=False, columns=QID_LIST)