diff --git a/README.md b/README.md
index f9023fc..13f8623 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,7 @@
# python-synthpop
-Python implementation of the R package [synthpop](https://cran.r-project.org/web/packages/synthpop/index.html).
-
-```python-synthpop``` is an open-source library for synthetic data generation (SDG). The library includes robust implementations of Classification and Regression Trees (CART) and Gaussian Copula (GC) synthesizers, equipping users with an open-source python library to generate high-quality, privacy-preserving synthetic data.
+```python-synthpop``` is an open-source library for synthetic data generation (SDG). The library includes robust implementations of Classification and Regression Trees (CART) and Gaussian Copula (GC) synthesizers, equipping users with an open-source Python library to generate high-quality, privacy-preserving synthetic data. This library is a Python implementation of the CART method used in the R package [synthpop](https://cran.r-project.org/web/packages/synthpop/index.html).
Synthetic data is generated in six steps:
@@ -56,23 +54,25 @@ Out[2]:
### python-synthpop
-Using default parameters the six steps are applied on the Social Diagnosis example tot generate synthetic data. See also [link](./example_notebooks/00_readme.ipynb).
+Using default parameters, the six steps are applied to the Social Diagnosis example to generate synthetic data. See also the [example notebook](./example_notebooks/00_readme.ipynb).
```
In [1]: from synthpop import MissingDataHandler, DataProcessor, CARTMethod
In [2]: # 1. Initiate metadata
- metadata = MissingDataHandler()
+ md_handler = MissingDataHandler()
- # 1.1 Detect data types
- column_dtypes = metadata.get_column_dtypes(df)
- print("Column Data Types:", column_dtypes)
+ # 1.1 Get data types
+ metadata= md_handler.get_column_dtypes(df)
+ print("Column Data Types:", metadata)
Column Data Types: {'sex': 'categorical', 'age': 'numerical', 'marital': 'categorical', 'income': 'numerical', 'ls': 'categorical', 'smoke': 'categorical'}
-In [3]: # 2. Missing data
+In [3]: # 2. Process missing data
+ print("Missing data:")
print(df.isnull().sum())
+ Missing data:
sex 0
age 0
marital 9
@@ -82,17 +82,19 @@ In [3]: # 2. Missing data
dtype: int64
In [4]: # 2.1 Detect type of missingness
- missingness_dict = metadata.detect_missingness(df)
- print("Detected missingness yype:", missingness_dict)
+ missingness_dict = md_handler.detect_missingness(df)
+ print("Detected missingness type:", missingness_dict)
Detected missingness type: {'marital': 'MAR', 'income': 'MAR', 'ls': 'MAR', 'smoke': 'MAR'}
In [5]: # 2.2 Impute missing values
- df_imputed = metadata.apply_imputation(df, missingness_dict)
+ real_df = md_handler.apply_imputation(df, missingness_dict)
- print(df_imputed.isnull().sum())
+ print("Missing data:")
+ print(real_df.isnull().sum())
+ Missing data:
sex 0
age 0
marital 0
@@ -102,25 +104,73 @@ In [5]: # 2.2 Impute missing values
dtype: int64
-In [6]: # 3. Instantiate the DataProcessor with column types
- processor = DataProcessor(column_dtypes)
+In [6]: # 3. Preprocessing: Instantiate the DataProcessor with column_dtypes
+ processor = DataProcessor(metadata)
# 3.1 Preprocess the data: transforms raw data into a numerical format
- processed_data = processor.preprocess(df)
- print("Processed Data:")
+ processed_data = processor.preprocess(real_df)
+ print("Processed data:")
display(processed_data.head())
- Processed Data:
+ Processed data:
sex age marital income ls smoke
- 0 0 0.503625 3 -0.480608 4 0
- 1 1 -1.495187 4 -0.834521 3 0
- 2 0 -1.603231 4 NaN 4 0
- 3 0 1.638086 5 -0.401961 1 0
- 4 0 0.341559 3 0.069923 3 1
+ 0 0 0.503625 3 -0.517232 4 0
+ 1 1 -1.495187 4 -0.898113 3 0
+ 2 0 -1.603231 4 0.000000 4 0
+ 3 0 1.638086 5 -0.432591 1 0
+ 4 0 0.341559 3 0.075251 3 1
+
In [7]: # 4. Fit the CART method
cart = CARTMethod(metadata, smoothing=True, proper=True, minibucket=5, random_state=42)
cart.fit(processed_data)
+In [8]: # 4.1 Preview generated synthetic data
+ synthetic_processed = cart.sample(100)
+ print("Synthetic processed data:")
+ display(synthetic_processed.head())
+
+ Synthetic processed data:
+ sex age marital income ls smoke
+ 0 1 -1.087360 3 -1.201126 4 0
+ 1 1 -0.882289 3 1.182255 4 0
+ 2 0 1.449201 5 -0.255936 2 0
+ 3 0 0.890598 3 0.220739 4 1
+ 4 0 0.313502 3 1.395039 4 0
+
+In [9]: # 5. Postprocessing: back to the original format and preview of data
+ synthetic_df = processor.postprocess(synthetic_processed)
+ print("Synthetic data in original format:")
+ display(synthetic_df.head())
+
+ Synthetic data in original format:
+ sex age marital income ls smoke
+ 0 FEMALE 30.377064 SINGLE -8.000000 MOSTLY DISSATISFIED NO
+ 1 MALE 54.823585 MARRIED 1861.809802 PLEASED YES
+ 2 FEMALE 78.641244 MARRIED 771.239134 MOSTLY DISSATISFIED NO
+ 3 MALE 53.458122 MARRIED 1758.942347 PLEASED NO
+ 4 FEMALE 60.354551 SINGLE 1024.351794 PLEASED NO
+
+In [10]: from synthpop.metrics import (
+ MetricsReport,
+ EfficacyMetrics,
+ DisclosureProtection
+ )
+
+In [11]: # 6. Evaluate the synthetic data
+
+ # 6.1 Diagnostic report
+ report = MetricsReport(real_df, synthetic_df, metadata)
+ report_df = report.generate_report()
+ print("=== Diagnostic Report ===")
+ display(report_df)
+
+ column type missing_value_similarity range_coverage boundary_adherence ks_complement tv_complement statistic_similarity category_coverage category_adherence
+ 0 sex categorical 1.0 N/A N/A N/A 0.9764 N/A 1.0 1.0
+ 1 age numerical 1.0 0.94757 1.0 0.9142 N/A 0.962239 N/A N/A
+ 2 marital categorical 1.0 N/A N/A N/A 0.967 N/A 0.666667 1.0
+ 3 income numerical 1.0 0.408926 1.0 0.9056 N/A 0.948719 N/A N/A
+ 4 ls categorical 1.0 N/A N/A N/A 0.9224 N/A 0.857143 1.0
+ 5 smoke categorical 1.0 N/A N/A N/A 0.9754 N/A 1.0 1.0
```
\ No newline at end of file
diff --git a/example_notebooks/00_readme.ipynb b/example_notebooks/00_readme.ipynb
index 1f1be22..241334d 100644
--- a/example_notebooks/00_readme.ipynb
+++ b/example_notebooks/00_readme.ipynb
@@ -155,6 +155,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
+ "Missing data:\n",
"sex 0\n",
"age 0\n",
"marital 9\n",
@@ -166,7 +167,8 @@
}
],
"source": [
- "# 2. Missing data\n",
+ "# 2. Process missing data\n",
+ "print(\"Missing data:\")\n",
"print(df.isnull().sum())"
]
},
@@ -198,6 +200,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
+ "Missing data:\n",
"sex 0\n",
"age 0\n",
"marital 0\n",
@@ -210,9 +213,10 @@
],
"source": [
"# 2.2 Impute missing values\n",
- "df_imputed = md_handler.apply_imputation(df, missingness_dict)\n",
+ "real_df = md_handler.apply_imputation(df, missingness_dict)\n",
"\n",
- "print(df_imputed.isnull().sum())"
+ "print(\"Missing data:\")\n",
+ "print(real_df.isnull().sum())"
]
},
{
@@ -224,7 +228,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Processed Data:\n"
+ "Processed data:\n"
]
},
{
@@ -262,7 +266,7 @@
"
0 | \n",
" 0.503625 | \n",
" 3 | \n",
- " -0.480608 | \n",
+ " -0.517232 | \n",
" 4 | \n",
" 0 | \n",
" \n",
@@ -271,7 +275,7 @@
" 1 | \n",
" -1.495187 | \n",
" 4 | \n",
- " -0.834521 | \n",
+ " -0.898113 | \n",
" 3 | \n",
" 0 | \n",
" \n",
@@ -280,7 +284,7 @@
" 0 | \n",
" -1.603231 | \n",
" 4 | \n",
- " NaN | \n",
+ " 0.000000 | \n",
" 4 | \n",
" 0 | \n",
" \n",
@@ -289,7 +293,7 @@
" 0 | \n",
" 1.638086 | \n",
" 5 | \n",
- " -0.401961 | \n",
+ " -0.432591 | \n",
" 1 | \n",
" 0 | \n",
" \n",
@@ -298,7 +302,7 @@
" 0 | \n",
" 0.341559 | \n",
" 3 | \n",
- " 0.069923 | \n",
+ " 0.075251 | \n",
" 3 | \n",
" 1 | \n",
" \n",
@@ -308,11 +312,11 @@
],
"text/plain": [
" sex age marital income ls smoke\n",
- "0 0 0.503625 3 -0.480608 4 0\n",
- "1 1 -1.495187 4 -0.834521 3 0\n",
- "2 0 -1.603231 4 NaN 4 0\n",
- "3 0 1.638086 5 -0.401961 1 0\n",
- "4 0 0.341559 3 0.069923 3 1"
+ "0 0 0.503625 3 -0.517232 4 0\n",
+ "1 1 -1.495187 4 -0.898113 3 0\n",
+ "2 0 -1.603231 4 0.000000 4 0\n",
+ "3 0 1.638086 5 -0.432591 1 0\n",
+ "4 0 0.341559 3 0.075251 3 1"
]
},
"metadata": {},
@@ -320,12 +324,12 @@
}
],
"source": [
- "# 3. Instantiate the DataProcessor with column_dtypes\n",
+ "# 3. Preprocessing: Instantiate the DataProcessor with column_dtypes\n",
"processor = DataProcessor(metadata)\n",
"\n",
"# 3.1 Preprocess the data: transforms raw data into a numerical format\n",
- "processed_data = processor.preprocess(df)\n",
- "print(\"Processed Data:\")\n",
+ "processed_data = processor.preprocess(real_df)\n",
+ "print(\"Processed data:\")\n",
"display(processed_data.head())"
]
},
@@ -333,15 +337,7 @@
"cell_type": "code",
"execution_count": 9,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "ERROR:synthpop.method.cart:Error fitting model for column 'income': Input y contains NaN.\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# 4. Fit the CART method\n",
"cart = CARTMethod(metadata, smoothing=True, proper=True, minibucket=5, random_state=42)\n",
@@ -350,14 +346,14 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Synthetic Processed Data:\n"
+ "Synthetic processed data:\n"
]
},
{
@@ -384,6 +380,7 @@
" sex | \n",
" age | \n",
" marital | \n",
+ " income | \n",
" ls | \n",
" smoke | \n",
" \n",
@@ -391,42 +388,47 @@
" \n",
" \n",
" | 0 | \n",
- " 0 | \n",
- " -1.123252 | \n",
+ " 1 | \n",
+ " -0.716885 | \n",
" 4 | \n",
+ " -1.189097 | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
- " 0.704909 | \n",
+ " -1.066729 | \n",
" 3 | \n",
+ " 0.057878 | \n",
" 4 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0 | \n",
- " 1.583713 | \n",
- " 5 | \n",
+ " 1.552391 | \n",
" 3 | \n",
+ " -0.754037 | \n",
+ " 2 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 0 | \n",
- " -0.127991 | \n",
+ " 0.522026 | \n",
" 3 | \n",
+ " 0.337329 | \n",
" 4 | \n",
" 1 | \n",
"
\n",
" \n",
" | 4 | \n",
" 0 | \n",
- " 0.868010 | \n",
+ " 0.262577 | \n",
+ " 3 | \n",
+ " -1.179427 | \n",
" 3 | \n",
- " 4 | \n",
" 0 | \n",
"
\n",
" \n",
@@ -434,12 +436,12 @@
""
],
"text/plain": [
- " sex age marital ls smoke\n",
- "0 0 -1.123252 4 2 0\n",
- "1 1 0.704909 3 4 1\n",
- "2 0 1.583713 5 3 0\n",
- "3 0 -0.127991 3 4 1\n",
- "4 0 0.868010 3 4 0"
+ " sex age marital income ls smoke\n",
+ "0 1 -0.716885 4 -1.189097 2 0\n",
+ "1 1 -1.066729 3 0.057878 4 1\n",
+ "2 0 1.552391 3 -0.754037 2 0\n",
+ "3 0 0.522026 3 0.337329 4 1\n",
+ "4 0 0.262577 3 -1.179427 3 0"
]
},
"metadata": {},
@@ -449,41 +451,124 @@
"source": [
"# 4.1 Preview generated synthetic data\n",
"synthetic_processed = cart.sample(100)\n",
- "print(\"Synthetic Processed Data:\")\n",
+ "print(\"Synthetic processed data:\")\n",
"display(synthetic_processed.head())"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
- "ename": "KeyError",
- "evalue": "\"None of [Index(['income'], dtype='object')] are in the [columns]\"",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[0;32mIn[12], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# 4.2 Postprocess the synthetic data back to the original format\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m synthetic_data \u001b[38;5;241m=\u001b[39m \u001b[43mprocessor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpostprocess\u001b[49m\u001b[43m(\u001b[49m\u001b[43msynthetic_processed\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSynthetic Data in Original Format:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 4\u001b[0m display(synthetic_data\u001b[38;5;241m.\u001b[39mhead())\n",
- "File \u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/synthpop/processor/data_processor.py:90\u001b[0m, in \u001b[0;36mDataProcessor.postprocess\u001b[0;34m(self, synthetic_data)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m dtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnumerical\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscalers:\n\u001b[1;32m 89\u001b[0m scaler \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscalers[col]\n\u001b[0;32m---> 90\u001b[0m synthetic_data[col] \u001b[38;5;241m=\u001b[39m scaler\u001b[38;5;241m.\u001b[39minverse_transform(\u001b[43msynthetic_data\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[43mcol\u001b[49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m)\n\u001b[1;32m 92\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m dtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mboolean\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 93\u001b[0m synthetic_data[col] \u001b[38;5;241m=\u001b[39m synthetic_data[col]\u001b[38;5;241m.\u001b[39mround()\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mbool\u001b[39m)\n",
- "File \u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/pandas/core/frame.py:4108\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4106\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[1;32m 4107\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[0;32m-> 4108\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcolumns\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 4110\u001b[0m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[1;32m 4111\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n",
- "File \u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/pandas/core/indexes/base.py:6200\u001b[0m, in \u001b[0;36mIndex._get_indexer_strict\u001b[0;34m(self, key, axis_name)\u001b[0m\n\u001b[1;32m 6197\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 6198\u001b[0m keyarr, indexer, new_indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[0;32m-> 6200\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6202\u001b[0m keyarr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtake(indexer)\n\u001b[1;32m 6203\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[1;32m 6204\u001b[0m \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n",
- "File \u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/pandas/core/indexes/base.py:6249\u001b[0m, in \u001b[0;36mIndex._raise_if_missing\u001b[0;34m(self, key, indexer, axis_name)\u001b[0m\n\u001b[1;32m 6247\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m nmissing:\n\u001b[1;32m 6248\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m nmissing \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(indexer):\n\u001b[0;32m-> 6249\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m] are in the [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6251\u001b[0m not_found \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask\u001b[38;5;241m.\u001b[39mnonzero()[\u001b[38;5;241m0\u001b[39m]]\u001b[38;5;241m.\u001b[39munique())\n\u001b[1;32m 6252\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not in index\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
- "\u001b[0;31mKeyError\u001b[0m: \"None of [Index(['income'], dtype='object')] are in the [columns]\""
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Synthetic data in original format:\n"
]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sex | \n",
+ " age | \n",
+ " marital | \n",
+ " income | \n",
+ " ls | \n",
+ " smoke | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " MALE | \n",
+ " 34.407146 | \n",
+ " SINGLE | \n",
+ " 6.211859 | \n",
+ " MOSTLY DISSATISFIED | \n",
+ " NO | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " MALE | \n",
+ " 27.931190 | \n",
+ " MARRIED | \n",
+ " 1479.474304 | \n",
+ " PLEASED | \n",
+ " YES | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " FEMALE | \n",
+ " 76.413698 | \n",
+ " MARRIED | \n",
+ " 520.222172 | \n",
+ " MOSTLY DISSATISFIED | \n",
+ " NO | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " FEMALE | \n",
+ " 57.340625 | \n",
+ " MARRIED | \n",
+ " 1809.637340 | \n",
+ " PLEASED | \n",
+ " YES | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " FEMALE | \n",
+ " 52.537967 | \n",
+ " MARRIED | \n",
+ " 17.637157 | \n",
+ " MOSTLY SATISFIED | \n",
+ " NO | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sex age marital income ls smoke\n",
+ "0 MALE 34.407146 SINGLE 6.211859 MOSTLY DISSATISFIED NO\n",
+ "1 MALE 27.931190 MARRIED 1479.474304 PLEASED YES\n",
+ "2 FEMALE 76.413698 MARRIED 520.222172 MOSTLY DISSATISFIED NO\n",
+ "3 FEMALE 57.340625 MARRIED 1809.637340 PLEASED YES\n",
+ "4 FEMALE 52.537967 MARRIED 17.637157 MOSTLY SATISFIED NO"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
}
],
"source": [
- "# 4.2 Postprocess the synthetic data back to the original format and give preview of generated synthetic data\n",
- "synthetic_data = processor.postprocess(synthetic_processed)\n",
- "print(\"Synthetic Data in Original Format:\")\n",
- "display(synthetic_data.head())"
+ "# 5. Postprocessing: back to the original format and preview of data\n",
+ "synthetic_df = processor.postprocess(synthetic_processed)\n",
+ "print(\"Synthetic data in original format:\")\n",
+ "display(synthetic_df.head())"
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@@ -493,6 +578,259 @@
" DisclosureProtection\n",
")"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "=== Diagnostic Report ===\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " column | \n",
+ " type | \n",
+ " missing_value_similarity | \n",
+ " range_coverage | \n",
+ " boundary_adherence | \n",
+ " ks_complement | \n",
+ " tv_complement | \n",
+ " statistic_similarity | \n",
+ " category_coverage | \n",
+ " category_adherence | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " sex | \n",
+ " categorical | \n",
+ " 1.0 | \n",
+ " N/A | \n",
+ " N/A | \n",
+ " N/A | \n",
+ " 0.9764 | \n",
+ " N/A | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " age | \n",
+ " numerical | \n",
+ " 1.0 | \n",
+ " 0.94757 | \n",
+ " 1.0 | \n",
+ " 0.9142 | \n",
+ " N/A | \n",
+ " 0.962239 | \n",
+ " N/A | \n",
+ " N/A | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " marital | \n",
+ " categorical | \n",
+ " 1.0 | \n",
+ " N/A | \n",
+ " N/A | \n",
+ " N/A | \n",
+ " 0.967 | \n",
+ " N/A | \n",
+ " 0.666667 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " income | \n",
+ " numerical | \n",
+ " 1.0 | \n",
+ " 0.408926 | \n",
+ " 1.0 | \n",
+ " 0.9056 | \n",
+ " N/A | \n",
+ " 0.948719 | \n",
+ " N/A | \n",
+ " N/A | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " ls | \n",
+ " categorical | \n",
+ " 1.0 | \n",
+ " N/A | \n",
+ " N/A | \n",
+ " N/A | \n",
+ " 0.9224 | \n",
+ " N/A | \n",
+ " 0.857143 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " smoke | \n",
+ " categorical | \n",
+ " 1.0 | \n",
+ " N/A | \n",
+ " N/A | \n",
+ " N/A | \n",
+ " 0.9754 | \n",
+ " N/A | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " column type missing_value_similarity range_coverage \\\n",
+ "0 sex categorical 1.0 N/A \n",
+ "1 age numerical 1.0 0.94757 \n",
+ "2 marital categorical 1.0 N/A \n",
+ "3 income numerical 1.0 0.408926 \n",
+ "4 ls categorical 1.0 N/A \n",
+ "5 smoke categorical 1.0 N/A \n",
+ "\n",
+ " boundary_adherence ks_complement tv_complement statistic_similarity \\\n",
+ "0 N/A N/A 0.9764 N/A \n",
+ "1 1.0 0.9142 N/A 0.962239 \n",
+ "2 N/A N/A 0.967 N/A \n",
+ "3 1.0 0.9056 N/A 0.948719 \n",
+ "4 N/A N/A 0.9224 N/A \n",
+ "5 N/A N/A 0.9754 N/A \n",
+ "\n",
+ " category_coverage category_adherence \n",
+ "0 1.0 1.0 \n",
+ "1 N/A N/A \n",
+ "2 0.666667 1.0 \n",
+ "3 N/A N/A \n",
+ "4 0.857143 1.0 \n",
+ "5 1.0 1.0 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# 6. Evaluate the synthetic data\n",
+ "\n",
+ "# 6.1 Diagnostic report\n",
+ "report = MetricsReport(real_df, synthetic_df, metadata)\n",
+ "report_df = report.generate_report()\n",
+ "print(\"=== Diagnostic Report ===\")\n",
+ "display(report_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "ValueError",
+ "evalue": "could not convert string to float: 'MALE'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m/var/folders/v8/64zc34sn3b95klfm660085h80000gn/T/ipykernel_9335/3414886545.py\u001b[0m in \u001b[0;36m?\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# 6.2 Efficacy metrics\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# regression\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mreg_efficacy\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mEfficacyMetrics\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'regression'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget_column\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"income\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mreg_metrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreg_efficacy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreal_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msynthetic_df\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"=== Regression Efficacy Metrics ===\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreg_metrics\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/synthpop/metrics/efficacy_metrics.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self, real_df, synthetic_df)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 89\u001b[0m \u001b[0;31m# Model Training and Evaluation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtask\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'regression'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mLinearRegression\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 92\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_syn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_syn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 93\u001b[0m \u001b[0mpredictions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_real\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0mmse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmean_squared_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_real\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpredictions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[0mmae\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmean_absolute_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_real\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpredictions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/sklearn/base.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1470\u001b[0m skip_parameter_validation=(\n\u001b[1;32m 1471\u001b[0m \u001b[0mprefer_skip_nested_validation\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mglobal_skip_validation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1472\u001b[0m )\n\u001b[1;32m 1473\u001b[0m ):\n\u001b[0;32m-> 1474\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfit_method\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/sklearn/linear_model/_base.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 574\u001b[0m \u001b[0mn_jobs_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 575\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 576\u001b[0m \u001b[0maccept_sparse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpositive\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"csr\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"csc\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"coo\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 577\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 578\u001b[0;31m X, y = self._validate_data(\n\u001b[0m\u001b[1;32m 579\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maccept_sparse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_numeric\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmulti_output\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 580\u001b[0m )\n\u001b[1;32m 581\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/sklearn/base.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[1;32m 646\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"estimator\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcheck_y_params\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 647\u001b[0m \u001b[0mcheck_y_params\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mdefault_check_params\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 648\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"y\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 649\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 650\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 651\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 652\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 653\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m 
\u001b[0mno_val_X\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mcheck_params\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"ensure_2d\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[1;32m 1259\u001b[0m raise ValueError(\n\u001b[1;32m 1260\u001b[0m \u001b[0;34mf\"{estimator_name} requires y to be passed, but the target y is None\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1261\u001b[0m )\n\u001b[1;32m 1262\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1263\u001b[0;31m X = check_array(\n\u001b[0m\u001b[1;32m 1264\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1265\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maccept_sparse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1266\u001b[0m \u001b[0maccept_large_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maccept_large_sparse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[1;32m 994\u001b[0m )\n\u001b[1;32m 995\u001b[0m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mxp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 996\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 997\u001b[0m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_asarray_with_order\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxp\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mxp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 998\u001b[0;31m \u001b[0;32mexcept\u001b[0m \u001b[0mComplexWarning\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mcomplex_warning\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 999\u001b[0m raise ValueError(\n\u001b[1;32m 1000\u001b[0m \u001b[0;34m\"Complex data not supported\\n{}\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1001\u001b[0m ) from complex_warning\n",
+ "\u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/sklearn/utils/_array_api.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(array, dtype, order, copy, xp)\u001b[0m\n\u001b[1;32m 517\u001b[0m \u001b[0;31m# Use NumPy API to support order\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 518\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcopy\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 519\u001b[0m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 520\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 521\u001b[0;31m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 522\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 523\u001b[0m \u001b[0;31m# At this point array is a NumPy ndarray. We convert it to an array\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 524\u001b[0m \u001b[0;31m# container that is consistent with the input's namespace.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self, dtype, copy)\u001b[0m\n\u001b[1;32m 2149\u001b[0m def __array__(\n\u001b[1;32m 2150\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnpt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDTypeLike\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool_t\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2151\u001b[0m ) -> np.ndarray:\n\u001b[1;32m 2152\u001b[0m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2153\u001b[0;31m \u001b[0marr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2154\u001b[0m if (\n\u001b[1;32m 2155\u001b[0m \u001b[0mastype_is_view\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2156\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0musing_copy_on_write\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mValueError\u001b[0m: could not convert string to float: 'MALE'"
+ ]
+ }
+ ],
+ "source": [
+ "# 6.2 Efficacy metrics\n",
+ "\n",
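+    "# regression ('income' as target). FIXME: the saved output of this cell is a ValueError (could not convert string to float: 'MALE')\n",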
+ "reg_efficacy = EfficacyMetrics(task='regression', target_column=\"income\")\n",
+ "reg_metrics = reg_efficacy.evaluate(real_df, synthetic_df)\n",
+ "print(\"=== Regression Efficacy Metrics ===\")\n",
+ "print(reg_metrics)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "=== Classification Efficacy Metrics ===\n",
+ "{'accuracy': 0.6392, 'f1_score': 0.6481509447474609}\n"
+ ]
+ }
+ ],
+ "source": [
+    "# classification ('smoke' as target)\n",
+ "clf_efficacy = EfficacyMetrics(task='classification', target_column=\"smoke\")\n",
+ "clf_metrics = clf_efficacy.evaluate(real_df, synthetic_df)\n",
+ "print(\"\\n=== Classification Efficacy Metrics ===\")\n",
+ "print(clf_metrics)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "=== Disclosure Protection ===\n",
+ "Score: 1.000\n",
+ "Detailed Report: {'threshold': 0.0, 'risk_rate': 0.0, 'disclosure_protection_score': 1.0}\n"
+ ]
+ }
+ ],
+ "source": [
+    "# privacy: disclosure protection score and detailed report\n",
+ "dp = DisclosureProtection(real_df, synthetic_df)\n",
+ "dp_score = dp.score()\n",
+ "dp_report = dp.report()\n",
+ "\n",
+ "print(\"\\n=== Disclosure Protection ===\")\n",
+ "print(f\"Score: {dp_score:.3f}\")\n",
+ "print(\"Detailed Report:\", dp_report)"
+ ]
}
],
"metadata": {