diff --git a/README.md b/README.md index f9023fc..13f8623 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,7 @@ # python-synthpop -Python implementation of the R package [synthpop](https://cran.r-project.org/web/packages/synthpop/index.html). - -```python-synthpop``` is an open-source library for synthetic data generation (SDG). The library includes robust implementations of Classification and Regression Trees (CART) and Gaussian Copula (GC) synthesizers, equipping users with an open-source python library to generate high-quality, privacy-preserving synthetic data. +```python-synthpop``` is an open-source library for synthetic data generation (SDG). The library includes robust implementations of Classification and Regression Trees (CART) and Gaussian Copula (GC) synthesizers, equipping users with an open-source Python library to generate high-quality, privacy-preserving synthetic data. This library is a Python implementation of the CART method used in the R package [synthpop](https://cran.r-project.org/web/packages/synthpop/index.html). Synthetic data is generated in six steps: @@ -56,23 +54,25 @@ Out[2]: ### python-synthpop -Using default parameters the six steps are applied on the Social Diagnosis example tot generate synthetic data. See also [link](./example_notebooks/00_readme.ipynb). +Using default parameters, the six steps are applied to the Social Diagnosis example to generate synthetic data. See also [link](./example_notebooks/00_readme.ipynb). ``` In [1]: from synthpop import MissingDataHandler, DataProcessor, CARTMethod In [2]: # 1. 
Initiate metadata - metadata = MissingDataHandler() + md_handler = MissingDataHandler() - # 1.1 Detect data types - column_dtypes = metadata.get_column_dtypes(df) - print("Column Data Types:", column_dtypes) + # 1.1 Get data types + metadata= md_handler.get_column_dtypes(df) + print("Column Data Types:", metadata) Column Data Types: {'sex': 'categorical', 'age': 'numerical', 'marital': 'categorical', 'income': 'numerical', 'ls': 'categorical', 'smoke': 'categorical'} -In [3]: # 2. Missing data +In [3]: # 2. Process missing data + print("Missing data:") print(df.isnull().sum()) + Missing data: sex 0 age 0 marital 9 @@ -82,17 +82,19 @@ In [3]: # 2. Missing data dtype: int64 In [4]: # 2.1 Detect type of missingness - missingness_dict = metadata.detect_missingness(df) - print("Detected missingness yype:", missingness_dict) + missingness_dict = md_handler.detect_missingness(df) + print("Detected missingness type:", missingness_dict) Detected missingness type: {'marital': 'MAR', 'income': 'MAR', 'ls': 'MAR', 'smoke': 'MAR'} In [5]: # 2.2 Impute missing values - df_imputed = metadata.apply_imputation(df, missingness_dict) + real_df = md_handler.apply_imputation(df, missingness_dict) - print(df_imputed.isnull().sum()) + print("Missing data:") + print(real_df.isnull().sum()) + Missing data: sex 0 age 0 marital 0 @@ -102,25 +104,73 @@ In [5]: # 2.2 Impute missing values dtype: int64 -In [6]: # 3. Instantiate the DataProcessor with column types - processor = DataProcessor(column_dtypes) +In [6]: # 3. 
Preprocessing: Instantiate the DataProcessor with column_dtypes + processor = DataProcessor(metadata) # 3.1 Preprocess the data: transforms raw data into a numerical format - processed_data = processor.preprocess(df) - print("Processed Data:") + processed_data = processor.preprocess(real_df) + print("Processed data:") display(processed_data.head()) - Processed Data: + Processed data: sex age marital income ls smoke - 0 0 0.503625 3 -0.480608 4 0 - 1 1 -1.495187 4 -0.834521 3 0 - 2 0 -1.603231 4 NaN 4 0 - 3 0 1.638086 5 -0.401961 1 0 - 4 0 0.341559 3 0.069923 3 1 + 0 0 0.503625 3 -0.517232 4 0 + 1 1 -1.495187 4 -0.898113 3 0 + 2 0 -1.603231 4 0.000000 4 0 + 3 0 1.638086 5 -0.432591 1 0 + 4 0 0.341559 3 0.075251 3 1 + In [7]: # 4. Fit the CART method cart = CARTMethod(metadata, smoothing=True, proper=True, minibucket=5, random_state=42) cart.fit(processed_data) +In [8]: # 4.1 Preview generated synthetic data + synthetic_processed = cart.sample(100) + print("Synthetic processed data:") + display(synthetic_processed.head()) + + Synthetic processed data: + sex age marital income ls smoke + 0 1 -1.087360 3 -1.201126 4 0 + 1 1 -0.882289 3 1.182255 4 0 + 2 0 1.449201 5 -0.255936 2 0 + 3 0 0.890598 3 0.220739 4 1 + 4 0 0.313502 3 1.395039 4 0 + +In [9]: # 5. Postprocessing: back to the original format and preview of data + synthetic_df = processor.postprocess(synthetic_processed) + print("Synthetic data in original format:") + display(synthetic_df.head()) + + Synthetic data in original format: + sex age marital income ls smoke + 0 FEMALE 30.377064 SINGLE -8.000000 MOSTLY DISSATISFIED NO + 1 MALE 54.823585 MARRIED 1861.809802 PLEASED YES + 2 FEMALE 78.641244 MARRIED 771.239134 MOSTLY DISSATISFIED NO + 3 MALE 53.458122 MARRIED 1758.942347 PLEASED NO + 4 FEMALE 60.354551 SINGLE 1024.351794 PLEASED NO + +In [10]: from synthpop.metrics import ( + MetricsReport, + EfficacyMetrics, + DisclosureProtection + ) + +In [11]: # 6. 
Evaluate the synthetic data + + # 6.1 Diagnostic report + report = MetricsReport(real_df, synthetic_df, metadata) + report_df = report.generate_report() + print("=== Diagnostic Report ===") + display(report_df) + + column type missing_value_similarity range_coverage boundary_adherence ks_complement tv_complement statistic_similarity category_coverage category_adherence + 0 sex categorical 1.0 N/A N/A N/A 0.9764 N/A 1.0 1.0 + 1 age numerical 1.0 0.94757 1.0 0.9142 N/A 0.962239 N/A N/A + 2 marital categorical 1.0 N/A N/A N/A 0.967 N/A 0.666667 1.0 + 3 income numerical 1.0 0.408926 1.0 0.9056 N/A 0.948719 N/A N/A + 4 ls categorical 1.0 N/A N/A N/A 0.9224 N/A 0.857143 1.0 + 5 smoke categorical 1.0 N/A N/A N/A 0.9754 N/A 1.0 1.0 ``` \ No newline at end of file diff --git a/example_notebooks/00_readme.ipynb b/example_notebooks/00_readme.ipynb index 1f1be22..241334d 100644 --- a/example_notebooks/00_readme.ipynb +++ b/example_notebooks/00_readme.ipynb @@ -155,6 +155,7 @@ "name": "stdout", "output_type": "stream", "text": [ + "Missing data:\n", "sex 0\n", "age 0\n", "marital 9\n", @@ -166,7 +167,8 @@ } ], "source": [ - "# 2. Missing data\n", + "# 2. 
Process missing data\n", + "print(\"Missing data:\")\n", "print(df.isnull().sum())" ] }, @@ -198,6 +200,7 @@ "name": "stdout", "output_type": "stream", "text": [ + "Missing data:\n", "sex 0\n", "age 0\n", "marital 0\n", @@ -210,9 +213,10 @@ ], "source": [ "# 2.2 Impute missing values\n", - "df_imputed = md_handler.apply_imputation(df, missingness_dict)\n", + "real_df = md_handler.apply_imputation(df, missingness_dict)\n", "\n", - "print(df_imputed.isnull().sum())" + "print(\"Missing data:\")\n", + "print(real_df.isnull().sum())" ] }, { @@ -224,7 +228,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processed Data:\n" + "Processed data:\n" ] }, { @@ -262,7 +266,7 @@ " 0\n", " 0.503625\n", " 3\n", - " -0.480608\n", + " -0.517232\n", " 4\n", " 0\n", " \n", @@ -271,7 +275,7 @@ " 1\n", " -1.495187\n", " 4\n", - " -0.834521\n", + " -0.898113\n", " 3\n", " 0\n", " \n", @@ -280,7 +284,7 @@ " 0\n", " -1.603231\n", " 4\n", - " NaN\n", + " 0.000000\n", " 4\n", " 0\n", " \n", @@ -289,7 +293,7 @@ " 0\n", " 1.638086\n", " 5\n", - " -0.401961\n", + " -0.432591\n", " 1\n", " 0\n", " \n", @@ -298,7 +302,7 @@ " 0\n", " 0.341559\n", " 3\n", - " 0.069923\n", + " 0.075251\n", " 3\n", " 1\n", " \n", @@ -308,11 +312,11 @@ ], "text/plain": [ " sex age marital income ls smoke\n", - "0 0 0.503625 3 -0.480608 4 0\n", - "1 1 -1.495187 4 -0.834521 3 0\n", - "2 0 -1.603231 4 NaN 4 0\n", - "3 0 1.638086 5 -0.401961 1 0\n", - "4 0 0.341559 3 0.069923 3 1" + "0 0 0.503625 3 -0.517232 4 0\n", + "1 1 -1.495187 4 -0.898113 3 0\n", + "2 0 -1.603231 4 0.000000 4 0\n", + "3 0 1.638086 5 -0.432591 1 0\n", + "4 0 0.341559 3 0.075251 3 1" ] }, "metadata": {}, @@ -320,12 +324,12 @@ } ], "source": [ - "# 3. Instantiate the DataProcessor with column_dtypes\n", + "# 3. 
Preprocessing: Instantiate the DataProcessor with column_dtypes\n", "processor = DataProcessor(metadata)\n", "\n", "# 3.1 Preprocess the data: transforms raw data into a numerical format\n", - "processed_data = processor.preprocess(df)\n", - "print(\"Processed Data:\")\n", + "processed_data = processor.preprocess(real_df)\n", + "print(\"Processed data:\")\n", "display(processed_data.head())" ] }, @@ -333,15 +337,7 @@ "cell_type": "code", "execution_count": 9, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ERROR:synthpop.method.cart:Error fitting model for column 'income': Input y contains NaN.\n" - ] - } - ], + "outputs": [], "source": [ "# 4. Fit the CART method\n", "cart = CARTMethod(metadata, smoothing=True, proper=True, minibucket=5, random_state=42)\n", @@ -350,14 +346,14 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Synthetic Processed Data:\n" + "Synthetic processed data:\n" ] }, { @@ -384,6 +380,7 @@ " sex\n", " age\n", " marital\n", + " income\n", " ls\n", " smoke\n", " \n", @@ -391,42 +388,47 @@ " \n", " \n", " 0\n", - " 0\n", - " -1.123252\n", + " 1\n", + " -0.716885\n", " 4\n", + " -1.189097\n", " 2\n", " 0\n", " \n", " \n", " 1\n", " 1\n", - " 0.704909\n", + " -1.066729\n", " 3\n", + " 0.057878\n", " 4\n", " 1\n", " \n", " \n", " 2\n", " 0\n", - " 1.583713\n", - " 5\n", + " 1.552391\n", " 3\n", + " -0.754037\n", + " 2\n", " 0\n", " \n", " \n", " 3\n", " 0\n", - " -0.127991\n", + " 0.522026\n", " 3\n", + " 0.337329\n", " 4\n", " 1\n", " \n", " \n", " 4\n", " 0\n", - " 0.868010\n", + " 0.262577\n", + " 3\n", + " -1.179427\n", " 3\n", - " 4\n", " 0\n", " \n", " \n", @@ -434,12 +436,12 @@ "" ], "text/plain": [ - " sex age marital ls smoke\n", - "0 0 -1.123252 4 2 0\n", - "1 1 0.704909 3 4 1\n", - "2 0 1.583713 5 3 0\n", - "3 0 -0.127991 3 4 1\n", - "4 0 0.868010 3 4 0" + " sex age marital 
income ls smoke\n", + "0 1 -0.716885 4 -1.189097 2 0\n", + "1 1 -1.066729 3 0.057878 4 1\n", + "2 0 1.552391 3 -0.754037 2 0\n", + "3 0 0.522026 3 0.337329 4 1\n", + "4 0 0.262577 3 -1.179427 3 0" ] }, "metadata": {}, @@ -449,41 +451,124 @@ "source": [ "# 4.1 Preview generated synthetic data\n", "synthetic_processed = cart.sample(100)\n", - "print(\"Synthetic Processed Data:\")\n", + "print(\"Synthetic processed data:\")\n", "display(synthetic_processed.head())" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [ { - "ename": "KeyError", - "evalue": "\"None of [Index(['income'], dtype='object')] are in the [columns]\"", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[12], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# 4.2 Postprocess the synthetic data back to the original format\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m synthetic_data \u001b[38;5;241m=\u001b[39m \u001b[43mprocessor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpostprocess\u001b[49m\u001b[43m(\u001b[49m\u001b[43msynthetic_processed\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSynthetic Data in Original Format:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 4\u001b[0m display(synthetic_data\u001b[38;5;241m.\u001b[39mhead())\n", - "File \u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/synthpop/processor/data_processor.py:90\u001b[0m, in \u001b[0;36mDataProcessor.postprocess\u001b[0;34m(self, synthetic_data)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m dtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnumerical\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m col 
\u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscalers:\n\u001b[1;32m 89\u001b[0m scaler \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscalers[col]\n\u001b[0;32m---> 90\u001b[0m synthetic_data[col] \u001b[38;5;241m=\u001b[39m scaler\u001b[38;5;241m.\u001b[39minverse_transform(\u001b[43msynthetic_data\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[43mcol\u001b[49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m)\n\u001b[1;32m 92\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m dtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mboolean\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 93\u001b[0m synthetic_data[col] \u001b[38;5;241m=\u001b[39m synthetic_data[col]\u001b[38;5;241m.\u001b[39mround()\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mbool\u001b[39m)\n", - "File \u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/pandas/core/frame.py:4108\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4106\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[1;32m 4107\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[0;32m-> 4108\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcolumns\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 4110\u001b[0m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[1;32m 4111\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, 
\u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n", - "File \u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/pandas/core/indexes/base.py:6200\u001b[0m, in \u001b[0;36mIndex._get_indexer_strict\u001b[0;34m(self, key, axis_name)\u001b[0m\n\u001b[1;32m 6197\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 6198\u001b[0m keyarr, indexer, new_indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[0;32m-> 6200\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6202\u001b[0m keyarr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtake(indexer)\n\u001b[1;32m 6203\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[1;32m 6204\u001b[0m \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n", - "File \u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/pandas/core/indexes/base.py:6249\u001b[0m, in \u001b[0;36mIndex._raise_if_missing\u001b[0;34m(self, key, indexer, axis_name)\u001b[0m\n\u001b[1;32m 6247\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m nmissing:\n\u001b[1;32m 6248\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m nmissing \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(indexer):\n\u001b[0;32m-> 6249\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m] are in the 
[\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6251\u001b[0m not_found \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask\u001b[38;5;241m.\u001b[39mnonzero()[\u001b[38;5;241m0\u001b[39m]]\u001b[38;5;241m.\u001b[39munique())\n\u001b[1;32m 6252\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not in index\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[0;31mKeyError\u001b[0m: \"None of [Index(['income'], dtype='object')] are in the [columns]\"" + "name": "stdout", + "output_type": "stream", + "text": [ + "Synthetic data in original format:\n" ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sexagemaritalincomelssmoke
0MALE34.407146SINGLE6.211859MOSTLY DISSATISFIEDNO
1MALE27.931190MARRIED1479.474304PLEASEDYES
2FEMALE76.413698MARRIED520.222172MOSTLY DISSATISFIEDNO
3FEMALE57.340625MARRIED1809.637340PLEASEDYES
4FEMALE52.537967MARRIED17.637157MOSTLY SATISFIEDNO
\n", + "
" + ], + "text/plain": [ + " sex age marital income ls smoke\n", + "0 MALE 34.407146 SINGLE 6.211859 MOSTLY DISSATISFIED NO\n", + "1 MALE 27.931190 MARRIED 1479.474304 PLEASED YES\n", + "2 FEMALE 76.413698 MARRIED 520.222172 MOSTLY DISSATISFIED NO\n", + "3 FEMALE 57.340625 MARRIED 1809.637340 PLEASED YES\n", + "4 FEMALE 52.537967 MARRIED 17.637157 MOSTLY SATISFIED NO" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "# 4.2 Postprocess the synthetic data back to the original format and give preview of generated synthetic data\n", - "synthetic_data = processor.postprocess(synthetic_processed)\n", - "print(\"Synthetic Data in Original Format:\")\n", - "display(synthetic_data.head())" + "# 5. Postprocessing: back to the original format and preview of data\n", + "synthetic_df = processor.postprocess(synthetic_processed)\n", + "print(\"Synthetic data in original format:\")\n", + "display(synthetic_df.head())" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -493,6 +578,259 @@ " DisclosureProtection\n", ")" ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Diagnostic Report ===\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
columntypemissing_value_similarityrange_coverageboundary_adherenceks_complementtv_complementstatistic_similaritycategory_coveragecategory_adherence
0sexcategorical1.0N/AN/AN/A0.9764N/A1.01.0
1agenumerical1.00.947571.00.9142N/A0.962239N/AN/A
2maritalcategorical1.0N/AN/AN/A0.967N/A0.6666671.0
3incomenumerical1.00.4089261.00.9056N/A0.948719N/AN/A
4lscategorical1.0N/AN/AN/A0.9224N/A0.8571431.0
5smokecategorical1.0N/AN/AN/A0.9754N/A1.01.0
\n", + "
" + ], + "text/plain": [ + " column type missing_value_similarity range_coverage \\\n", + "0 sex categorical 1.0 N/A \n", + "1 age numerical 1.0 0.94757 \n", + "2 marital categorical 1.0 N/A \n", + "3 income numerical 1.0 0.408926 \n", + "4 ls categorical 1.0 N/A \n", + "5 smoke categorical 1.0 N/A \n", + "\n", + " boundary_adherence ks_complement tv_complement statistic_similarity \\\n", + "0 N/A N/A 0.9764 N/A \n", + "1 1.0 0.9142 N/A 0.962239 \n", + "2 N/A N/A 0.967 N/A \n", + "3 1.0 0.9056 N/A 0.948719 \n", + "4 N/A N/A 0.9224 N/A \n", + "5 N/A N/A 0.9754 N/A \n", + "\n", + " category_coverage category_adherence \n", + "0 1.0 1.0 \n", + "1 N/A N/A \n", + "2 0.666667 1.0 \n", + "3 N/A N/A \n", + "4 0.857143 1.0 \n", + "5 1.0 1.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# 6. Evaluate the synthetic data\n", + "\n", + "# 6.1 Diagnostic report\n", + "report = MetricsReport(real_df, synthetic_df, metadata)\n", + "report_df = report.generate_report()\n", + "print(\"=== Diagnostic Report ===\")\n", + "display(report_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "could not convert string to float: 'MALE'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/var/folders/v8/64zc34sn3b95klfm660085h80000gn/T/ipykernel_9335/3414886545.py\u001b[0m in \u001b[0;36m?\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# 6.2 Efficacy metrics\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# regression\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mreg_efficacy\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mEfficacyMetrics\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'regression'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget_column\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"income\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mreg_metrics\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreg_efficacy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreal_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msynthetic_df\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"=== Regression Efficacy Metrics ===\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreg_metrics\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/synthpop/metrics/efficacy_metrics.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self, real_df, synthetic_df)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 89\u001b[0m \u001b[0;31m# Model Training and Evaluation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtask\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'regression'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mLinearRegression\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 92\u001b[0;31m 
\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_syn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_syn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 93\u001b[0m \u001b[0mpredictions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_real\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0mmse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmean_squared_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_real\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpredictions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[0mmae\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmean_absolute_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_real\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpredictions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/sklearn/base.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1470\u001b[0m skip_parameter_validation=(\n\u001b[1;32m 1471\u001b[0m \u001b[0mprefer_skip_nested_validation\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mglobal_skip_validation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1472\u001b[0m )\n\u001b[1;32m 1473\u001b[0m ):\n\u001b[0;32m-> 1474\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfit_method\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/sklearn/linear_model/_base.py\u001b[0m in 
\u001b[0;36m?\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 574\u001b[0m \u001b[0mn_jobs_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 575\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 576\u001b[0m \u001b[0maccept_sparse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpositive\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"csr\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"csc\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"coo\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 577\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 578\u001b[0;31m X, y = self._validate_data(\n\u001b[0m\u001b[1;32m 579\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maccept_sparse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_numeric\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmulti_output\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 580\u001b[0m )\n\u001b[1;32m 581\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/sklearn/base.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[1;32m 646\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"estimator\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcheck_y_params\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 647\u001b[0m \u001b[0mcheck_y_params\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;34m{\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mdefault_check_params\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 648\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"y\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 649\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 650\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 651\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 652\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 653\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mno_val_X\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mcheck_params\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"ensure_2d\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, 
ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[1;32m 1259\u001b[0m raise ValueError(\n\u001b[1;32m 1260\u001b[0m \u001b[0;34mf\"{estimator_name} requires y to be passed, but the target y is None\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1261\u001b[0m )\n\u001b[1;32m 1262\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1263\u001b[0;31m X = check_array(\n\u001b[0m\u001b[1;32m 1264\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1265\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maccept_sparse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1266\u001b[0m \u001b[0maccept_large_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maccept_large_sparse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[1;32m 994\u001b[0m )\n\u001b[1;32m 995\u001b[0m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mxp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 996\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 997\u001b[0m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0m_asarray_with_order\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxp\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mxp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 998\u001b[0;31m \u001b[0;32mexcept\u001b[0m \u001b[0mComplexWarning\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mcomplex_warning\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 999\u001b[0m raise ValueError(\n\u001b[1;32m 1000\u001b[0m \u001b[0;34m\"Complex data not supported\\n{}\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1001\u001b[0m ) from complex_warning\n", + "\u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/sklearn/utils/_array_api.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(array, dtype, order, copy, xp)\u001b[0m\n\u001b[1;32m 517\u001b[0m \u001b[0;31m# Use NumPy API to support order\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 518\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcopy\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 519\u001b[0m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 520\u001b[0m 
\u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 521\u001b[0;31m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 522\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 523\u001b[0m \u001b[0;31m# At this point array is a NumPy ndarray. We convert it to an array\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 524\u001b[0m \u001b[0;31m# container that is consistent with the input's namespace.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self, dtype, copy)\u001b[0m\n\u001b[1;32m 2149\u001b[0m def __array__(\n\u001b[1;32m 2150\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnpt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDTypeLike\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool_t\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2151\u001b[0m ) -> np.ndarray:\n\u001b[1;32m 2152\u001b[0m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2153\u001b[0;31m \u001b[0marr\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2154\u001b[0m if (\n\u001b[1;32m 2155\u001b[0m \u001b[0mastype_is_view\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2156\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0musing_copy_on_write\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: could not convert string to float: 'MALE'" + ] + } + ], + "source": [ + "# 6.2 Efficacy metrics\n", + "\n", + "# regression\n", + "reg_efficacy = EfficacyMetrics(task='regression', target_column=\"income\")\n", + "reg_metrics = reg_efficacy.evaluate(real_df, synthetic_df)\n", + "print(\"=== Regression Efficacy Metrics ===\")\n", + "print(reg_metrics)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Classification Efficacy Metrics ===\n", + "{'accuracy': 0.6392, 'f1_score': 0.6481509447474609}\n" + ] + } + ], + "source": [ + "# classification\n", + "clf_efficacy = EfficacyMetrics(task='classification', target_column=\"smoke\")\n", + "clf_metrics = clf_efficacy.evaluate(real_df, synthetic_df)\n", + "print(\"\\n=== Classification Efficacy Metrics ===\")\n", + "print(clf_metrics)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Disclosure Protection ===\n", + "Score: 1.000\n", + 
"Detailed Report: {'threshold': 0.0, 'risk_rate': 0.0, 'disclosure_protection_score': 1.0}\n" + ] + } + ], + "source": [ + "# privacy\n", + "dp = DisclosureProtection(real_df, synthetic_df)\n", + "dp_score = dp.score()\n", + "dp_report = dp.report()\n", + "\n", + "print(\"\\n=== Disclosure Protection ===\")\n", + "print(f\"Score: {dp_score:.3f}\")\n", + "print(\"Detailed Report:\", dp_report)" + ] } ], "metadata": {