From 144b93eb8a60e79340ab88fe5996408d7bf4224b Mon Sep 17 00:00:00 2001 From: Emmanuel Jordy Menvouta <56538317+emmanueljordy@users.noreply.github.com> Date: Tue, 11 Mar 2025 12:02:05 +0100 Subject: [PATCH] update data processor and efficacy metrics --- synthpop/metrics/efficacy_metrics.py | 14 +++++++------- synthpop/processor/data_processor.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/synthpop/metrics/efficacy_metrics.py b/synthpop/metrics/efficacy_metrics.py index b87f9f4..d8287ae 100644 --- a/synthpop/metrics/efficacy_metrics.py +++ b/synthpop/metrics/efficacy_metrics.py @@ -76,15 +76,15 @@ def evaluate(self, real_df: pd.DataFrame, synthetic_df: pd.DataFrame) -> dict: y_real = real_df[self.target_column] # Handle categorical encoding only if it's a classification task - if self.task == 'classification': - categorical_cols = X_syn.select_dtypes(include=['object', 'category']).columns.tolist() + + categorical_cols = X_syn.select_dtypes(include=['object', 'category']).columns.tolist() - if categorical_cols: - X_syn = pd.get_dummies(X_syn, columns=categorical_cols, drop_first=True) - X_real = pd.get_dummies(X_real, columns=categorical_cols, drop_first=True) + if categorical_cols: + X_syn = pd.get_dummies(X_syn, columns=categorical_cols, drop_first=True) + X_real = pd.get_dummies(X_real, columns=categorical_cols, drop_first=True) - # Align columns in case of different categorical levels between real and synthetic data - X_syn, X_real = X_syn.align(X_real, join='left', axis=1, fill_value=0) + # Align columns in case of different categorical levels between real and synthetic data + X_syn, X_real = X_syn.align(X_real, join='left', axis=1, fill_value=0) # Model Training and Evaluation if self.task == 'regression': diff --git a/synthpop/processor/data_processor.py b/synthpop/processor/data_processor.py index c015b47..ee857ef 100644 --- a/synthpop/processor/data_processor.py +++ b/synthpop/processor/data_processor.py @@ -61,7 +61,7 @@ def _preprocess(self, data: pd.DataFrame) -> pd.DataFrame: data = pd.concat([data, transformed_data], axis=1) elif dtype == "numerical": - scaler = StandardScaler() + scaler = StandardScaler(with_mean= False, with_std= False) data[col] = scaler.fit_transform(data[[col]]) self.scalers[col] = scaler