diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..0249621
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,60 @@
+repos:
+  # 1. Code Formatter: Black (Ensures uniform formatting)
+  - repo: https://github.com/psf/black
+    rev: 24.3.0  # pin a released tag; pre-commit needs an immutable rev and the old 'stable' branch no longer exists
+    hooks:
+      - id: black
+
+  # 2. Code Formatter: isort (Sorts imports)
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+
+  # 3. Linter: Flake8 (Finds style & syntax issues)
+  - repo: https://github.com/pycqa/flake8
+    rev: 6.0.0
+    hooks:
+      - id: flake8
+
+  # 4. Security: Bandit (Finds security vulnerabilities)
+  - repo: https://github.com/PyCQA/bandit
+    rev: 1.7.5  # pin a released tag; 'stable' is not a valid immutable rev
+    hooks:
+      - id: bandit
+        args: ["-r", "."]
+
+  # 5. Security: detect-secrets (Prevents committing secrets)
+  - repo: https://github.com/Yelp/detect-secrets
+    rev: v1.3.0
+    hooks:
+      - id: detect-secrets
+
+  # 6. Type Checker: mypy (Checks for type errors)
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.0.0
+    hooks:
+      - id: mypy
+
+  # 7. Tests: Pytest (Runs test cases before commit)
+  - repo: local
+    hooks:
+      - id: pytest
+        name: Run Pytest
+        entry: pytest
+        language: system
+        types: [python]
+
+  # 8. Dependency Check: pip-audit (Checks for vulnerable dependencies)
+  - repo: https://github.com/pypa/pip-audit
+    rev: v2.4.0
+    hooks:
+      - id: pip-audit
+
+  # 9. File Cleanup: Remove trailing whitespace
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      - id: trailing-whitespace
+
+
diff --git a/.venv/Scripts/Activate.ps1 b/.venv/Scripts/Activate.ps1
new file mode 100644
index 0000000..b63e7b7
--- /dev/null
+++ b/.venv/Scripts/Activate.ps1
@@ -0,0 +1,502 @@
+<#
+.Synopsis
+Activate a Python virtual environment for the current PowerShell session.
+
+.Description
+Pushes the python executable for a virtual environment to the front of the
+$Env:PATH environment variable and sets the prompt to signify that you are
+in a Python virtual environment. Makes use of the command line switches as
+well as the `pyvenv.cfg` file values present in the virtual environment.
+
+.Parameter VenvDir
+Path to the directory that contains the virtual environment to activate. The
+default value for this is the parent of the directory that the Activate.ps1
+script is located within.
+
+.Parameter Prompt
+The prompt prefix to display when this virtual environment is activated. By
+default, this prompt is the name of the virtual environment folder (VenvDir)
+surrounded by parentheses and followed by a single space (ie. '(.venv) ').
+
+.Example
+Activate.ps1
+Activates the Python virtual environment that contains the Activate.ps1 script.
+
+.Example
+Activate.ps1 -Verbose
+Activates the Python virtual environment that contains the Activate.ps1 script,
+and shows extra information about the activation as it executes.
+
+.Example
+Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
+Activates the Python virtual environment located in the specified location.
+
+.Example
+Activate.ps1 -Prompt "MyPython"
+Activates the Python virtual environment that contains the Activate.ps1 script,
+and prefixes the current prompt with the specified string (surrounded in
+parentheses) while the virtual environment is active.
+
+.Notes
+On Windows, it may be required to enable this Activate.ps1 script by setting the
+execution policy for the user.
You can do this by issuing the following PowerShell +command: + +PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser + +For more information on Execution Policies: +https://go.microsoft.com/fwlink/?LinkID=135170 + +#> +Param( + [Parameter(Mandatory = $false)] + [String] + $VenvDir, + [Parameter(Mandatory = $false)] + [String] + $Prompt +) + +<# Function declarations --------------------------------------------------- #> + +<# +.Synopsis +Remove all shell session elements added by the Activate script, including the +addition of the virtual environment's Python executable from the beginning of +the PATH variable. + +.Parameter NonDestructive +If present, do not remove this function from the global namespace for the +session. + +#> +function global:deactivate ([switch]$NonDestructive) { + # Revert to original values + + # The prior prompt: + if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) { + Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt + Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT + } + + # The prior PYTHONHOME: + if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) { + Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME + Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME + } + + # The prior PATH: + if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) { + Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH + Remove-Item -Path Env:_OLD_VIRTUAL_PATH + } + + # Just remove the VIRTUAL_ENV altogether: + if (Test-Path -Path Env:VIRTUAL_ENV) { + Remove-Item -Path env:VIRTUAL_ENV + } + + # Just remove VIRTUAL_ENV_PROMPT altogether. + if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) { + Remove-Item -Path env:VIRTUAL_ENV_PROMPT + } + + # Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether: + if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) { + Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force + } + + # Leave deactivate function in the global namespace if requested: + if (-not $NonDestructive) { + Remove-Item -Path function:deactivate + } +} + +<# +.Description +Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the +given folder, and returns them in a map. + +For each line in the pyvenv.cfg file, if that line can be parsed into exactly +two strings separated by `=` (with any amount of whitespace surrounding the =) +then it is considered a `key = value` line. The left hand string is the key, +the right hand is the value. + +If the value starts with a `'` or a `"` then the first and last character is +stripped from the value before being captured. + +.Parameter ConfigDir +Path to the directory that contains the `pyvenv.cfg` file. +#> +function Get-PyVenvConfig( + [String] + $ConfigDir +) { + Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg" + + # Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue). + $pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue + + # An empty map will be returned if no config file is found. + $pyvenvConfig = @{ } + + if ($pyvenvConfigPath) { + + Write-Verbose "File exists, parse `key = value` lines" + $pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath + + $pyvenvConfigContent | ForEach-Object { + $keyval = $PSItem -split "\s*=\s*", 2 + if ($keyval[0] -and $keyval[1]) { + $val = $keyval[1] + + # Remove extraneous quotations around a string value. 
+ if ("'""".Contains($val.Substring(0, 1))) { + $val = $val.Substring(1, $val.Length - 2) + } + + $pyvenvConfig[$keyval[0]] = $val + Write-Verbose "Adding Key: '$($keyval[0])'='$val'" + } + } + } + return $pyvenvConfig +} + + +<# Begin Activate script --------------------------------------------------- #> + +# Determine the containing directory of this script +$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition +$VenvExecDir = Get-Item -Path $VenvExecPath + +Write-Verbose "Activation script is located in path: '$VenvExecPath'" +Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)" +Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)" + +# Set values required in priority: CmdLine, ConfigFile, Default +# First, get the location of the virtual environment, it might not be +# VenvExecDir if specified on the command line. +if ($VenvDir) { + Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values" +} +else { + Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir." + $VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/") + Write-Verbose "VenvDir=$VenvDir" +} + +# Next, read the `pyvenv.cfg` file to determine any required value such +# as `prompt`. +$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir + +# Next, set the prompt from the command line, or the config file, or +# just use the name of the virtual environment folder. +if ($Prompt) { + Write-Verbose "Prompt specified as argument, using '$Prompt'" +} +else { + Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value" + if ($pyvenvCfg -and $pyvenvCfg['prompt']) { + Write-Verbose " Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'" + $Prompt = $pyvenvCfg['prompt']; + } + else { + Write-Verbose " Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)" + Write-Verbose " Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'" + $Prompt = Split-Path -Path $venvDir -Leaf + } +} + +Write-Verbose "Prompt = '$Prompt'" +Write-Verbose "VenvDir='$VenvDir'" + +# Deactivate any currently active virtual environment, but leave the +# deactivate function in place. +deactivate -nondestructive + +# Now set the environment variable VIRTUAL_ENV, used by many tools to determine +# that there is an activated venv. 
+$env:VIRTUAL_ENV = $VenvDir + +if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) { + + Write-Verbose "Setting prompt to '$Prompt'" + + # Set the prompt to include the env name + # Make sure _OLD_VIRTUAL_PROMPT is global + function global:_OLD_VIRTUAL_PROMPT { "" } + Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT + New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt + + function global:prompt { + Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) " + _OLD_VIRTUAL_PROMPT + } + $env:VIRTUAL_ENV_PROMPT = $Prompt +} + +# Clear PYTHONHOME +if (Test-Path -Path Env:PYTHONHOME) { + Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME + Remove-Item -Path Env:PYTHONHOME +} + +# Add the venv to the PATH +Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH +$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH" + +# SIG # Begin signature block +# MIIvIwYJKoZIhvcNAQcCoIIvFDCCLxACAQExDzANBglghkgBZQMEAgEFADB5Bgor +# BgEEAYI3AgEEoGswaTA0BgorBgEEAYI3AgEeMCYCAwEAAAQQH8w7YFlLCE63JNLG +# KX7zUQIBAAIBAAIBAAIBAAIBADAxMA0GCWCGSAFlAwQCAQUABCBnL745ElCYk8vk +# dBtMuQhLeWJ3ZGfzKW4DHCYzAn+QB6CCE8MwggWQMIIDeKADAgECAhAFmxtXno4h +# MuI5B72nd3VcMA0GCSqGSIb3DQEBDAUAMGIxCzAJBgNVBAYTAlVTMRUwEwYDVQQK +# EwxEaWdpQ2VydCBJbmMxGTAXBgNVBAsTEHd3dy5kaWdpY2VydC5jb20xITAfBgNV +# BAMTGERpZ2lDZXJ0IFRydXN0ZWQgUm9vdCBHNDAeFw0xMzA4MDExMjAwMDBaFw0z +# ODAxMTUxMjAwMDBaMGIxCzAJBgNVBAYTAlVTMRUwEwYDVQQKEwxEaWdpQ2VydCBJ +# bmMxGTAXBgNVBAsTEHd3dy5kaWdpY2VydC5jb20xITAfBgNVBAMTGERpZ2lDZXJ0 +# IFRydXN0ZWQgUm9vdCBHNDCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIB +# AL/mkHNo3rvkXUo8MCIwaTPswqclLskhPfKK2FnC4SmnPVirdprNrnsbhA3EMB/z +# G6Q4FutWxpdtHauyefLKEdLkX9YFPFIPUh/GnhWlfr6fqVcWWVVyr2iTcMKyunWZ +# anMylNEQRBAu34LzB4TmdDttceItDBvuINXJIB1jKS3O7F5OyJP4IWGbNOsFxl7s +# Wxq868nPzaw0QF+xembud8hIqGZXV59UWI4MK7dPpzDZVu7Ke13jrclPXuU15zHL +# 2pNe3I6PgNq2kZhAkHnDeMe2scS1ahg4AxCN2NQ3pC4FfYj1gj4QkXCrVYJBMtfb +# BHMqbpEBfCFM1LyuGwN1XXhm2ToxRJozQL8I11pJpMLmqaBn3aQnvKFPObURWBf3 +# JFxGj2T3wWmIdph2PVldQnaHiZdpekjw4KISG2aadMreSx7nDmOu5tTvkpI6nj3c +# AORFJYm2mkQZK37AlLTSYW3rM9nF30sEAMx9HJXDj/chsrIRt7t/8tWMcCxBYKqx +# YxhElRp2Yn72gLD76GSmM9GJB+G9t+ZDpBi4pncB4Q+UDCEdslQpJYls5Q5SUUd0 +# viastkF13nqsX40/ybzTQRESW+UQUOsxxcpyFiIJ33xMdT9j7CFfxCBRa2+xq4aL +# T8LWRV+dIPyhHsXAj6KxfgommfXkaS+YHS312amyHeUbAgMBAAGjQjBAMA8GA1Ud +# EwEB/wQFMAMBAf8wDgYDVR0PAQH/BAQDAgGGMB0GA1UdDgQWBBTs1+OC0nFdZEzf +# Lmc/57qYrhwPTzANBgkqhkiG9w0BAQwFAAOCAgEAu2HZfalsvhfEkRvDoaIAjeNk +# aA9Wz3eucPn9mkqZucl4XAwMX+TmFClWCzZJXURj4K2clhhmGyMNPXnpbWvWVPjS +# PMFDQK4dUPVS/JA7u5iZaWvHwaeoaKQn3J35J64whbn2Z006Po9ZOSJTROvIXQPK +# 7VB6fWIhCoDIc2bRoAVgX+iltKevqPdtNZx8WorWojiZ83iL9E3SIAveBO6Mm0eB +# cg3AFDLvMFkuruBx8lbkapdvklBtlo1oepqyNhR6BvIkuQkRUNcIsbiJeoQjYUIp +# 5aPNoiBB19GcZNnqJqGLFNdMGbJQQXE9P01wI4YMStyB0swylIQNCAmXHE/A7msg +# dDDS4Dk0EIUhFQEI6FUy3nFJ2SgXUE3mvk3RdazQyvtBuEOlqtPDBURPLDab4vri +# RbgjU2wGb2dVf0a1TD9uKFp5JtKkqGKX0h7i7UqLvBv9R0oN32dmfrJbQdA75PQ7 +# 9ARj6e/CVABRoIoqyc54zNXqhwQYs86vSYiv85KZtrPmYQ/ShQDnUBrkG5WdGaG5 +# nLGbsQAe79APT0JsyQq87kP6OnGlyE0mpTX9iV28hWIdMtKgK1TtmlfB2/oQzxm3 +# i0objwG2J5VT6LaJbVu8aNQj6ItRolb58KaAoNYes7wPD1N1KarqE3fk3oyBIa0H +# EEcRrYc9B9F1vM/zZn4wggawMIIEmKADAgECAhAIrUCyYNKcTJ9ezam9k67ZMA0G +# CSqGSIb3DQEBDAUAMGIxCzAJBgNVBAYTAlVTMRUwEwYDVQQKEwxEaWdpQ2VydCBJ +# bmMxGTAXBgNVBAsTEHd3dy5kaWdpY2VydC5jb20xITAfBgNVBAMTGERpZ2lDZXJ0 +# 
HOodLCA9h+91Iqdc+uSz3Sg9/+Ns4zCp4BonvnsPYTlWTitiB5cpfPe/v4lBvCNu +# x0ha6whvKMdRLZJgXsiDXo2NwwB55kkWEBwD3a1RnBJQmyJxFEGpSXOrhmdcEWPg +# fjoHVIfowKBrIgINdWJbvIu+pLzQRMkVhuJzB32xpiZBIvbzkPETYQMOmKIu40I9 +# 5EAL0xNakPxYiT3nTkncn6woLOhiOXFm7crE+gO4IzDNauYuT9Vfe36K1CqtuYSy +# JesLIey9Z81OQqOo6n2/lW110MKMEV2PkPU7YW/bYO2uKsZ3OAjUWr63nMT+M2wk +# VdUAcqm0QdZsELY75Q3ekRxHje/B9ePP4Q4RMQGOZvmgqdtEeFhsmRwufR4fzfqx +# WMttmOHelTd8Sc0sfA9B+1dxtiC9GFn3de5/o+T2s/jQn6eNp2hvlCqGV0iFzSQp +# InPTBa9Na/+5UeXZ3NBWRvarfZ62TVM= +# SIG # End signature block diff --git a/.venv/Scripts/activate b/.venv/Scripts/activate new file mode 100644 index 0000000..cffeeaa --- /dev/null +++ b/.venv/Scripts/activate @@ -0,0 +1,70 @@ +# This file must be used with "source bin/activate" *from bash* +# You cannot run it directly + +deactivate () { + # reset old environment variables + if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then + PATH="${_OLD_VIRTUAL_PATH:-}" + export PATH + unset _OLD_VIRTUAL_PATH + fi + if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then + PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}" + export PYTHONHOME + unset _OLD_VIRTUAL_PYTHONHOME + fi + + # Call hash to forget past commands. Without forgetting + # past commands the $PATH changes we made may not be respected + hash -r 2> /dev/null + + if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then + PS1="${_OLD_VIRTUAL_PS1:-}" + export PS1 + unset _OLD_VIRTUAL_PS1 + fi + + unset VIRTUAL_ENV + unset VIRTUAL_ENV_PROMPT + if [ ! "${1:-}" = "nondestructive" ] ; then + # Self destruct! + unset -f deactivate + fi +} + +# unset irrelevant variables +deactivate nondestructive + +# on Windows, a path can contain colons and backslashes and has to be converted: +if [ "${OSTYPE:-}" = "cygwin" ] || [ "${OSTYPE:-}" = "msys" ] ; then + # transform D:\path\to\venv to /d/path/to/venv on MSYS + # and to /cygdrive/d/path/to/venv on Cygwin + export VIRTUAL_ENV=$(cygpath "c:\Users\valen\OneDrive\Documents\Jordy Projects\AlgorithmAudit\python_synthpop\.venv") +else + # use the path as-is + export VIRTUAL_ENV="c:\Users\valen\OneDrive\Documents\Jordy Projects\AlgorithmAudit\python_synthpop\.venv" +fi + +_OLD_VIRTUAL_PATH="$PATH" +PATH="$VIRTUAL_ENV/Scripts:$PATH" +export PATH + +# unset PYTHONHOME if set +# this will fail if PYTHONHOME is set to the empty string (which is bad anyway) +# could use `if (set -u; : $PYTHONHOME) ;` in bash +if [ -n "${PYTHONHOME:-}" ] ; then + _OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}" + unset PYTHONHOME +fi + +if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then + _OLD_VIRTUAL_PS1="${PS1:-}" + PS1="(.venv) ${PS1:-}" + export PS1 + VIRTUAL_ENV_PROMPT="(.venv) " + export VIRTUAL_ENV_PROMPT +fi + +# Call hash to forget past commands. Without forgetting +# past commands the $PATH changes we made may not be respected +hash -r 2> /dev/null diff --git a/.venv/Scripts/activate.bat b/.venv/Scripts/activate.bat new file mode 100644 index 0000000..2d6b787 --- /dev/null +++ b/.venv/Scripts/activate.bat @@ -0,0 +1,34 @@ +@echo off + +rem This file is UTF-8 encoded, so we need to update the current code page while executing it +for /f "tokens=2 delims=:." 
%%a in ('"%SystemRoot%\System32\chcp.com"') do ( + set _OLD_CODEPAGE=%%a +) +if defined _OLD_CODEPAGE ( + "%SystemRoot%\System32\chcp.com" 65001 > nul +) + +set VIRTUAL_ENV=c:\Users\valen\OneDrive\Documents\Jordy Projects\AlgorithmAudit\python_synthpop\.venv + +if not defined PROMPT set PROMPT=$P$G + +if defined _OLD_VIRTUAL_PROMPT set PROMPT=%_OLD_VIRTUAL_PROMPT% +if defined _OLD_VIRTUAL_PYTHONHOME set PYTHONHOME=%_OLD_VIRTUAL_PYTHONHOME% + +set _OLD_VIRTUAL_PROMPT=%PROMPT% +set PROMPT=(.venv) %PROMPT% + +if defined PYTHONHOME set _OLD_VIRTUAL_PYTHONHOME=%PYTHONHOME% +set PYTHONHOME= + +if defined _OLD_VIRTUAL_PATH set PATH=%_OLD_VIRTUAL_PATH% +if not defined _OLD_VIRTUAL_PATH set _OLD_VIRTUAL_PATH=%PATH% + +set PATH=%VIRTUAL_ENV%\Scripts;%PATH% +set VIRTUAL_ENV_PROMPT=(.venv) + +:END +if defined _OLD_CODEPAGE ( + "%SystemRoot%\System32\chcp.com" %_OLD_CODEPAGE% > nul + set _OLD_CODEPAGE= +) diff --git a/.venv/Scripts/deactivate.bat b/.venv/Scripts/deactivate.bat new file mode 100644 index 0000000..62a39a7 --- /dev/null +++ b/.venv/Scripts/deactivate.bat @@ -0,0 +1,22 @@ +@echo off + +if defined _OLD_VIRTUAL_PROMPT ( + set "PROMPT=%_OLD_VIRTUAL_PROMPT%" +) +set _OLD_VIRTUAL_PROMPT= + +if defined _OLD_VIRTUAL_PYTHONHOME ( + set "PYTHONHOME=%_OLD_VIRTUAL_PYTHONHOME%" + set _OLD_VIRTUAL_PYTHONHOME= +) + +if defined _OLD_VIRTUAL_PATH ( + set "PATH=%_OLD_VIRTUAL_PATH%" +) + +set _OLD_VIRTUAL_PATH= + +set VIRTUAL_ENV= +set VIRTUAL_ENV_PROMPT= + +:END diff --git a/.venv/Scripts/f2py.exe b/.venv/Scripts/f2py.exe new file mode 100644 index 0000000..dd6da59 Binary files /dev/null and b/.venv/Scripts/f2py.exe differ diff --git a/.venv/Scripts/numpy-config.exe b/.venv/Scripts/numpy-config.exe new file mode 100644 index 0000000..69d7106 Binary files /dev/null and b/.venv/Scripts/numpy-config.exe differ diff --git a/.venv/Scripts/pip.exe b/.venv/Scripts/pip.exe new file mode 100644 index 0000000..d2007fb Binary files /dev/null and b/.venv/Scripts/pip.exe differ diff --git a/.venv/Scripts/pip3.12.exe b/.venv/Scripts/pip3.12.exe new file mode 100644 index 0000000..d2007fb Binary files /dev/null and b/.venv/Scripts/pip3.12.exe differ diff --git a/.venv/Scripts/pip3.exe b/.venv/Scripts/pip3.exe new file mode 100644 index 0000000..d2007fb Binary files /dev/null and b/.venv/Scripts/pip3.exe differ diff --git a/.venv/Scripts/py.test.exe b/.venv/Scripts/py.test.exe new file mode 100644 index 0000000..296a973 Binary files /dev/null and b/.venv/Scripts/py.test.exe differ diff --git a/.venv/Scripts/pytest.exe b/.venv/Scripts/pytest.exe new file mode 100644 index 0000000..296a973 Binary files /dev/null and b/.venv/Scripts/pytest.exe differ diff --git a/.venv/Scripts/python.exe b/.venv/Scripts/python.exe new file mode 100644 index 0000000..53121ae Binary files /dev/null and b/.venv/Scripts/python.exe differ diff --git a/.venv/Scripts/pythonw.exe b/.venv/Scripts/pythonw.exe new file mode 100644 index 0000000..a09f6e9 Binary files /dev/null and b/.venv/Scripts/pythonw.exe differ diff --git a/example_notebooks/01_missing_data_handler_example.ipynb b/example_notebooks/01_missing_data_handler_example.ipynb new file mode 100644 index 0000000..a27c58f --- /dev/null +++ b/example_notebooks/01_missing_data_handler_example.ipynb @@ -0,0 +1,127 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from datetime import timedelta\n", + "from synthpop import MissingDataHandler" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dates = pd.date_range(\"2023-01-01\", periods=50, freq=\"D\")\n", + "bool_values = np.random.choice([True, False], size=50)\n", + "timedeltas = timedeltas = [timedelta(days=int(i)) for i in np.random.randint(1, 100, 50)]\n", + "df_custom = pd.DataFrame({\n", + " \"numeric_col1\": np.random.normal(50, 10, 50),\n", + " \"numeric_col2\": np.random.randint(0, 100, 50),\n", + " \"categorical_col\": np.random.choice([\"Red\", \"Green\", \"Blue\"], size=50),\n", + " \"boolean_col\": bool_values,\n", + " \"datetime_col\": dates,\n", + " \"timedelta_col\": timedeltas,\n", + " \"float_col\": np.random.uniform(0.0, 1.0, 50)\n", + "})\n", + "\n", + "\n", + "df = df_custom.copy()\n", + "df.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(42) # For reproducibility\n", + "\n", + "def introduce_missingness(dataframe, missing_frac=0.1):\n", + " \"\"\"Randomly set a fraction of each column's values to NaN.\"\"\"\n", + " df_with_nans = dataframe.copy()\n", + " rows = len(df_with_nans)\n", + " for col in df_with_nans.columns:\n", + " n_missing = int(rows * missing_frac)\n", + " missing_indices = np.random.choice(df_with_nans.index, n_missing, replace=False)\n", + " df_with_nans.loc[missing_indices, col] = np.nan\n", + " return df_with_nans\n", + "\n", + "df_missing = introduce_missingness(df, missing_frac=0.2) # 20% missingness\n", + "df_missing.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "md_handler = MissingDataHandler()\n", + "\n", + "# Check the data types\n", + "column_dtypes = md_handler.get_column_dtypes(df_missing)\n", + "print(\"Column Data Types:\", column_dtypes)\n", + "\n", + "# Detect missingness\n", + "missingness_dict = md_handler.detect_missingness(df_missing)\n", + "print(\"Detected Missingness Type:\", missingness_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_imputed = md_handler.apply_imputation(df_missing, missingness_dict)\n", + "\n", + "print(\"Before Imputation:\\n\", df_missing.head(10))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\nAfter Imputation:\\n\", df_imputed.head(10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "AAdev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/example_notebooks/02_data_processor_example.ipynb b/example_notebooks/02_data_processor_example.ipynb new file mode 100644 index 0000000..609ab40 --- /dev/null +++ b/example_notebooks/02_data_processor_example.ipynb @@ -0,0 +1,120 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from datetime import timedelta\n", + "from synthpop import DataProcessor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [ + "# Sample data generation using the provided data\n", + "dates = pd.date_range(\"2023-01-01\", periods=50, freq=\"D\")\n", + "bool_values = np.random.choice([True, False], size=50)\n", + "timedeltas = [timedelta(days=int(i)) for i in np.random.randint(1, 100, 50)]\n", + "\n", + "df_custom = pd.DataFrame({\n", + " \"numeric_col1\": np.random.normal(50, 10, 50),\n", + " \"numeric_col2\": np.random.randint(0, 100, 50),\n", + " \"categorical_col\": np.random.choice([\"Red\", \"Green\", \"Blue\"], size=50),\n", + " \"boolean_col\": bool_values,\n", + " \"datetime_col\": dates,\n", + " \"timedelta_col\": timedeltas,\n", + " \"float_col\": np.random.uniform(0.0, 1.0, 50)\n", + "})\n", + "\n", + "df = df_custom.copy()\n", + "print(\"Original Data:\")\n", + "display(df.head())\n", + "\n", + "# Define metadata for each column (update these types as needed)\n", + "metadata = {\n", + " \"numeric_col1\": \"numerical\",\n", + " \"numeric_col2\": \"numerical\",\n", + " \"categorical_col\": \"categorical\",\n", + " \"boolean_col\": \"boolean\",\n", + " \"datetime_col\": \"datetime\",\n", + " \"timedelta_col\": \"timedelta\",\n", + " \"float_col\": \"numerical\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Instantiate the DataProcessor with the metadata\n", + "processor = DataProcessor(metadata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocess the data: transforms raw data into a numerical format\n", + "processed_data = processor.preprocess(df)\n", + "print(\"Processed Data:\")\n", + "display(processed_data.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Simulate synthetic data generation by copying the processed data\n", + "# (Replace this step with your synthetic data generation method if available)\n", + "synthetic_data = processed_data.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Postprocess the synthetic data to revert it back to its original data types\n", + "#the post processing makes sure to have the columns in their original order. 
\n", + "recovered_data = processor.postprocess(synthetic_data)\n", + "print(\"Recovered Data:\")\n", + "display(recovered_data.head())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "AAdev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/example_notebooks/03_gaussian_copula.ipynb b/example_notebooks/03_gaussian_copula.ipynb new file mode 100644 index 0000000..49bcc03 --- /dev/null +++ b/example_notebooks/03_gaussian_copula.ipynb @@ -0,0 +1,123 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from datetime import timedelta\n", + "from synthpop import DataProcessor, GaussianCopulaMethod, MissingDataHandler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sample Data Generation with 100 observations\n", + "dates = pd.date_range(\"2023-01-01\", periods=100, freq=\"D\")\n", + "bool_values = np.random.choice([True, False], size=100)\n", + "timedeltas = [timedelta(days=int(i)) for i in np.random.randint(1, 100, 100)]\n", + "\n", + "df_custom = pd.DataFrame({\n", + " \"numeric_col1\": np.random.normal(50, 10, 100),\n", + " \"numeric_col2\": np.random.randint(0, 100, 100),\n", + " \"categorical_col\": np.random.choice([\"Red\", \"Green\", \"Blue\"], size=100),\n", + " \"boolean_col\": bool_values,\n", + " \"datetime_col\": dates,\n", + " \"timedelta_col\": timedeltas,\n", + " \"float_col\": np.random.uniform(0.0, 1.0, 100)\n", + "})\n", + "\n", + "df = df_custom.copy()\n", + "print(\"Original Data:\")\n", + "display(df.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#get the metadata from df \n", + "metadata = MissingDataHandler.get_column_dtypes(df)\n", + "print(metadata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Instantiate DataProcessor and preprocess the data\n", + "processor = DataProcessor(metadata)\n", + "processed_data = processor.preprocess(df)\n", + "print(\"Processed Data:\")\n", + "display(processed_data.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Instantiate GaussianCopulaMethod with the same metadata\n", + "gaussian_copula = GaussianCopulaMethod(metadata)\n", + "# Fit the Gaussian Copula model on the processed data\n", + "gaussian_copula.fit(processed_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Generate synthetic processed data (e.g., 100 synthetic observations)\n", + "synthetic_processed = gaussian_copula.sample(100)\n", + "print(\"Synthetic Processed Data:\")\n", + "display(synthetic_processed.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Postprocess the synthetic data back to the original format\n", + "synthetic_data = processor.postprocess(synthetic_processed)\n", + "print(\"Synthetic Data in Original Format:\")\n", + "display(synthetic_data.head())" + ] + } + ], + 
"metadata": { + "kernelspec": { + "display_name": "AAdev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/example_notebooks/04_cart_method.ipynb b/example_notebooks/04_cart_method.ipynb new file mode 100644 index 0000000..5896447 --- /dev/null +++ b/example_notebooks/04_cart_method.ipynb @@ -0,0 +1,122 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from datetime import timedelta\n", + "from synthpop import DataProcessor, CARTMethod, MissingDataHandler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sample Data Generation with 100 observations\n", + "dates = pd.date_range(\"2023-01-01\", periods=100, freq=\"D\")\n", + "bool_values = np.random.choice([True, False], size=100)\n", + "timedeltas = [timedelta(days=int(i)) for i in np.random.randint(1, 100, 100)]\n", + "\n", + "df_custom = pd.DataFrame({\n", + " \"numeric_col1\": np.random.normal(50, 10, 100),\n", + " \"numeric_col2\": np.random.randint(0, 100, 100),\n", + " \"categorical_col\": np.random.choice([\"Red\", \"Green\", \"Blue\"], size=100),\n", + " \"boolean_col\": bool_values,\n", + " \"datetime_col\": dates,\n", + " \"timedelta_col\": timedeltas,\n", + " \"float_col\": np.random.uniform(0.0, 1.0, 100)\n", + "})\n", + "\n", + "df = df_custom.copy()\n", + "print(\"Original Data:\")\n", + "display(df.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#get the metadata from df \n", + "metadata = MissingDataHandler.get_column_dtypes(df)\n", + "print(metadata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Instantiate DataProcessor and preprocess the data\n", + "processor = DataProcessor(metadata)\n", + "processed_data = processor.preprocess(df)\n", + "print(\"Processed Data:\")\n", + "display(processed_data.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Instantiate and fit the CART method\n", + "cart = CARTMethod(metadata, smoothing=True, proper=True, minibucket=5, random_state=42)\n", + "cart.fit(processed_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " # For prediction, we might use the same data (or new preprocessed data)\n", + "synthetic_processed = cart.sample(100)\n", + "print(\"Synthetic Processed Data:\")\n", + "display(synthetic_processed.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Postprocess the synthetic data back to the original format\n", + "synthetic_data = processor.postprocess(synthetic_processed)\n", + "print(\"Synthetic Data in Original Format:\")\n", + "display(synthetic_data.head())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "AAdev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/example_notebooks/05_metrics.ipynb b/example_notebooks/05_metrics.ipynb new file mode 100644 index 0000000..923234c --- /dev/null +++ b/example_notebooks/05_metrics.ipynb @@ -0,0 +1,170 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from synthpop.metrics import (\n", + " MetricsReport,\n", + " EfficacyMetrics,\n", + " DisclosureProtection\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a small real DataFrame\n", + "real_df = pd.DataFrame({\n", + " \"numeric_col\": [1, 2, 3, 4, 5, np.nan],\n", + " \"categorical_col\": [\"a\", \"b\", \"a\", \"c\", \"b\", \"b\"],\n", + " \"datetime_col\": pd.date_range(\"2023-01-01\", periods=6),\n", + " \"boolean_col\": [True, False, True, False, True, False]\n", + "})\n", + "\n", + "# Create a corresponding synthetic DataFrame\n", + "#can come from one of the SDG methods \n", + "synthetic_df = pd.DataFrame({\n", + " \"numeric_col\": [1.1, 2.1, 2.9, 3.8, 5.2, np.nan],\n", + " \"categorical_col\": [\"a\", \"b\", \"b\", \"c\", \"d\", \"b\"],\n", + " \"datetime_col\": pd.date_range(\"2023-01-01\", periods=6),\n", + " \"boolean_col\": [True, True, True, False, True, False]\n", + "})\n", + "\n", + "# Optional metadata\n", + "#can be obtained also by the missing data handler \n", + "metadata = {\n", + " \"numeric_col\": \"numerical\",\n", + " \"categorical_col\": \"categorical\",\n", + " \"datetime_col\": \"datetime\",\n", + " \"boolean_col\": \"boolean\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Instantiate and generate a diagnostic report\n", + "report = MetricsReport(real_df, synthetic_df, metadata)\n", + "report_df = report.generate_report()\n", + "print(\"=== Diagnostic Report ===\")\n", + "display(report_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
DEMO: EfficacyMetrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# EXAMPLE A: Regression\n", + "real_reg = pd.DataFrame({\n", + " \"feat1\": np.random.normal(0, 1, 100),\n", + " \"feat2\": np.random.normal(5, 2, 100),\n", + " \"target\": np.random.normal(10, 3, 100)\n", + "})\n", + "synthetic_reg = pd.DataFrame({\n", + " \"feat1\": np.random.normal(0, 1, 100),\n", + " \"feat2\": np.random.normal(5, 2, 100),\n", + " \"target\": np.random.normal(10, 3, 100)\n", + "})\n", + "\n", + "reg_efficacy = EfficacyMetrics(task='regression', target_column=\"target\")\n", + "reg_metrics = reg_efficacy.evaluate(real_reg, synthetic_reg)\n", + "print(\"=== Regression Efficacy Metrics ===\")\n", + "print(reg_metrics)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# EXAMPLE B: Classification\n", + "real_clf = pd.DataFrame({\n", + " \"feat1\": np.random.normal(0, 1, 100),\n", + " \"feat2\": np.random.normal(5, 2, 100),\n", + " \"target\": np.random.choice([\"A\", \"B\"], size=100)\n", + "})\n", + "synthetic_clf = pd.DataFrame({\n", + " \"feat1\": np.random.normal(0, 1, 100),\n", + " \"feat2\": np.random.normal(5, 2, 100),\n", + " \"target\": np.random.choice([\"A\", \"B\"], size=100)\n", + "})\n", + "\n", + "clf_efficacy = EfficacyMetrics(task='classification', target_column=\"target\")\n", + "clf_metrics = clf_efficacy.evaluate(real_clf, synthetic_clf)\n", + "print(\"\\n=== Classification Efficacy Metrics ===\")\n", + "print(clf_metrics)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. DEMO: Privacy metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example numeric real vs. 
synthetic data\n", + "real_privacy = pd.DataFrame({\n", + " \"col1\": np.random.normal(0, 1, 100),\n", + " \"col2\": np.random.normal(5, 2, 100)\n", + "})\n", + "synthetic_privacy = real_privacy + np.random.normal(0, 0.3, real_privacy.shape)\n", + "\n", + "dp = DisclosureProtection(real_privacy, synthetic_privacy)\n", + "dp_score = dp.score()\n", + "dp_report = dp.report()\n", + "\n", + "print(\"\\n=== Disclosure Protection ===\")\n", + "print(f\"Score: {dp_score:.3f}\")\n", + "print(\"Detailed Report:\", dp_report)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "AAdev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pyproject.toml b/pyproject.toml index 8ab67d4..446d46d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,8 @@ dependencies = [ "numpy>=1.20.0", "pandas>=1.3.0", "scikit-learn>=1.0.0", + "copulas>=0.1.0", + ] readme = "README.md" license = {file = "LICENSE"} diff --git a/requirements.txt b/requirements.txt index 3c8fd34..8fc59a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ numpy>=1.20.0 pandas>=1.3.0 scikit-learn>=1.0.0 -pytest>=7.0.0 \ No newline at end of file +pytest>=7.0.0 +copulas>=0.1.0 diff --git a/setup.py b/setup.py index 76a7d88..9b3ab5d 100644 --- a/setup.py +++ b/setup.py @@ -28,6 +28,7 @@ "numpy>=1.20.0", "pandas>=1.3.0", "scikit-learn>=1.0.0", + "copulas>=0.1.0", ], extras_require={ "dev": [ diff --git a/synthpop/__init__.py b/synthpop/__init__.py index 8e76743..69edef5 100644 --- a/synthpop/__init__.py +++ b/synthpop/__init__.py @@ -1,5 +1,25 @@ -NUM_COLS_DTYPES = ['int', 'float', 'datetime'] -CAT_COLS_DTYPES = ['category', 'bool'] +from .method import CARTMethod, GaussianCopulaMethod, proper, smooth +from .processor import DataProcessor, MissingDataHandler +from .validator import Validator +from .constants import NUM_COLS_DTYPES, CAT_COLS_DTYPES +from .metrics import MetricsReport, EfficacyMetrics, DisclosureProtection +# from .metrics import , compute_TSComplement # if needed + + +__all__ = [ + "CARTMethod", + "GaussianCopulaMethod", + "proper", + "smooth", + "DataProcessor", + "MissingDataHandler", + "Validator", + "MetricsReport", + "EfficacyMetrics", + "DisclosureProtection", + "NUM_COLS_DTYPES", + "CAT_COLS_DTYPES", + # "compute_TSComplement", +] + -from synthpop.synthpop import Synthpop -print('yes') \ No newline at end of file diff --git a/synthpop/constants.py b/synthpop/constants.py new file mode 100644 index 0000000..bfcb761 --- /dev/null +++ b/synthpop/constants.py @@ -0,0 +1,2 @@ +NUM_COLS_DTYPES = ['int', 'float', 'datetime'] +CAT_COLS_DTYPES = ['category', 'bool'] \ No newline at end of file diff --git a/synthpop/method/GC.py b/synthpop/method/GC.py new file mode 100644 index 0000000..5f902a4 --- /dev/null +++ b/synthpop/method/GC.py @@ -0,0 +1,368 @@ +import inspect +import logging +import warnings +from copy import deepcopy +from typing import Any, Dict, List, Optional, Union + +import numpy as np +import pandas as pd +import scipy +import copulas.univariate +from copulas import multivariate +from sklearn.preprocessing import OneHotEncoder +from synthpop.method.helpers import ( + validate_numerical_distributions, + warn_missing_numerical_distributions, + flatten_dict, + 
    unflatten_dict,
+)
+
+LOGGER = logging.getLogger(__name__)
+
+class BaseSingleTableSynthesizer:
+    """
+    Base class for single table synthesizers.
+
+    Args:
+        metadata (dict): Dictionary mapping column names to their types.
+        enforce_min_max_values (bool): Whether to clip reversed numerical values to the observed min/max. Defaults to True.
+        enforce_rounding (bool): Whether to round numerical columns during reverse transformation. Defaults to True.
+        locales (Union[List[str], str]): Default locale(s) to use. Defaults to "en_US".
+    """
+    def __init__(
+        self,
+        metadata: Dict[str, str],
+        enforce_min_max_values: bool = True,
+        enforce_rounding: bool = True,
+        locales: Union[List[str], str] = "en_US",
+    ) -> None:
+        self.metadata = metadata
+        self.enforce_min_max_values = enforce_min_max_values
+        self.enforce_rounding = enforce_rounding
+        if isinstance(locales, str):
+            self.locales = [locales]
+        else:
+            self.locales = locales
+
+
+class GaussianCopulaMethod(BaseSingleTableSynthesizer):
+    # Mapping of distribution name (lowercase) to copulas univariate classes.
+    _DISTRIBUTIONS: Dict[str, Any] = {
+        "norm": copulas.univariate.GaussianUnivariate,
+        "beta": copulas.univariate.BetaUnivariate,
+        "truncnorm": copulas.univariate.TruncatedGaussian,
+        "gamma": copulas.univariate.GammaUnivariate,
+        "uniform": copulas.univariate.UniformUnivariate,
+        "gaussian_kde": copulas.univariate.GaussianKDE,
+    }
+    # Maximum iterations for correlation matrix adjustment
+    _MAX_CORR_ITERATIONS: int = 10
+
+    @classmethod
+    def get_distribution_class(cls, distribution: str) -> Any:
+        """
+        Return the corresponding distribution class from copulas.univariate.
+
+        Args:
+            distribution (str): A string representing a copulas univariate distribution.
+
+        Returns:
+            The corresponding copulas univariate class.
+        """
+        if not isinstance(distribution, str):
+            raise ValueError(f"Distribution specification must be a string, got {type(distribution)}")
+        # Allow case-insensitive matching.
+        distribution_key = distribution.lower()
+        if distribution_key not in cls._DISTRIBUTIONS:
+            error_message = (
+                f"Invalid distribution specification '{distribution}'. "
+                f"Valid options: {list(cls._DISTRIBUTIONS.keys())}"
+            )
+            raise ValueError(error_message)
+        return cls._DISTRIBUTIONS[distribution_key]
+
+    def __init__(
+        self,
+        metadata: Dict[str, str],
+        enforce_min_max_values: bool = True,
+        enforce_rounding: bool = True,
+        locales: Union[List[str], str] = "en_US",
+        numerical_distributions: Optional[Dict[str, str]] = None,
+        default_distribution: Optional[str] = None,
+    ) -> None:
+        super().__init__(metadata, enforce_min_max_values, enforce_rounding, locales)
+        # Validate numerical distributions using metadata keys.
+        validate_numerical_distributions(numerical_distributions, list(self.metadata.keys()))
+        self.default_distribution: str = default_distribution or "beta"
+        self._default_distribution = self.get_distribution_class(self.default_distribution)
+        self._set_numerical_distributions(numerical_distributions)
+        self._num_rows: Optional[int] = None
+        self._model: Optional[Any] = None
+        self._fitted: bool = False
+
+    def _set_numerical_distributions(self, numerical_distributions: Optional[Dict[str, str]]) -> None:
+        """
+        Sets the numerical distributions to be used during model initialization.
+        """
+        self.numerical_distributions = numerical_distributions or {}
+        self._numerical_distributions = {
+            field: self.get_distribution_class(distribution)
+            for field, distribution in self.numerical_distributions.items()
+        }
+
+    def _learn_num_rows(self, processed_data: pd.DataFrame) -> int:
+        """
+        Learn the number of rows from the processed data.
+        """
+        return len(processed_data)
+
+    def _get_numerical_distributions(self, processed_data: pd.DataFrame) -> Dict[str, Any]:
+        """
+        Get a complete dictionary of numerical distributions for all columns in the data.
+        """
+        numerical_distributions = deepcopy(self._numerical_distributions)
+        for column in processed_data.columns:
+            if column not in numerical_distributions:
+                numerical_distributions[column] = self._default_distribution
+        return numerical_distributions
+
+    def _initialize_model(self, numerical_distributions: Dict[str, Any]) -> Any:
+        """
+        Initialize the GaussianMultivariate model with the given numerical distributions.
+        """
+        return multivariate.GaussianMultivariate(distribution=numerical_distributions)
+
+    def _fit_model(self, processed_data: pd.DataFrame) -> None:
+        """
+        Fit the GaussianMultivariate model on the processed data.
+        """
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", module="scipy")
+            self._model.fit(processed_data)
+
+
+    def fit(self, processed_data: pd.DataFrame) -> None:
+        """
+        Public API method to fit the Gaussian Copula model on processed data.
+
+        Args:
+            processed_data (pd.DataFrame): Data that has been preprocessed.
+        """
+        warn_missing_numerical_distributions(self.numerical_distributions, list(processed_data.columns))
+        self._num_rows = self._learn_num_rows(processed_data)
+        numerical_distributions = self._get_numerical_distributions(processed_data)
+        self._model = self._initialize_model(numerical_distributions)
+        self._fit_model(processed_data)
+        self._fitted = True
+
+    def sample(self, num_rows: int, conditions: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
+        """
+        Public API method to sample synthetic data from the fitted model.
+
+        Args:
+            num_rows (int): Number of rows to sample.
+            conditions (Optional[Dict[str, Any]]): Optional conditions for sampling.
+
+        Returns:
+            pd.DataFrame: A DataFrame containing the synthetic samples.
+        """
+        if not self._fitted or self._model is None:
+            raise ValueError("Model is not fitted yet. Please call fit() before sampling.")
+        return self._model.sample(num_rows, conditions=conditions)
+
+    def get_learned_distributions(self) -> Dict[str, Any]:
+        """
+        Get the marginal distributions used by the Gaussian Copula.
+
+        Returns:
+            Dict[str, Any]: A dictionary mapping column names to the distribution name and learned parameters.
+
+        Raises:
+            ValueError: If the model has not been fitted.
+        """
+        if not self._fitted or self._model is None:
+            raise ValueError("Distributions have not been learned yet. Please fit your model first using 'fit()'.")
+        if not hasattr(self._model, "to_dict") or not self._model.to_dict():
+            return {}
+        parameters = self._model.to_dict()
+        columns = parameters.get("columns", [])
+        univariates = deepcopy(parameters.get("univariates", []))
+        learned_distributions: Dict[str, Any] = {}
+        valid_columns = self._get_valid_columns_from_metadata(columns)
+        for column, learned_params in zip(columns, univariates):
+            if column in valid_columns:
+                distribution = self.numerical_distributions.get(column, self.default_distribution)
+                learned_params.pop("type", None)
+                learned_distributions[column] = {
+                    "distribution": distribution,
+                    "learned_parameters": learned_params,
+                }
+        return learned_distributions
+
+    def _get_valid_columns_from_metadata(self, columns: List[str]) -> List[str]:
+        """
+        Extract valid columns based on the metadata.
+
+        Args:
+            columns (List[str]): List of column names.
+
+        Returns:
+            List[str]: Valid column names found in metadata.
+        """
+        valid_columns: List[str] = []
+        for column in columns:
+            for valid_column in self.metadata.keys():
+                if column.startswith(valid_column):
+                    valid_columns.append(column)
+                    break
+        return valid_columns
+
+    def _get_parameters(self) -> Dict[str, Any]:
+        """
+        Get the parameters of the copula model.
+
+        Returns:
+            Dict[str, Any]: A flattened dictionary containing copula parameters.
+        """
+        # Ensure univariates are in their base instance form if applicable.
+        for univariate in self._model.univariates:
+            if isinstance(univariate, copulas.univariate.Univariate):
+                univariate = univariate._instance
+        params = self._model.to_dict()
+        correlation = []
+        for index, row in enumerate(params.get("correlation", [])[1:]):
+            correlation.append(row[: index + 1])
+        params["correlation"] = correlation
+        params["univariates"] = dict(zip(params.get("columns", []), params.get("univariates", [])))
+        params["num_rows"] = self._num_rows
+        return flatten_dict(params)
+
+    @classmethod
+    def _get_nearest_correlation_matrix(cls, matrix: np.ndarray) -> np.ndarray:
+        """
+        Find the nearest Positive Semi-definite (PSD) correlation matrix.
+        Iteratively adjust negative eigenvalues up to a maximum number of iterations.
+
+        Args:
+            matrix (np.ndarray): Input correlation matrix.
+
+        Returns:
+            np.ndarray: Adjusted correlation matrix that is PSD and has ones on the diagonal.
+        """
+        eigenvalues, eigenvectors = scipy.linalg.eigh(matrix)
+        iterations = 0
+        identity = np.identity(len(matrix))
+        while np.any(eigenvalues < 0) and iterations < cls._MAX_CORR_ITERATIONS:
+            # Set negative eigenvalues to zero.
+            eigenvalues[eigenvalues < 0] = 0
+            matrix = eigenvectors @ np.diag(eigenvalues) @ eigenvectors.T
+            # Force ones on the diagonal.
+            matrix = matrix - np.diag(np.diag(matrix)) + np.identity(len(matrix))
+            max_value = np.abs(matrix).max()
+            if max_value > 1:
+                matrix /= max_value
+            eigenvalues, eigenvectors = scipy.linalg.eigh(matrix)
+            iterations += 1
+        if iterations >= cls._MAX_CORR_ITERATIONS and np.any(eigenvalues < 0):
+            LOGGER.warning("Correlation matrix did not converge to PSD within maximum iterations.")
+        return matrix
+
+    def _set_parameters(self, parameters: Dict[str, Any], default_params: Optional[Dict[str, Any]] = None) -> None:
+        """
+        Set copula model parameters based on a flattened parameter dictionary.
+
+        Args:
+            parameters (Dict[str, Any]): Flattened dictionary of model parameters.
+            default_params (Optional[Dict[str, Any]]): Default parameters to fall back on if provided.
+ """ + if default_params is not None: + default_params = unflatten_dict(default_params) + else: + default_params = {} + parameters = unflatten_dict(parameters) + if "num_rows" in parameters: + num_rows = parameters.pop("num_rows") + self._num_rows = 0 if pd.isna(num_rows) else max(0, int(round(num_rows))) + if parameters: + parameters = self._rebuild_gaussian_copula(parameters, default_params) + self._model = multivariate.GaussianMultivariate.from_dict(parameters) + self._fitted = True + + def _rebuild_gaussian_copula(self, model_parameters: Dict[str, Any], default_params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """ + Rebuild the model parameters to recreate a Gaussian Multivariate instance. + + Args: + model_parameters (Dict[str, Any]): Restructured model parameters. + default_params (Optional[Dict[str, Any]]): Fallback parameters if sampled parameters are invalid. + + Returns: + Dict[str, Any]: Model parameters ready for GaussianMultivariate instantiation. + """ + if default_params is None: + default_params = {} + columns: List[str] = [] + univariates: List[Dict[str, Any]] = [] + for column, univariate in model_parameters.get("univariates", {}).items(): + columns.append(column) + if column in self._numerical_distributions: + univariate_type = self._numerical_distributions[column] + else: + univariate_type = self.get_distribution_class(self.default_distribution) + univariate["type"] = univariate_type + model = getattr(univariate_type, "MODEL_CLASS", None) + if model and hasattr(model, "_argcheck"): + try: + # Extract the parameters required for _argcheck. + arg_names = list(inspect.signature(model._argcheck).parameters.keys()) + to_check = {parameter: univariate[parameter] for parameter in arg_names if parameter in univariate} + if not model._argcheck(**to_check): + if "univariates" in default_params and column in default_params["univariates"]: + LOGGER.info( + f"Invalid parameters for column '{column}', falling back to default parameters." + ) + univariate = default_params["univariates"][column] + univariate["type"] = univariate_type + else: + LOGGER.debug(f"Column '{column}' has invalid parameters.") + except Exception as e: + LOGGER.error(f"Error during parameter check for column '{column}': {e}") + else: + LOGGER.debug(f"Univariate for column '{column}' does not have an _argcheck method.") + if "scale" in univariate: + univariate["scale"] = max(0, univariate["scale"]) + univariates.append(univariate) + model_parameters["univariates"] = univariates + model_parameters["columns"] = columns + correlation = model_parameters.get("correlation") + if correlation: + model_parameters["correlation"] = self._rebuild_correlation_matrix(correlation) + else: + model_parameters["correlation"] = [[1.0]] + return model_parameters + + @classmethod + def _rebuild_correlation_matrix(cls, triangular_correlation: List[List[float]]) -> List[List[float]]: + """ + Rebuild a valid correlation matrix from its lower triangular part. + + Args: + triangular_correlation (List[List[float]]): Lower triangular values (excluding the diagonal). + + Returns: + List[List[float]]: Reconstructed and adjusted full correlation matrix. 
+ """ + size = len(triangular_correlation) + 1 + left = np.zeros((size, size)) + right = np.zeros((size, size)) + for idx, values in enumerate(triangular_correlation): + extended_values = values + [0.0] * (size - idx - 1) + left[idx + 1, :] = extended_values + right[:, idx + 1] = extended_values + correlation = left + right + max_value = np.abs(correlation).max() + if max_value > 1: + correlation /= max_value + correlation += np.identity(size) + adjusted_corr = cls._get_nearest_correlation_matrix(correlation) + return adjusted_corr.tolist() diff --git a/synthpop/method/__init__.py b/synthpop/method/__init__.py index e3db882..c338d6e 100644 --- a/synthpop/method/__init__.py +++ b/synthpop/method/__init__.py @@ -1,83 +1,10 @@ -from synthpop.method.base import Method -from synthpop.method.helpers import proper, smooth -from synthpop.method.empty import EmptyMethod -from synthpop.method.sample import SampleMethod -from synthpop.method.cart import CARTMethod -from synthpop.method.norm import NormMethod -from synthpop.method.normrank import NormRankMethod -from synthpop.method.polyreg import PolyregMethod -from synthpop.method.gaussian_copula import GaussianCopulaMethod - -EMPTY_METHOD = '' -SAMPLE_METHOD = 'sample' -# non-parametric methods -CART_METHOD = 'cart' -# parametric methods -PARAMETRIC_METHOD = 'parametric' -NORM_METHOD = 'norm' -NORMRANK_METHOD = 'normrank' -POLYREG_METHOD = 'polyreg' -GC_METHOD = 'gaussian copula' - - -METHODS_MAP = {EMPTY_METHOD: EmptyMethod, - SAMPLE_METHOD: SampleMethod, - CART_METHOD: CARTMethod, - NORM_METHOD: NormMethod, - NORMRANK_METHOD: NormRankMethod, - POLYREG_METHOD: PolyregMethod, - GC_METHOD: GaussianCopulaMethod - } - - -ALL_METHODS = (EMPTY_METHOD, SAMPLE_METHOD, CART_METHOD, PARAMETRIC_METHOD, NORM_METHOD, NORMRANK_METHOD, POLYREG_METHOD, GC_METHOD) -DEFAULT_METHODS = (CART_METHOD, PARAMETRIC_METHOD, GC_METHOD) -INIT_METHODS = (SAMPLE_METHOD, CART_METHOD, PARAMETRIC_METHOD, GC_METHOD) -NA_METHODS = (CART_METHOD, NORM_METHOD, NORMRANK_METHOD, POLYREG_METHOD, GC_METHOD) - - -# method maps -PARAMETRIC_METHOD_MAP = {'int': NORMRANK_METHOD, - 'float': NORMRANK_METHOD, - 'datetime': NORMRANK_METHOD, - 'bool': POLYREG_METHOD, - 'category': POLYREG_METHOD - } - -CART_METHOD_MAP = {'int': CART_METHOD, - 'float': CART_METHOD, - 'datetime': CART_METHOD, - 'bool': CART_METHOD, - 'category': CART_METHOD - } - -GC_METHOD_MAP = {'int': GC_METHOD, - 'float': GC_METHOD, - 'datetime': GC_METHOD, - 'bool': GC_METHOD, - 'category': GC_METHOD - } - -SAMPLE_METHOD_MAP = {'int': SAMPLE_METHOD, - 'float': SAMPLE_METHOD, - 'datetime': SAMPLE_METHOD, - 'bool': SAMPLE_METHOD, - 'category': SAMPLE_METHOD - } - -DEFAULT_METHODS_MAP = {CART_METHOD: CART_METHOD_MAP, - PARAMETRIC_METHOD: PARAMETRIC_METHOD_MAP, - GC_METHOD: GC_METHOD_MAP - } - - -INIT_METHODS_MAP = DEFAULT_METHODS_MAP.copy() -INIT_METHODS_MAP[SAMPLE_METHOD] = SAMPLE_METHOD_MAP - - -CONT_TO_CAT_METHODS_MAP = {CART_METHOD: CART_METHOD, - NORM_METHOD: POLYREG_METHOD, - NORMRANK_METHOD: POLYREG_METHOD, - POLYREG_METHOD: POLYREG_METHOD, - GC_METHOD: GC_METHOD - } +from .cart import CARTMethod +from .GC import GaussianCopulaMethod # or from .gaussian_copula import GaussianCopulaMethod +from .helpers import proper, smooth + +__all__ = [ + "CARTMethod", + "GaussianCopulaMethod", + "proper", + "smooth", +] \ No newline at end of file diff --git a/synthpop/method/cart.py b/synthpop/method/cart.py index c5f580a..eaf227d 100644 --- a/synthpop/method/cart.py +++ b/synthpop/method/cart.py @@ -1,56 +1,151 @@ import numpy as np 
import pandas as pd from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +import logging +from synthpop.method.helpers import proper, smooth +from synthpop.constants import NUM_COLS_DTYPES, CAT_COLS_DTYPES -from synthpop.method import Method, proper, smooth -# global variables -from synthpop import NUM_COLS_DTYPES, CAT_COLS_DTYPES +# Set up logging +LOGGER = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) -class CARTMethod(Method): - def __init__(self, dtype, smoothing=False, proper=False, minibucket=5, random_state=None, *args, **kwargs): - self.dtype = dtype +class CARTMethod: + """ + + + Attributes: + metadata (dict): Mapping of column names to abstract data types + (e.g., "numerical", "categorical", "boolean", "datetime", "timedelta"). + smoothing (bool): Whether to apply smoothing to numerical predictions. + proper (bool): Whether to apply a resampling (proper) step during fitting. + minibucket (int): Minimum samples per leaf in the decision tree. + random_state (int or None): Random seed. + tree_params (dict): Additional parameters to pass to the decision tree constructors. + """ + def __init__(self, metadata, smoothing=False, proper=False, minibucket=5, random_state=None, tree_params=None): + self.metadata = metadata self.smoothing = smoothing self.proper = proper self.minibucket = minibucket self.random_state = random_state + self.tree_params = tree_params or {} + self.models = {} # Dict: column -> fitted decision tree model + self.leaf_values = {} # Dict: column -> dict mapping leaf id -> array of training y values + self.y_bounds = {} # Dict: column -> (y_real_min, y_real_max) for numerical columns + self.fitted = False + self._train_data = None # Copy of preprocessed training data - if self.dtype in CAT_COLS_DTYPES: - self.cart = DecisionTreeClassifier(min_samples_leaf=self.minibucket, random_state=self.random_state) - if self.dtype in NUM_COLS_DTYPES: - self.cart = DecisionTreeRegressor(min_samples_leaf=self.minibucket, random_state=self.random_state) - - def fit(self, X_df, y_df): - if self.proper: - X_df, y_df = proper(X_df=X_df, y_df=y_df, random_state=self.random_state) - - X_df, y_df = self.prepare_dfs(X_df=X_df, y_df=y_df, normalise_num_cols=False, one_hot_cat_cols=True) - if self.dtype in NUM_COLS_DTYPES: - self.y_real_min, self.y_real_max = np.min(y_df), np.max(y_df) - - X = X_df.to_numpy() - y = y_df.to_numpy() - self.cart.fit(X, y) - - # save the y distribution wrt trained tree nodes - leaves = self.cart.apply(X) - leaves_y_df = pd.DataFrame({'leaves': leaves, 'y': y}) - self.leaves_y_dict = leaves_y_df.groupby('leaves').apply(lambda x: x.to_numpy()[:, -1]).to_dict() - - def predict(self, X_test_df): - X_test_df, _ = self.prepare_dfs(X_df=X_test_df, normalise_num_cols=False, one_hot_cat_cols=True, fit=False) - - # predict the leaves and for each leaf randomly sample from the observed values - X_test = X_test_df.to_numpy() - leaves_pred = self.cart.apply(X_test) - y_pred = np.zeros(len(leaves_pred), dtype=object) + def fit(self, data: pd.DataFrame) -> None: + """ + Fit a CART model for each column using the remaining columns as predictors. + For numerical (and related) columns, stores the min and max of y for smoothing. + Uses the 'proper' function to optionally resample the data. + + Args: + data (pd.DataFrame): Preprocessed data. 
+ """ + self._train_data = data.copy() + for col in data.columns: + # Prepare predictors (X) and target (y) + X = data.drop(columns=[col]) + y = data[col] + if self.proper: + X, y = proper(X_df=X, y_df=y, random_state=self.random_state) + dtype = self.metadata.get(col, "numerical") + # Choose the appropriate decision tree + if dtype in ["numerical", "datetime", "timedelta"]: + model = DecisionTreeRegressor(min_samples_leaf=self.minibucket, random_state=self.random_state, **self.tree_params) + # Store bounds for smoothing + self.y_bounds[col] = (np.min(y.to_numpy()), np.max(y.to_numpy())) + elif dtype in ["categorical", "boolean"]: + model = DecisionTreeClassifier(min_samples_leaf=self.minibucket, random_state=self.random_state, **self.tree_params) + else: + warnings.warn(f"Unknown data type for column '{col}', defaulting to regressor.") + model = DecisionTreeRegressor(min_samples_leaf=self.minibucket, random_state=self.random_state, **self.tree_params) + try: + X_np = X.to_numpy() + y_np = y.to_numpy() + model.fit(X_np, y_np) + self.models[col] = model + # Compute leaf indices for training data and group target values by leaf. + leaves = model.apply(X_np) + df_leaves = pd.DataFrame({'leaf': leaves, 'y': y_np}) + leaf_dict = df_leaves.groupby('leaf')['y'].apply(lambda arr: arr.values).to_dict() + self.leaf_values[col] = leaf_dict + except Exception as e: + LOGGER.error(f"Error fitting model for column '{col}': {e}") + self.fitted = True - leaves_pred_index_df = pd.DataFrame({'leaves_pred': leaves_pred, 'index': range(len(leaves_pred))}) - leaves_pred_index_dict = leaves_pred_index_df.groupby('leaves_pred').apply(lambda x: x.to_numpy()[:, -1]).to_dict() - for leaf, indices in leaves_pred_index_dict.items(): - y_pred[indices] = np.random.choice(self.leaves_y_dict[leaf], size=len(indices), replace=True) + def predict(self, X_test: pd.DataFrame) -> pd.DataFrame: + """ + Generate synthetic predictions using leaf-based sampling. + For each column, the method predicts the leaf for each test row and then samples + randomly from the training values associated with that leaf. + Optionally applies smoothing to numerical columns. + + Args: + X_test (pd.DataFrame): Preprocessed predictors (should contain same columns as training data). + + Returns: + pd.DataFrame: A DataFrame with synthetic predictions for each column. + """ + if not self.fitted: + raise ValueError("The model must be fitted before prediction.") + + predictions = {} + for col, model in self.models.items(): + dtype = self.metadata.get(col, "numerical") + # Prepare predictors for this column (drop the target if present) + X = X_test.drop(columns=[col], errors='ignore') + X_np = X.to_numpy() + # Get leaf indices for test data + leaves_pred = model.apply(X_np) + y_pred = np.empty(len(leaves_pred), dtype=object) + # Group indices by leaf + leaf_indices = pd.DataFrame({'leaf': leaves_pred, 'index': range(len(leaves_pred))}) \ + .groupby('leaf')['index'].apply(list).to_dict() + for leaf, indices in leaf_indices.items(): + if leaf in self.leaf_values[col]: + samples = np.random.choice(self.leaf_values[col][leaf], size=len(indices), replace=True) + else: + # Fallback: if unseen leaf, use direct prediction. 
+ samples = model.predict(X_np[indices]) + for i, idx in enumerate(indices): + y_pred[idx] = samples[i] + y_pred = np.array(y_pred) + # Apply smoothing if enabled and if numeric/datetime/timedelta + if self.smoothing and dtype in ["numerical", "datetime", "timedelta"]: + y_real_min, y_real_max = self.y_bounds[col] + y_pred = smooth(dtype, y_pred, y_real_min, y_real_max) + predictions[col] = y_pred + return pd.DataFrame(predictions) + + def sample(self, num_rows: int) -> pd.DataFrame: + """ + Generate synthetic data with a specified number of rows. + + The predictor sampling uses the maximum of the requested number of rows + and the size of the original training data (to ensure the trees see as much data + as possible). However, the returned DataFrame has the user-specified number of rows. + + Args: + num_rows (int): The number of synthetic samples to generate. + + Returns: + pd.DataFrame: A DataFrame containing synthetic data with num_rows rows. + """ + if not self.fitted: + raise ValueError("The model must be fitted before generating synthetic data.") + + # Use the maximum between num_rows and the original data size for predictor sampling + sample_size = max(num_rows, len(self._train_data)) + synthetic_input = self._train_data.sample(n=sample_size, replace=True, random_state=self.random_state) + + # Generate synthetic data using the predict method + synthetic_full = self.predict(synthetic_input) + + # Return only the first num_rows synthetic observations + return synthetic_full.iloc[:num_rows].reset_index(drop=True) - if self.smoothing and self.dtype in NUM_COLS_DTYPES: - y_pred = smooth(self.dtype, y_pred, self.y_real_min, self.y_real_max) - return y_pred diff --git a/synthpop/method/gaussian_copula.py b/synthpop/method/gaussian_copula.py deleted file mode 100644 index f1e60cd..0000000 --- a/synthpop/method/gaussian_copula.py +++ /dev/null @@ -1,106 +0,0 @@ -import numpy as np -import pandas as pd -from scipy.stats import norm, ks_2samp -# from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor - -from synthpop.method import Method, proper, smooth -# global variables -# from synthpop import NUM_COLS_DTYPES, CAT_COLS_DTYPES - - -class GaussianCopulaMethod(Method): - def __init__(self, dtype, smoothing=False, proper=False, minibucket=5, random_state=None, *args, **kwargs): - self.dtype = dtype - self.proper = proper - self.random_state = random_state - self.smoothing = smoothing - self.minibucket = minibucket - - # learnt parameters - self.means = None - self.cov_matrix = None - self.scaler = None - self.data_marginals = None - - def fit(self, data): - """ - Fit the Gaussian Copula model to the given data. - """ - # Step 1: Store data marginals (quantiles for each feature) - self.data_marginals = [] - for col in data.columns: - sorted_data = np.sort(data[col]) - quantiles = np.linspace(0, 1, len(sorted_data)) - self.data_marginals.append((sorted_data, quantiles, col)) - - # Step 2: Convert data to normal distribution using CDF (Gaussianization) - uniform_data = data.rank(pct=True) # Get percentile rank for each column (empirical CDF) - gaussian_data = norm.ppf(uniform_data) # Convert uniform to standard normal - - # Step 3: Fit a multivariate Gaussian to the normalized data - self.means = gaussian_data.mean(axis=0) - self.cov_matrix = np.cov(gaussian_data, rowvar=False) - - def predict(self, n_samples): - """ - Generate synthetic data using the fitted Gaussian Copula model. 
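The rewritten CARTMethod above follows the same fit/sample pattern, fitting one decision tree per column and resampling observed leaf values at generation time. A minimal sketch under illustrative assumptions (fully numeric columns; note that the `warnings.warn` fallback in `fit` presumes an `import warnings` that this hunk does not add, though the path is not exercised here):

import numpy as np
import pandas as pd
from synthpop.method.cart import CARTMethod

data = pd.DataFrame({
    "x1": np.random.normal(0, 1, 300),
    "x2": np.random.normal(5, 2, 300),
})
metadata = {"x1": "numerical", "x2": "numerical"}

cart = CARTMethod(metadata, smoothing=False, minibucket=5, random_state=0)
cart.fit(data)
synthetic = cart.sample(num_rows=200)
print(synthetic.describe())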
- """ - # Step 1: Sample from the multivariate normal distribution - synthetic_gaussian = np.random.multivariate_normal(self.means, self.cov_matrix, n_samples) - - # Step 2: Convert back to uniform distribution using CDF (normal -> uniform) - synthetic_uniform = norm.cdf(synthetic_gaussian) - - # Step 3: Map uniform data back to the original marginals - synthetic_data = pd.DataFrame() - for i, (sorted_data, quantiles, col) in enumerate(self.data_marginals): - synthetic_data[col] = np.interp(synthetic_uniform[:, i], quantiles, sorted_data) - - return synthetic_data - -def evaluate_distribution(real_data, synthetic_data): - """ - Compare the distribution of each column in the real and synthetic data using - the Kolmogorov-Smirnov (KS) test. - """ - results = {} - for column in real_data.columns: - real_col = real_data[column].dropna() - synthetic_col = synthetic_data[column].dropna() - - # Perform the KS test - ks_stat, p_value = ks_2samp(real_col, synthetic_col) - - # Store the result - results[column] = {'ks_stat': ks_stat, 'p_value': p_value} - return results - -def evaluate_correlations(real_data, synthetic_data): - """ - Compare the pairwise correlation matrices of the real and synthetic data. - """ - real_corr = real_data.corr() - synthetic_corr = synthetic_data.corr() - - # Compute the difference between the correlation matrices - corr_diff = np.abs(real_corr - synthetic_corr) - return corr_diff.mean().mean() # Average correlation difference - -def run_diagnostic(real_data, synthetic_data, target_column): - """ - Run diagnostics on synthetic data by evaluating distribution, correlations, and - classification model performance. - """ - # Step 1: Evaluate distributions - distribution_results = evaluate_distribution(real_data, synthetic_data) - - # Step 2: Evaluate correlations - correlation_diff = evaluate_correlations(real_data, synthetic_data) - - # Aggregate results - diagnostics = { - 'distribution_results': distribution_results, - 'correlation_diff': correlation_diff - } - - return diagnostics \ No newline at end of file diff --git a/synthpop/method/helpers.py b/synthpop/method/helpers.py index e4c1c93..7f227e1 100644 --- a/synthpop/method/helpers.py +++ b/synthpop/method/helpers.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd from scipy.stats import mode, iqr @@ -15,31 +16,26 @@ def proper(X_df=None, y_df=None, random_state=None): def smooth(dtype, y_synth, y_real_min, y_real_max): + # Ensure y_synth is numeric (float) before proceeding. 
+ y_synth = np.asarray(y_synth, dtype=float) + indices = [True for _ in range(len(y_synth))] - # exclude from smoothing if freq for a single value higher than 70% + # Exclude from smoothing if frequency for a single value is higher than 70% y_synth_mode = mode(y_synth) if y_synth_mode.count / len(y_synth) > 0.7: indices = np.logical_and(indices, y_synth != y_synth_mode.mode) - # exclude from smoothing if data are top-coded - approximate check + # Exclude from smoothing if data are top-coded - approximate check y_synth_sorted = np.sort(y_synth) - top_coded = 10 * np.abs(y_synth_sorted[-2]) < np.abs(y_synth_sorted[-1]) - np.abs(y_synth_sorted[-2]) + top_coded = 10 * np.abs(y_synth_sorted[-2]) < np.abs(y_synth_sorted[-1] - y_synth_sorted[-2]) if top_coded: indices = np.logical_and(indices, y_synth != y_real_max) - # R version - # http://www.bagualu.net/wordpress/wp-content/uploads/2015/10/Modern_Applied_Statistics_With_S.pdf - # R default (ned0) - [link eq5.5 in p127] - this is used as the other one is not a closed formula - # R recommended (SJ) - [link eq5.6 in p129] - bw = 0.9 * len(y_synth[indices]) ** -1/5 * np.minimum(np.std(y_synth[indices]), iqr(y_synth[indices]) / 1.34) - - # # Python version - much slower as it's not a closed formula and requires a girdsearch - # bandwidths = 10 ** np.linspace(-1, 1, 10) - # grid = GridSearchCV(KernelDensity(kernel='gaussian'), {'bandwidth': bandwidths}, cv=3, iid=False) - # grid.fit(y_synth[indices, None]) - # bw = grid.best_estimator_.bandwidth + # Compute bandwidth using the provided formula + bw = 0.9 * len(y_synth[indices]) ** (-1/5) * np.minimum(np.std(y_synth[indices]), iqr(y_synth[indices]) / 1.34) + # Apply smoothing: for values flagged by indices, sample from a normal distribution y_synth[indices] = np.array([np.random.normal(loc=value, scale=bw) for value in y_synth[indices]]) if not top_coded: y_real_max += bw @@ -48,3 +44,173 @@ def smooth(dtype, y_synth, y_real_min, y_real_max): y_synth[indices] = y_synth[indices].astype(int) return y_synth + + + +def validate_numerical_distributions(numerical_distributions, metadata_columns): + """Validate ``numerical_distributions``. + + Raise an error if it's not None or dict, or if its columns are not present in the metadata. + + Args: + numerical_distributions (dict): + Dictionary that maps field names from the table that is being modeled with + the distribution that needs to be used. + metadata_columns (list): + Columns present in the metadata. + """ + if numerical_distributions: + if not isinstance(numerical_distributions, dict): + raise TypeError('numerical_distributions can only be None or a dict instance.') + + invalid_columns = numerical_distributions.keys() - set(metadata_columns) + if invalid_columns: + raise SynthesizerInputError( + 'Invalid column names found in the numerical_distributions dictionary ' + f'{invalid_columns}. The column names you provide must be present ' + 'in the metadata.' + ) + +def warn_missing_numerical_distributions(numerical_distributions, processed_data_columns): + """Raise an `UserWarning` when numerical distribution columns don't exist anymore.""" + unseen_columns = numerical_distributions.keys() - set(processed_data_columns) + for column in unseen_columns: + warnings.warn( + f"Cannot use distribution '{numerical_distributions[column]}' for column " + f"'{column}' because the column is not statistically modeled.", + UserWarning, + ) + +def flatten_array(nested, prefix=''): + """Flatten an array as a dict. 
+ + Args: + nested (list, numpy.array): + Iterable to flatten. + prefix (str): + Name to append to the array indices. Defaults to ``''``. + + Returns: + dict: + Flattened array. + """ + result = {} + for index in range(len(nested)): + prefix_key = '__'.join([prefix, str(index)]) if len(prefix) else str(index) + + value = nested[index] + if isinstance(value, (list, np.ndarray)): + result.update(flatten_array(value, prefix=prefix_key)) + + elif isinstance(value, dict): + result.update(flatten_dict(value, prefix=prefix_key)) + + else: + result[prefix_key] = value + + return result + + +def flatten_dict(nested, prefix=''): + """Flatten a dictionary. + + This method returns a flatten version of a dictionary, concatenating key names with + double underscores. + + Args: + nested (dict): + Original dictionary to flatten. + prefix (str): + Prefix to append to key name. Defaults to ``''``. + + Returns: + dict: + Flattened dictionary. + """ + result = {} + + for key, value in nested.items(): + prefix_key = '__'.join([prefix, str(key)]) if len(prefix) else key + + if key in IGNORED_DICT_KEYS and not isinstance(value, (dict, list)): + continue + + elif isinstance(value, dict): + result.update(flatten_dict(value, prefix_key)) + + elif isinstance(value, (np.ndarray, list)): + result.update(flatten_array(value, prefix_key)) + + else: + result[prefix_key] = value + + return result + +def unflatten_dict(flat): + """Transform a flattened dict into its original form. + + Args: + flat (dict): + Flattened dict. + + Returns: + dict: + Nested dict (if corresponds) + """ + unflattened = {} + + for key, value in sorted(flat.items(), key=_key_order): + if '__' in key: + key, subkey = key.split('__', 1) + subkey, name = subkey.rsplit('__', 1) + + if name.isdigit(): + column_index = int(name) + row_index = int(subkey) + + array = unflattened.setdefault(key, []) + + if len(array) == row_index: + row = [] + array.append(row) + elif len(array) == row_index + 1: + row = array[row_index] + else: + # This should never happen + raise ValueError('There was an error unflattening the extension.') + + if len(row) == column_index: + row.append(value) + else: + # This should never happen + raise ValueError('There was an error unflattening the extension.') + + else: + subdict = unflattened.setdefault(key, {}) + if subkey.isdigit() and key != 'univariates': + subkey = int(subkey) + + inner = subdict.setdefault(subkey, {}) + inner[name] = value + + else: + unflattened[key] = value + + return unflattened + + + +def extract_metadata(df: pd.DataFrame) -> dict: + """ + Extract metadata from a pandas DataFrame. + + Args: + df (pd.DataFrame): The input DataFrame. + + Returns: + dict: A dictionary where keys are column names and values are column types. 
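One caveat on helpers.py: `validate_numerical_distributions`, `warn_missing_numerical_distributions`, `flatten_dict`, and `unflatten_dict` reference `SynthesizerInputError`, `warnings`, `IGNORED_DICT_KEYS`, and `_key_order`, none of which this hunk imports or defines. A minimal sketch of definitions that would make the module importable, modeled on the SDV-style utilities these helpers appear to mirror (treat the exact names and values as assumptions):

import warnings

# Keys skipped while flattening copulas parameter dicts (assumed values).
IGNORED_DICT_KEYS = ['fitted', 'distribution', 'type']


class SynthesizerInputError(ValueError):
    """Raised when user-supplied synthesizer arguments are invalid."""


def _key_order(key_value):
    """Sort key for flattened keys: numeric path segments compare as integers."""
    parts = []
    for part in key_value[0].split('__'):
        if part.isdigit():
            part = int(part)
        parts.append(part)
    return parts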
+ """ + return {col: str(df[col].dtype) for col in df.columns} + + + diff --git a/synthpop/metrics/__init__.py b/synthpop/metrics/__init__.py new file mode 100644 index 0000000..b1599ca --- /dev/null +++ b/synthpop/metrics/__init__.py @@ -0,0 +1,29 @@ +# __init__.py + +from .diagnostic_report import MetricsReport +from .efficacy_metrics import EfficacyMetrics +from .privacy_metrics import DisclosureProtection +from .single_columns_metrics import ( + category_coverage, + range_coverage, + boundary_adherence, + category_adherence, + ks_complement, + tv_complement, + statistic_similarity, + missing_value_similarity +) + +__all__ = [ + "MetricsReport", + "EfficacyMetrics", + "DisclosureProtection", + "category_coverage", + "range_coverage", + "boundary_adherence", + "category_adherence", + "ks_complement", + "tv_complement", + "statistic_similarity", + "missing_value_similarity" +] diff --git a/synthpop/metrics/diagnostic_report.py b/synthpop/metrics/diagnostic_report.py new file mode 100644 index 0000000..8c0ce4f --- /dev/null +++ b/synthpop/metrics/diagnostic_report.py @@ -0,0 +1,107 @@ +# metrics_report.py + +import pandas as pd +import numpy as np +from .single_columns_metrics import ( + category_coverage, + range_coverage, + boundary_adherence, + category_adherence, + ks_complement, + tv_complement, + statistic_similarity, + missing_value_similarity +) + +class MetricsReport: + """ + A class to produce a report comparing real and synthetic datasets with respect + to data validity and data structure. + + The report computes the following metrics for each column: + - For numerical (or datetime/timedelta) columns: + * Range Coverage: Proportion of the real data's range covered by the synthetic data. + * Boundary Adherence: Fraction of synthetic values within the real data's min/max. + * KS Complement: 1 minus the Kolmogorov-Smirnov statistic. + * TV Complement: 1 minus the Total Variation distance computed over histograms. + * Statistic Similarity: Similarity of mean, std, and median. + * Missing Value Similarity: Similarity in the proportion of missing values. + - For categorical (or boolean) columns: + * Category Coverage: Proportion of real categories found in synthetic data. + * Category Adherence: Fraction of synthetic values that are valid real categories. + * Missing Value Similarity. + + Optionally, you may provide a metadata dictionary mapping column names to abstract types. + If metadata is not provided, the type is inferred from the pandas dtype. + """ + + def __init__(self, real_df: pd.DataFrame, synthetic_df: pd.DataFrame, metadata: dict = None): + """ + Args: + real_df (pd.DataFrame): The real dataset. + synthetic_df (pd.DataFrame): The synthetic dataset. + metadata (dict, optional): Mapping from column names to types (e.g., "numerical", + "categorical", "boolean", "datetime", "timedelta"). If not provided, types are inferred. + """ + self.real_df = real_df + self.synthetic_df = synthetic_df + # If no metadata is provided, infer types based on the dtype string. + if metadata is None: + metadata = {} + for col in real_df.columns: + dtype = str(real_df[col].dtype) + if "float" in dtype or "int" in dtype: + metadata[col] = "numerical" + elif "datetime" in dtype: + metadata[col] = "datetime" + elif "timedelta" in dtype: + metadata[col] = "timedelta" + elif "bool" in dtype: + metadata[col] = "boolean" + else: + metadata[col] = "categorical" + self.metadata = metadata + + def generate_report(self) -> pd.DataFrame: + """ + Generate a report comparing the real and synthetic datasets. 
+ + Returns: + pd.DataFrame: A DataFrame where each row corresponds to a column in the data and + contains computed metrics. Non-applicable metrics are marked as 'N/A'. + """ + report_data = [] + for col in self.real_df.columns: + col_type = self.metadata.get(col, "numerical") + real = self.real_df[col] + synthetic = self.synthetic_df[col] + col_report = {"column": col, "type": col_type} + + # Missing value similarity applies to all columns. + col_report["missing_value_similarity"] = missing_value_similarity(real, synthetic) + + # For numerical/datetime/timedelta columns, compute numerical metrics and mark categorical metrics as 'N/A' + if col_type in ["numerical", "datetime", "timedelta"]: + col_report["range_coverage"] = range_coverage(real, synthetic) + col_report["boundary_adherence"] = boundary_adherence(real, synthetic) + col_report["ks_complement"] = ks_complement(real, synthetic) + col_report["tv_complement"] = tv_complement(real, synthetic) + col_report["statistic_similarity"] = statistic_similarity(real, synthetic) + col_report["category_coverage"] = "N/A" + col_report["category_adherence"] = "N/A" + + # For categorical/boolean columns, compute categorical metrics and mark numerical metrics as 'N/A' + elif col_type in ["categorical", "boolean"]: + col_report["range_coverage"] = "N/A" + col_report["boundary_adherence"] = "N/A" + col_report["ks_complement"] = "N/A" + col_report["tv_complement"] = "N/A" + col_report["statistic_similarity"] = "N/A" + col_report["category_coverage"] = category_coverage(real, synthetic) + col_report["category_adherence"] = category_adherence(real, synthetic) + + else: + col_report["note"] = "Unknown type; metrics not computed" + + report_data.append(col_report) + return pd.DataFrame(report_data) diff --git a/synthpop/metrics/efficacy_metrics.py b/synthpop/metrics/efficacy_metrics.py new file mode 100644 index 0000000..c0edcd1 --- /dev/null +++ b/synthpop/metrics/efficacy_metrics.py @@ -0,0 +1,101 @@ +# efficacy_metrics.py + +import numpy as np +import pandas as pd +from sklearn.linear_model import LinearRegression +from sklearn.tree import DecisionTreeClassifier +from sklearn.metrics import ( + mean_squared_error, + mean_absolute_error, + r2_score, + accuracy_score, + f1_score +) +from sklearn.model_selection import train_test_split + +class EfficacyMetrics: + """ + A class to compute efficacy metrics comparing real and synthetic datasets + for downstream predictive tasks. The idea is to train a predictive model on + synthetic data and evaluate its performance on real data. The type of metrics + computed depends on the task: + + - For regression (when the target is numerical): + * Mean Squared Error (MSE) + * Mean Absolute Error (MAE) + * R^2 Score + + - For classification (when the target is categorical/boolean): + * Accuracy Score + * Weighted F1 Score + + Parameters + ---------- + task : str, optional (default='regression') + The predictive task type. Must be either 'regression' or 'classification'. + target_column : str + The name of the target column. Must exist in both real and synthetic data. + test_size : float, optional (default=0.3) + (Optional) Proportion of the real data to be used for testing. + (Note: In the default approach we train on all synthetic data and test on full real data.) + random_state : int, optional (default=42) + Random seed for reproducibility. 
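A short usage sketch of the MetricsReport above; the column names and toy data are illustrative, and metadata is inferred from the pandas dtypes when it is not passed in:

import numpy as np
import pandas as pd
from synthpop.metrics import MetricsReport

real = pd.DataFrame({
    "age": np.random.normal(40, 10, 500),
    "group": np.random.choice(["a", "b", "c"], 500),
})
synthetic = pd.DataFrame({
    "age": np.random.normal(41, 11, 500),
    "group": np.random.choice(["a", "b"], 500),
})

report = MetricsReport(real, synthetic).generate_report()
print(report[["column", "type", "ks_complement", "category_coverage"]])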
+ """ + + def __init__(self, task='regression', target_column=None, test_size=0.3, random_state=42): + if task not in ['regression', 'classification']: + raise ValueError("Task must be either 'regression' or 'classification'.") + if target_column is None: + raise ValueError("A target column must be specified.") + + self.task = task + self.target_column = target_column + self.test_size = test_size + self.random_state = random_state + + def evaluate(self, real_df: pd.DataFrame, synthetic_df: pd.DataFrame) -> dict: + """ + Evaluate the efficacy of synthetic data by training a model on synthetic data + and testing its performance on real data. + + Args: + real_df (pd.DataFrame): The real dataset. + synthetic_df (pd.DataFrame): The synthetic dataset. + + Returns: + dict: A dictionary of performance metrics. + """ + # Verify that the target column exists in both datasets. + if self.target_column not in real_df.columns or self.target_column not in synthetic_df.columns: + raise ValueError("The target column must exist in both real and synthetic datasets.") + + # Separate features and target. + X_syn = synthetic_df.drop(columns=[self.target_column]) + y_syn = synthetic_df[self.target_column] + X_real = real_df.drop(columns=[self.target_column]) + y_real = real_df[self.target_column] + + # For the purposes of efficacy metrics, we train on synthetic data and test on real data. + if self.task == 'regression': + model = LinearRegression() + model.fit(X_syn, y_syn) + predictions = model.predict(X_real) + mse = mean_squared_error(y_real, predictions) + mae = mean_absolute_error(y_real, predictions) + r2 = r2_score(y_real, predictions) + return { + "mse": mse, + "mae": mae, + "r2": r2 + } + else: # classification + model = DecisionTreeClassifier(random_state=self.random_state) + model.fit(X_syn, y_syn) + predictions = model.predict(X_real) + accuracy = accuracy_score(y_real, predictions) + f1 = f1_score(y_real, predictions, average='weighted') + return { + "accuracy": accuracy, + "f1_score": f1 + } + diff --git a/synthpop/metrics/privacy_metrics.py b/synthpop/metrics/privacy_metrics.py new file mode 100644 index 0000000..211c315 --- /dev/null +++ b/synthpop/metrics/privacy_metrics.py @@ -0,0 +1,86 @@ +# privacy_metrics.py + +import numpy as np +import pandas as pd +from sklearn.neighbors import NearestNeighbors + +class DisclosureProtection: + """ + A class to compute the disclosure protection metric for synthetic data. + + The metric is defined as 1 minus the proportion of synthetic records that are too similar + (i.e. within a risk threshold) to a record in the real dataset. + + Parameters + ---------- + real_data : pd.DataFrame + A DataFrame containing the real data. The data should be numeric or preprocessed. + synthetic_data : pd.DataFrame + A DataFrame containing the synthetic data (with the same columns as real_data). + threshold : float, optional + A distance threshold under which a synthetic record is considered a potential disclosure risk. + If not provided, it is computed as the 10th percentile of the nearest-neighbor distances among real records. + """ + + def __init__(self, real_data: pd.DataFrame, synthetic_data: pd.DataFrame, threshold: float = None): + self.real_data = real_data.copy() + self.synthetic_data = synthetic_data.copy() + self.threshold = threshold + self._compute_threshold() + + def _compute_threshold(self): + """ + Compute the threshold if not provided. Uses the 10th percentile of the nearest-neighbor + distances among real records (excluding self-distance). 
+ """ + if self.threshold is None: + # Fit a nearest neighbor model on the real data. + # n_neighbors=2 because the closest neighbor of a record is itself. + nn = NearestNeighbors(n_neighbors=2) + nn.fit(self.real_data) + distances, _ = nn.kneighbors(self.real_data) + # distances[:, 1] are the distances to the closest distinct record. + self.threshold = np.percentile(distances[:, 1], 10) + + def score(self) -> float: + """ + Compute the disclosure protection score. + + For each synthetic record, compute its distance to the nearest real record. + The risk rate is the proportion of synthetic records with distance below the threshold. + The disclosure protection score is 1 - risk_rate (higher is better). + + Returns + ------- + float + Disclosure protection score between 0 and 1. + """ + nn = NearestNeighbors(n_neighbors=1) + nn.fit(self.real_data) + distances, _ = nn.kneighbors(self.synthetic_data) + distances = distances.flatten() + risk_count = np.sum(distances < self.threshold) + risk_rate = risk_count / len(distances) + return 1 - risk_rate + + def report(self) -> dict: + """ + Generate a detailed report of the disclosure protection metric. + + Returns + ------- + dict + A dictionary containing the threshold, risk rate, and the final disclosure protection score. + """ + nn = NearestNeighbors(n_neighbors=1) + nn.fit(self.real_data) + distances, _ = nn.kneighbors(self.synthetic_data) + distances = distances.flatten() + risk_count = np.sum(distances < self.threshold) + risk_rate = risk_count / len(distances) + score = 1 - risk_rate + return { + "threshold": self.threshold, + "risk_rate": risk_rate, + "disclosure_protection_score": score + } diff --git a/synthpop/metrics/single_columns_metrics.py b/synthpop/metrics/single_columns_metrics.py new file mode 100644 index 0000000..38fa5ed --- /dev/null +++ b/synthpop/metrics/single_columns_metrics.py @@ -0,0 +1,253 @@ +# metrics.py + +import numpy as np +import pandas as pd +from scipy.stats import ks_2samp, iqr + +# ------------------------------------------------------------------------------ +# Coverage Metrics +# ------------------------------------------------------------------------------ + +def category_coverage(real: pd.Series, synthetic: pd.Series) -> float: + """ + Measure the proportion of categories present in the real data that are + also present in the synthetic data. + + Args: + real (pd.Series): Real (categorical) data. + synthetic (pd.Series): Synthetic (categorical) data. + + Returns: + float: Ratio (0 to 1) of real categories that are found in the synthetic data. + """ + real_cats = set(real.dropna().unique()) + synth_cats = set(synthetic.dropna().unique()) + if not real_cats: + return 1.0 + return len(real_cats.intersection(synth_cats)) / len(real_cats) + + +def range_coverage(real: pd.Series, synthetic: pd.Series) -> float: + """ + Measure the proportion of the real data's numerical range that is covered by the + synthetic data. If the data is datetime or timedelta, convert it to seconds. + + Args: + real (pd.Series): Real numerical data. + synthetic (pd.Series): Synthetic numerical data. + + Returns: + float: The ratio of the intersection length of the ranges to the real range. + """ + real_min, real_max = real.min(), real.max() + synth_min, synth_max = synthetic.min(), synthetic.max() + + # If the data is datetime, convert to seconds since epoch. 
+ if isinstance(real_min, pd.Timestamp): + real_min = real_min.value / 1e9 # convert nanoseconds to seconds + real_max = real_max.value / 1e9 + synth_min = synth_min.value / 1e9 + synth_max = synth_max.value / 1e9 + # If the data is timedelta, convert to total seconds. + elif isinstance(real_min, pd.Timedelta): + real_min = real_min.total_seconds() + real_max = real_max.total_seconds() + synth_min = synth_min.total_seconds() + synth_max = synth_max.total_seconds() + + if real_max == real_min: + return 1.0 + intersection = max(0, min(real_max, synth_max) - max(real_min, synth_min)) + return intersection / (real_max - real_min) + + +# ------------------------------------------------------------------------------ +# Adherence Metrics +# ------------------------------------------------------------------------------ + +def boundary_adherence(real: pd.Series, synthetic: pd.Series) -> float: + """ + Measure the fraction of synthetic numerical values that lie within the boundaries + of the real data. + + Args: + real (pd.Series): Real numerical data. + synthetic (pd.Series): Synthetic numerical data. + + Returns: + float: The fraction (0 to 1) of synthetic values within [real_min, real_max]. + """ + real_min, real_max = real.min(), real.max() + adherence = ((synthetic >= real_min) & (synthetic <= real_max)).mean() + return adherence + + +def category_adherence(real: pd.Series, synthetic: pd.Series) -> float: + """ + Measure the fraction of synthetic categorical values that are present in the set + of real categories. + + Args: + real (pd.Series): Real categorical data. + synthetic (pd.Series): Synthetic categorical data. + + Returns: + float: The fraction (0 to 1) of synthetic values that are among the real categories. + """ + real_cats = set(real.dropna().unique()) + if not real_cats: + return 1.0 + adherence = synthetic.dropna().apply(lambda x: x in real_cats).mean() + return adherence + +# ------------------------------------------------------------------------------ +# Distribution/Shape Comparison Metrics +# ------------------------------------------------------------------------------ + +def ks_complement(real: pd.Series, synthetic: pd.Series) -> float: + """ + Compute the complement of the Kolmogorov-Smirnov statistic comparing the + real and synthetic data distributions. + + Args: + real (pd.Series): Real numerical data. + synthetic (pd.Series): Synthetic numerical data. + + Returns: + float: 1 - KS statistic (ranges between 0 and 1, where 1 means identical distributions). + """ + real_clean = real.dropna() + synthetic_clean = synthetic.dropna() + if len(real_clean) == 0 or len(synthetic_clean) == 0: + return 0.0 + ks_stat, _ = ks_2samp(real_clean, synthetic_clean) + return 1 - ks_stat + + +def tv_complement(real: pd.Series, synthetic: pd.Series, bins: int = 10) -> float: + """ + Compute the complement of the Total Variation (TV) distance between the histograms + of the real and synthetic data. A value of 1 indicates identical distributions. + + If the data is datetime or timedelta, convert it to numeric values (in seconds). + + Args: + real (pd.Series): Real numerical data. + synthetic (pd.Series): Synthetic numerical data. + bins (int, optional): Number of bins to use for the histograms. Defaults to 10. + + Returns: + float: 1 - TV distance, where TV is computed over the normalized histograms. 
+ """ + real_clean = real.dropna() + synthetic_clean = synthetic.dropna() + + if len(real_clean) == 0 or len(synthetic_clean) == 0: + return 0.0 + + # Convert datetime/timedelta to numeric values if necessary. + if np.issubdtype(real_clean.dtype, np.datetime64): + # Convert to seconds since epoch + real_clean = real_clean.astype('int64') / 1e9 + synthetic_clean = synthetic_clean.astype('int64') / 1e9 + elif np.issubdtype(real_clean.dtype, np.timedelta64): + # Convert to total seconds + if hasattr(real_clean, 'dt'): + real_clean = real_clean.dt.total_seconds() + synthetic_clean = synthetic_clean.dt.total_seconds() + else: + real_clean = real_clean.astype('int64') / 1e9 + synthetic_clean = synthetic_clean.astype('int64') / 1e9 + + all_data = pd.concat([real_clean, synthetic_clean]) + bin_edges = np.histogram_bin_edges(all_data, bins=bins) + real_hist, _ = np.histogram(real_clean, bins=bin_edges, density=True) + synth_hist, _ = np.histogram(synthetic_clean, bins=bin_edges, density=True) + + # Normalize the histograms + real_hist = real_hist / np.sum(real_hist) + synth_hist = synth_hist / np.sum(synth_hist) + + tv_distance = 0.5 * np.sum(np.abs(real_hist - synth_hist)) + return 1 - tv_distance + + +# ------------------------------------------------------------------------------ +# Statistical Similarity Metrics +# ------------------------------------------------------------------------------ + +def statistic_similarity(real: pd.Series, synthetic: pd.Series) -> float: + """ + Compare basic statistics (mean, standard deviation, and median) of the real and + synthetic data and return an average similarity score between 0 and 1 (1 means perfect similarity). + + If the data is datetime or timedelta, it is converted to a numeric representation (seconds). + + Args: + real (pd.Series): Real data. + synthetic (pd.Series): Synthetic data. + + Returns: + float: Similarity score between 0 and 1. + """ + real_clean = real.dropna() + synthetic_clean = synthetic.dropna() + if len(real_clean) == 0 or len(synthetic_clean) == 0: + return 0.0 + + eps = 1e-8 # small constant to avoid division by zero + + # Convert datetime/timedelta to numeric values (in seconds) + if np.issubdtype(real_clean.dtype, np.datetime64): + real_vals = real_clean.astype('int64') / 1e9 + synth_vals = synthetic_clean.astype('int64') / 1e9 + elif np.issubdtype(real_clean.dtype, np.timedelta64): + # Use the .dt accessor if available + if hasattr(real_clean, 'dt'): + real_vals = real_clean.dt.total_seconds() + synth_vals = synthetic_clean.dt.total_seconds() + else: + real_vals = real_clean.astype('int64') / 1e9 + synth_vals = synthetic_clean.astype('int64') / 1e9 + else: + real_vals = real_clean + synth_vals = synthetic_clean + + stats = ['mean', 'std', 'median'] + real_stats = { + 'mean': real_vals.mean(), + 'std': real_vals.std(), + 'median': real_vals.median() + } + synth_stats = { + 'mean': synth_vals.mean(), + 'std': synth_vals.std(), + 'median': synth_vals.median() + } + + similarities = [] + for stat in stats: + diff = abs(real_stats[stat] - synth_stats[stat]) + denom = abs(real_stats[stat]) + eps + sim = 1 - (diff / denom) + sim = max(0, min(1, sim)) + similarities.append(sim) + return np.mean(similarities) + + + +def missing_value_similarity(real: pd.Series, synthetic: pd.Series) -> float: + """ + Compare the proportion of missing values (NaNs) in the real and synthetic data. + + Args: + real (pd.Series): Real data. + synthetic (pd.Series): Synthetic data. 
+ + Returns: + float: 1 minus the absolute difference in missing value proportions (ranges from 0 to 1). + """ + real_missing = real.isna().mean() + synth_missing = synthetic.isna().mean() + return 1 - abs(real_missing - synth_missing) + diff --git a/synthpop/processor/__init__.py b/synthpop/processor/__init__.py index 3ef0883..b486b97 100644 --- a/synthpop/processor/__init__.py +++ b/synthpop/processor/__init__.py @@ -1,2 +1,4 @@ -from synthpop.processor.processor import Processor -from synthpop.processor.processor import NAN_KEY, NUMTOCAT_KEY +from synthpop.processor.data_processor import DataProcessor +from synthpop.processor.missing_data_handler import MissingDataHandler + +__all__ = ['DataProcessor', 'MissingDataHandler'] diff --git a/synthpop/processor/data_processor.py b/synthpop/processor/data_processor.py new file mode 100644 index 0000000..34e5a39 --- /dev/null +++ b/synthpop/processor/data_processor.py @@ -0,0 +1,141 @@ +import pandas as pd +import numpy as np +import warnings +import logging +from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler + +# Set up logging +LOGGER = logging.getLogger(__name__) + +class InvalidDataError(Exception): + """Custom exception for invalid data errors.""" + pass + +class DataProcessor: + """Preprocess and post-process data before and after synthetic data generation. + + Handles: + - Type conversions (categorical ↔ numerical). + - Feature transformations for Gaussian Copula. + - Reverse transformations to restore original data types. + """ + + def __init__(self, metadata, enforce_rounding=True, enforce_min_max_values=True, model_kwargs=None, table_name=None, locales=['en_US']): + self.metadata = metadata + self.enforce_rounding = enforce_rounding + self.enforce_min_max_values = enforce_min_max_values + self.model_kwargs = model_kwargs or {} + self.table_name = table_name + self.locales = locales + self._fitted = False + self._prepared_for_fitting = False + self.encoders = {} # Stores encoders for categorical columns + self.scalers = {} # Stores scalers for numerical columns + self.original_columns = None # To restore column order + + def preprocess(self, data: pd.DataFrame) -> pd.DataFrame: + """Transform the raw data into numerical space.""" + if self._fitted: + warnings.warn( + "This model has already been fitted. To use new preprocessed data, " + "please refit the model using 'fit'." 
+ ) + + self.validate(data) + self.original_columns = data.columns # Store original column order + processed_data = self._preprocess(data) + + return processed_data + + def _preprocess(self, data: pd.DataFrame) -> pd.DataFrame: + """Handles encoding, scaling.""" + data = data.copy() + + for col, dtype in self.metadata.items(): + if dtype == "categorical": + # Use Label Encoding for small categories, OneHot for larger + encoder = LabelEncoder() if len(data[col].unique()) < 10 else OneHotEncoder(sparse=False, drop="first") + transformed_data = self._encode_categorical(data[col], encoder) + self.encoders[col] = encoder + data.drop(columns=[col], inplace=True) + data = pd.concat([data, transformed_data], axis=1) + + elif dtype == "numerical": + scaler = StandardScaler() + data[col] = scaler.fit_transform(data[[col]]) + self.scalers[col] = scaler + + elif dtype == "boolean": + data[col] = data[col].astype(int) # Convert True/False to 1/0 + + elif dtype == "datetime": + data[col] = data[col].apply(lambda x: x.timestamp() if pd.notnull(x) else np.nan) # Convert to Unix timestamp + + elif dtype == "timedelta": + data[col] = pd.to_timedelta(data[col]).dt.total_seconds() + + return data + + def postprocess(self, synthetic_data: pd.DataFrame) -> pd.DataFrame: + """Transform numerical synthetic data back to its original format.""" + synthetic_data = synthetic_data.copy() + + for col, dtype in self.metadata.items(): + if dtype == "categorical" and col in self.encoders: + encoder = self.encoders[col] + synthetic_data[col] = self._decode_categorical(synthetic_data[col], encoder) + + elif dtype == "numerical" and col in self.scalers: + scaler = self.scalers[col] + synthetic_data[col] = scaler.inverse_transform(synthetic_data[[col]]) + + elif dtype == "boolean": + synthetic_data[col] = synthetic_data[col].round().astype(bool) + + elif dtype == "datetime": + synthetic_data[col] = pd.to_datetime(synthetic_data[col], unit='s') + + elif dtype == "timedelta": + synthetic_data[col] = pd.to_timedelta(synthetic_data[col], unit='s') + + return synthetic_data[self.original_columns] # Restore original column order + + def validate(self, data: pd.DataFrame): + """Validate input data.""" + if not isinstance(data, pd.DataFrame): + raise ValueError("Input data must be a pandas DataFrame.") + + missing_columns = set(self.metadata.keys()) - set(data.columns) + if missing_columns: + raise InvalidDataError(f"Missing columns: {missing_columns}") + + primary_keys = [col for col, dtype in self.metadata.items() if dtype == "primary_key"] + for key in primary_keys: + if data[key].duplicated().any(): + raise InvalidDataError(f"Primary key '{key}' is not unique.") + + def _encode_categorical(self, series: pd.Series, encoder): + """Encode categorical columns.""" + if isinstance(encoder, LabelEncoder): + return pd.DataFrame(encoder.fit_transform(series), columns=[series.name]) + elif isinstance(encoder, OneHotEncoder): + encoded_array = encoder.fit_transform(series.values.reshape(-1, 1)) + encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out([series.name])) + return encoded_df + + def _decode_categorical(self, series: pd.Series, encoder): + """Decode categorical columns.""" + if isinstance(encoder, LabelEncoder): + return encoder.inverse_transform(series.astype(int)) + elif isinstance(encoder, OneHotEncoder): + category_index = np.argmax(series.values, axis=1) + return encoder.categories_[0][category_index] + + def _handle_missing_values(self, series: pd.Series): + """Handle missing values based on column 
type.""" + if series.dtype in ["float64", "int64"]: + return series.fillna(series.median()) + elif series.dtype == "object": + return series.fillna(series.mode()[0]) + else: + return series.fillna(0) diff --git a/synthpop/processor/missing_data_handler.py b/synthpop/processor/missing_data_handler.py new file mode 100644 index 0000000..1f03885 --- /dev/null +++ b/synthpop/processor/missing_data_handler.py @@ -0,0 +1,277 @@ +import numpy as np +import pandas as pd +import scipy.stats as stats +from sklearn.experimental import enable_iterative_imputer # For MICE and EM +from sklearn.impute import SimpleImputer, IterativeImputer +from sklearn.linear_model import LogisticRegression +from sklearn.preprocessing import LabelEncoder +import warnings + + +class MissingDataHandler: + """Detects missingness type (MCAR, MAR, MNAR) and applies automatic imputation.""" + + def __init__(self): + self.imputers = {} + + @staticmethod + def get_column_dtypes(data) -> dict: + """ + Returns a dictionary mapping column names to abstract data types + that are compatible with the processor. + + The mapping is as follows: + - float64, float32, int64, int32 -> "numerical" + - bool -> "boolean" + - datetime64[...] -> "datetime" + - timedelta64[...] -> "timedelta" + - All others (e.g., object) -> "categorical" + """ + def map_dtype(dtype: str) -> str: + if dtype in ['float64', 'float32', 'int64', 'int32']: + return "numerical" + elif dtype == 'bool': + return "boolean" + elif 'datetime' in dtype: + return "datetime" + elif 'timedelta' in dtype: + return "timedelta" + else: + return "categorical" + + if isinstance(data, pd.DataFrame): + return {col: map_dtype(str(dtype)) for col, dtype in data.dtypes.items()} + elif isinstance(data, np.ndarray) and data.dtype.names is not None: + return {name: map_dtype(str(data.dtype.fields[name][0])) for name in data.dtype.names} + else: + raise TypeError("Data must be a pandas DataFrame or a structured numpy array.") + + def encode_predictors( + self, df: pd.DataFrame, drop_cols: list = None + ) -> pd.DataFrame: + """ + Encodes all columns in the DataFrame so that they are numeric. + Optionally, drops specified columns (e.g., the target column). + + Steps: + 1. Extract numeric columns. + 2. Convert datetime columns to Unix timestamp (numeric). + 3. Convert timedelta columns to total seconds as float. + 4. For categorical columns (object, category), create dummy variables. + 5. For boolean columns, convert to int (0/1). + 6. Concatenate everything and fill any remaining NaNs with each column's median. + + Args: + df (pd.DataFrame): Input DataFrame. + drop_cols (list): List of column names to drop (optional). + + Returns: + pd.DataFrame: DataFrame with only numeric values and no missing entries. + """ + df_work = df.copy() + if drop_cols is not None: + df_work = df_work.drop(columns=drop_cols) + + # 1. Extract numeric columns. + num_df = df_work.select_dtypes(include=[np.number], exclude = ["timedelta64[ns]"]).copy() + + # 2. Convert datetime columns to Unix timestamp (numeric). + datetime_cols = df_work.select_dtypes(include=["datetime64[ns]"]) + if not datetime_cols.empty: + datetime_numeric = datetime_cols.apply( + lambda col: col.astype(np.int64) // 10**9 + ) + num_df = pd.concat([num_df, datetime_numeric], axis=1) + + # 3. Convert timedelta columns to total seconds (as float). 
+ timedelta_cols = df_work.select_dtypes(include=["timedelta64[ns]"]) + if not timedelta_cols.empty: + timedelta_numeric = pd.DataFrame({ + col: timedelta_cols[col].dt.total_seconds() for col in timedelta_cols.columns + }, index=df_work.index) + num_df = pd.concat([num_df, timedelta_numeric], axis=1) + + + # 4. Encode categorical columns using get_dummies. + cat_df = df_work.select_dtypes(include=["object", "category"]) + if not cat_df.empty: + dummies = pd.get_dummies(cat_df, drop_first=True) + else: + dummies = pd.DataFrame(index=df_work.index) + + # 5. Handle boolean columns: convert them to int explicitly. + bool_df = df_work.select_dtypes(include=["bool"]).astype(int) + + # 6. Concatenate all predictors and fill any remaining missing values with the median. + result_df = pd.concat([num_df, dummies, bool_df], axis=1) + result_df = result_df.apply(lambda col: col.fillna(0), axis=0) + return result_df + + def detect_missingness(self, dfc: pd.DataFrame) -> dict: + """Detects missingness type for each column, handling multiple data types.""" + df = dfc.copy() + missingness = {} + for col in df.columns: + missing_values = df[col].isna().sum() + if missing_values == 0: + continue # No missing values → Skip detection + col_type = df[col].dtype + + # **Categorical Data Handling (object, category)** + if col_type == "object" or df[col].nunique() < 10: + observed_counts = df[col].dropna().value_counts() + if len(observed_counts) > 1: + _, p_value = stats.chisquare(observed_counts) + if p_value > 0.05: + missingness[col] = "MCAR" + continue + + missing_mask = df[col].isna().astype(int) + # Use our helper to encode all predictors (drop the target col) + encoded_data = self.encode_predictors(df, drop_cols=[col]) + model = LogisticRegression() + model.fit(encoded_data, missing_mask) + if model.score(encoded_data, missing_mask) > 0.6: + missingness[col] = "MAR" + continue + missingness[col] = "MNAR" + continue + + # **Numerical Data Handling (int, float)** + elif np.issubdtype(col_type, np.number): + _, p_value = stats.shapiro(df[col].dropna()) + if p_value > 0.05: + missingness[col] = "MCAR" + continue + missing_mask = df[col].isna().astype(int) + observed_data = self.encode_predictors(df, drop_cols=[col]) + model = LogisticRegression() + model.fit(observed_data, missing_mask) + if model.score(observed_data, missing_mask) > 0.6: + missingness[col] = "MAR" + continue + observed_values = df[col].dropna() + missing_rows = df[col].isna() + if missing_rows.sum() > 0: + encoded_missing_vals = self.encode_predictors(df.loc[missing_rows, df.columns != col]) + missing_vals = encoded_missing_vals.mean(axis=1) + _, p_value = stats.ks_2samp(observed_values, missing_vals) + if p_value < 0.05: + missingness[col] = "MNAR" + continue + missingness[col] = "MAR" + continue + + # **Boolean Data Handling (bool)** + elif np.issubdtype(col_type, np.bool_): + bool_as_int = df[col].astype(float) + _, p_value = stats.chisquare(bool_as_int.value_counts()) + if p_value > 0.05: + missingness[col] = "MCAR" + continue + missingness[col] = "MNAR" + continue + + # **Datetime Handling (datetime64)** + elif np.issubdtype(col_type, np.datetime64): + timestamps = df[col].dropna().astype(int) // 10**9 + _, p_value = stats.shapiro(timestamps) + if p_value > 0.05: + missingness[col] = "MCAR" + continue + missing_mask = df[col].isna().astype(int) + observed_data = self.encode_predictors(df, drop_cols=[col]) + model = LogisticRegression() + model.fit(observed_data, missing_mask) + if model.score(observed_data, missing_mask) > 0.6: + 
missingness[col] = "MAR" + continue + missingness[col] = "MNAR" + continue + + # **Timedelta Handling (timedelta64)** + elif np.issubdtype(col_type, np.timedelta64): + durations = df[col].dropna().dt.total_seconds() + _, p_value = stats.shapiro(durations) + if p_value > 0.05: + missingness[col] = "MCAR" + continue + missingness[col] = "MNAR" + continue + + return missingness + + def apply_imputation(self, df: pd.DataFrame, missingness: dict) -> pd.DataFrame: + """Automatically applies imputation based on missingness type and column data type.""" + df = df.copy() + for col, mtype in missingness.items(): + if df[col].isna().sum() == 0: + continue + + # --- Categorical Data (object, category or few unique values) --- + if ( + pd.api.types.is_object_dtype(df[col]) + or pd.api.types.is_categorical_dtype(df[col]) + or (df[col].nunique() < 10) + ): + if mtype == "MCAR": + df[col].fillna(df[col].mode()[0], inplace=True) + elif mtype == "MAR": + # Use get_dummies encoding for categorical data + dummies = pd.get_dummies(df[col], prefix=col, dummy_na=True) + imputer = IterativeImputer(random_state=42) + imputed = imputer.fit_transform(dummies) + imputed_rounded = np.rint(imputed).astype(int) + imputed_df = pd.DataFrame( + imputed_rounded, columns=dummies.columns, index=df.index + ) + # Convert back to a single categorical column by taking the column with the maximum value. + predicted_category = imputed_df.idxmax(axis=1) + df[col] = predicted_category.str.split(f"{col}_").str[-1] + elif mtype == "MNAR": + df[col].fillna("Missing", inplace=True) + + # --- Numerical Data --- + elif pd.api.types.is_numeric_dtype(df[col]): + if mtype == "MCAR": + imputer = SimpleImputer(strategy="mean") + df[col] = imputer.fit_transform(df[[col]]).ravel() + elif mtype in ["MAR", "MNAR"]: + imputer = IterativeImputer(random_state=42) + df[col] = imputer.fit_transform(df[[col]]).ravel() + + # --- Boolean Data --- + elif pd.api.types.is_bool_dtype(df[col]): + if mtype == "MCAR": + df[col].fillna(df[col].mode()[0], inplace=True) + elif mtype in ["MAR", "MNAR"]: + numeric_vals = df[col].astype(float) + imputer = IterativeImputer(random_state=42) + imputed = imputer.fit_transform(numeric_vals.values.reshape(-1, 1)) + df[col] = np.rint(imputed).astype(bool).flatten() + + # --- Datetime Data --- + elif pd.api.types.is_datetime64_any_dtype(df[col]): + print("entering here") + numeric_series = df[col].apply(lambda x: x.timestamp() if pd.notnull(x) else np.nan) + if mtype == "MCAR": + imputer = SimpleImputer(strategy="median") + elif mtype in ["MAR", "MNAR"]: + imputer = IterativeImputer(random_state=42) + imputed_numeric = imputer.fit_transform( + numeric_series.values.reshape(-1, 1) + ) + df[col] = pd.to_datetime(imputed_numeric.flatten(), unit='s') + + # --- Timedelta Data --- + elif pd.api.types.is_timedelta64_dtype(df[col]): + numeric_series = df[col].apply(lambda x: x.total_seconds() if pd.notnull(x) else np.nan).values.reshape(-1, 1) + if mtype == "MCAR": + imputer = SimpleImputer(strategy="median" ) + elif mtype in ["MAR", "MNAR"]: + imputer = IterativeImputer(random_state=42) + imputed_numeric = imputer.fit_transform(numeric_series) + df[col] = pd.to_timedelta(imputed_numeric.flatten(), unit="s") + else: + df[col].fillna(df[col].mode()[0], inplace=True) + return df diff --git a/synthpop/processor/processor.py b/synthpop/processor/processor.py deleted file mode 100644 index b2065c3..0000000 --- a/synthpop/processor/processor.py +++ /dev/null @@ -1,122 +0,0 @@ -import numpy as np -import pandas as pd - -# global variables 
-from synthpop import NUM_COLS_DTYPES, CAT_COLS_DTYPES - -NAN_KEY = 'nan' -NUMTOCAT_KEY = 'numtocat' - - -class Processor: - def __init__(self, spop): - self.spop = spop - self.processing_dict = {NUMTOCAT_KEY: {}, - NAN_KEY: {} - } - - - def preprocess(self, df, dtypes): - for col in self.spop.visited_columns: - col_nan_indices = df[col].isna() - cont_nan_indices = {v: df[col] == v for v in self.spop.cont_na.get(col, [])} - col_nan_series = [(np.nan, col_nan_indices)] + list(cont_nan_indices.items()) - - col_all_nan_indices = pd.DataFrame({index: value[1] for index, value in enumerate(col_nan_series)}).max(axis=1) - col_not_nan_indices = np.invert(col_all_nan_indices) - - # transform numerical columns in numtocat to categorical - if col in self.spop.numtocat: - self.processing_dict[NUMTOCAT_KEY][col] = {'dtype': self.spop.df_dtypes[col], - 'categories': {} - } - - # Dealing With Non-NaN Values - not_nan_values = df.loc[col_not_nan_indices, col].copy() - df.loc[col_not_nan_indices, col] = pd.cut(df.loc[col_not_nan_indices, col], self.spop.catgroups[col], labels=range(self.spop.catgroups[col]), include_lowest=True) - - grouped = pd.DataFrame({'grouped': df.loc[col_not_nan_indices, col], 'real': not_nan_values}).groupby('grouped') - self.processing_dict[NUMTOCAT_KEY][col]['categories'] = grouped['real'].apply(np.array).to_dict() - - # Dealing with NaN - for index, (_, bool_series) in enumerate(col_nan_series): - nan_cat = self.spop.catgroups[col] + index - self.processing_dict[NUMTOCAT_KEY][col]['categories'][nan_cat] = df.loc[bool_series, col].to_numpy() - df.loc[bool_series, col] = nan_cat - - df[col] = df[col].astype('category') - self.spop.df_dtypes[col] = 'category' - - else: - # NaNs in category columns - # need to process NaNs only as all other categories will be taken care automatically - if self.spop.df_dtypes[col] in CAT_COLS_DTYPES: - if col_nan_indices.any(): - # TODO beware of 'NaN_category' naming - col_nan_category = 'NaN_category' - self.processing_dict[NAN_KEY][col] = {'dtype': self.spop.df_dtypes[col], - 'nan_value': col_nan_category - } - - df[col] = df[col].cat.add_categories(col_nan_category) #argument 'inplace' is deprecated and removed - df[col].fillna(col_nan_category, inplace=True) - - # NaNs in numerical columns - elif self.spop.df_dtypes[col] in NUM_COLS_DTYPES: - if col_all_nan_indices.any(): - # insert new column in df - # TODO beware of '_NaN' naming - col_nan_name = col + '_NaN' - df.insert(df.columns.get_loc(col), col_nan_name, 0) #inserts columName_NaN in dataframe - - self.processing_dict[NAN_KEY][col] = {'col_nan_name': col_nan_name, - 'dtype': self.spop.df_dtypes[col], - 'nan_flags': {} - } - - for index, (cat, bool_series) in enumerate(col_nan_series): - cat_index = index + 1 - self.processing_dict[NAN_KEY][col]['nan_flags'][cat_index] = cat - df.loc[bool_series, col_nan_name] = cat_index - df.loc[col_all_nan_indices, col] = 0 - - df.loc[:,col_nan_name] = df[col_nan_name].astype('category') - self.spop.df_dtypes[col_nan_name] = 'category' - - return df - - def postprocess(self, synth_df): - #sex_NaN is not a column of synth_df - for col, processing_numtocat_col_dict in self.processing_dict[NUMTOCAT_KEY].items(): - synth_df[col] = synth_df[col].astype(object) - col_synth_df = synth_df[col].copy() - - for category, category_values in processing_numtocat_col_dict['categories'].items(): - category_indices = col_synth_df == category - synth_df.loc[category_indices, col] = np.random.choice(category_values, size=category_indices.sum(), replace=True) - - # cast 
dtype back to original (float for int column with NaNs) - if synth_df[col].isna().any() and processing_numtocat_col_dict['dtype'] == 'int': - synth_df[col] = synth_df[col].astype(float) - else: - synth_df[col] = synth_df[col].astype(processing_numtocat_col_dict['dtype']) - # self.spop.df_dtypes[col] = processing_numtocat_col_dict['dtype'] - - for col, processing_nan_col_dict in self.processing_dict[NAN_KEY].items(): - # NaNs in category columns - # need to postprocess NaNs only all other categories will be taken care automatically - if processing_nan_col_dict['dtype'] in CAT_COLS_DTYPES: - col_nan_value = processing_nan_col_dict['nan_value'] - synth_df[col] = synth_df[col].astype(object) - synth_df.loc[synth_df[col] == col_nan_value, col] = np.nan - synth_df[col] = synth_df[col].astype('category') - - # NaNs in numerical columns - #The code below sets changes NANs in numerical columns to a given value, and removes the NAN indicator column. - elif processing_nan_col_dict['dtype'] in NUM_COLS_DTYPES: - for col_nan_flag, col_nan_value in processing_nan_col_dict['nan_flags'].items(): - nan_flag_indices = synth_df[processing_nan_col_dict['col_nan_name']] == col_nan_flag #expects columnName_NAN in the synthetic data set - synth_df.loc[nan_flag_indices, col] = col_nan_value - synth_df.drop(columns=processing_nan_col_dict['col_nan_name'], inplace=True) - - return synth_df diff --git a/synthpop/synthpop.py b/synthpop/synthpop.py deleted file mode 100644 index 03a4051..0000000 --- a/synthpop/synthpop.py +++ /dev/null @@ -1,203 +0,0 @@ -import numpy as np -import pandas as pd - -# classes -from synthpop.validator import Validator -from synthpop.processor import Processor -# global variables -from synthpop import NUM_COLS_DTYPES -from synthpop.processor import NAN_KEY -from synthpop.method import CART_METHOD, GC_METHOD, METHODS_MAP, NA_METHODS - - -class Synthpop: - def __init__(self, - method=None, - visit_sequence=None, - # predictor_matrix=None, - proper=False, - cont_na=None, - smoothing=False, - default_method=CART_METHOD, - numtocat=None, - catgroups=None, - seed=None): - # initialise the validator and processor - self.validator = Validator(self) - self.processor = Processor(self) - - # initialise arguments - self.method = method - self.visit_sequence = visit_sequence - self.predictor_matrix = None - self.proper = proper - self.cont_na = cont_na - self.smoothing = smoothing - self.default_method = default_method - self.numtocat = numtocat - self.catgroups = catgroups - self.seed = seed - self.map_column_to_NaN_column = {} - # check init - self.validator.check_init() - - def include_nan_columns(self): - for (col,nan_col) in self.map_column_to_NaN_column.items(): - if col not in self.visit_sequence: - continue - - index_of_col = self.visit_sequence.index(col) - self.visit_sequence.insert(index_of_col,nan_col) - - def pre_preprocess(self,df,dtypes,nan_fill): - for column in df: - if dtypes[column] != 'float': - continue - maybe_nans = df[column].isnull() - if not maybe_nans.any(): - continue - - df.loc[maybe_nans,column] = nan_fill - - nan_col_name = column+"_NaN" - df.loc[:,nan_col_name] = maybe_nans - self.map_column_to_NaN_column[column] = nan_col_name - - dtypes[nan_col_name] = 'category' - - return df,dtypes - - def post_postprocessing(self,syn_df): - for column in syn_df: - if column in self.map_column_to_NaN_column.keys(): - nan_col_name = self.map_column_to_NaN_column[column] - column_NaN_at = syn_df[nan_col_name] - syn_df.loc[column_NaN_at,column] = None - syn_df = 
syn_df.drop(columns=nan_col_name) - - return syn_df - - def _infer_dtypes(self, df): - """Automatically infer data types from DataFrame. - - Args: - df: pandas DataFrame - - Returns: - dict: Mapping of column names to inferred types ('int', 'float', 'datetime', 'category', 'bool') - """ - dtypes = {} - for column in df.columns: - pd_dtype = str(df[column].dtype) - - if pd_dtype.startswith('int'): - dtypes[column] = 'int' - elif pd_dtype.startswith('float'): - dtypes[column] = 'float' - elif pd_dtype.startswith('datetime'): - dtypes[column] = 'datetime' - elif pd_dtype.startswith('bool'): - dtypes[column] = 'bool' - else: - # For object or string dtypes, check if it should be categorical - dtypes[column] = 'category' - - return dtypes - - def fit(self, df, dtypes=None): - """Fit the synthetic data generator. - - Args: - df: pandas DataFrame to learn from - dtypes: Optional dict mapping column names to types. If not provided, types will be inferred. - """ - # Infer dtypes if not provided - if dtypes is None: - dtypes = self._infer_dtypes(df) - - # Validate DataFrame - if not df.columns.is_unique: - raise ValueError("DataFrame column names must be unique") - - df,dtypes = self.pre_preprocess(df,dtypes,-8) - - self.df_columns = df.columns.tolist() - # Only set visit_sequence if not provided in init - if self.visit_sequence is None: - self.visit_sequence = df.columns.tolist() - elif isinstance(self.visit_sequence, list) and all(isinstance(x, int) for x in self.visit_sequence): - # Convert numeric indices to column names - self.visit_sequence = [df.columns[i] for i in self.visit_sequence] - - self.include_nan_columns() - self.n_df_rows, self.n_df_columns = np.shape(df) - self.df_dtypes = dtypes - - # check processor - self.validator.check_processor() - # preprocess - - #processor.preprocess has side effects on the processor object and on this (self) object - #processor.processing_dict[NAN_KEY][col] - #spop.df_dtypes[col_nan_name] - processed_df = self.processor.preprocess(df, self.df_dtypes) - print(processed_df) - self.processed_df_columns = processed_df.columns.tolist() - self.n_processed_df_columns = len(self.processed_df_columns) - - # check fit - self.validator.check_fit() - # fit - self._fit(processed_df) - - def _fit(self, df): - self.saved_methods = {} - - # train - self.predictor_matrix_columns = self.predictor_matrix.columns.to_numpy() - for col, visit_step in self.visit_sequence.sort_values().items(): - print('train_{}'.format(col)) - - # initialise the method - col_method = METHODS_MAP[self.method[col]](dtype=self.df_dtypes[col], smoothing=self.smoothing[col], proper=self.proper, random_state=self.seed) - # fit the method - col_predictors = self.predictor_matrix_columns[self.predictor_matrix.loc[col].to_numpy() == 1] - col_method.fit(X_df=df[col_predictors], y_df=df[col]) - # save the method - self.saved_methods[col] = col_method - - def generate(self, k=None): - self.k = k - - # check generate - self.validator.check_generate() - # generate - synth_df = self._generate() - # postprocess - processed_synth_df = self.processor.postprocess(synth_df) - - return self.post_postprocessing(processed_synth_df) - - def _generate(self): - # Only generate columns that were in the visit sequence - synth_df = pd.DataFrame(data=np.zeros([self.k, len(self.visit_sequence)]), columns=self.visit_sequence.index) - - for col, visit_step in self.visit_sequence.sort_values().items(): - print('generate_{}'.format(col)) - - # reload the method - col_method = self.saved_methods[col] - # predict with the method 
- col_predictors = self.predictor_matrix_columns[self.predictor_matrix.loc[col].to_numpy() == 1] - synth_df[col] = col_method.predict(synth_df[col_predictors]) - - # change all missing values to 0 - if col in self.processor.processing_dict[NAN_KEY] and self.df_dtypes[col] in NUM_COLS_DTYPES and self.method[col] in NA_METHODS: - nan_indices = synth_df[self.processor.processing_dict[NAN_KEY][col]['col_nan_name']] != 0 - synth_df.loc[nan_indices, col] = 0 - - # map dtype to original dtype (only excpetion if column is full of NaNs) - if synth_df[col].notna().any(): - synth_df[col] = synth_df[col].astype(self.df_dtypes[col]) - - return synth_df \ No newline at end of file diff --git a/synthpop/validator/__init__.py b/synthpop/validator/__init__.py index 3143690..d55577d 100644 --- a/synthpop/validator/__init__.py +++ b/synthpop/validator/__init__.py @@ -1 +1,5 @@ -from synthpop.validator.validator import Validator +from .validator import Validator + +__all__ = [ + "Validator", +] \ No newline at end of file diff --git a/synthpop/validator/validator.py b/synthpop/validator/validator.py index aa71005..f54852f 100644 --- a/synthpop/validator/validator.py +++ b/synthpop/validator/validator.py @@ -1,322 +1,5 @@ -import numpy as np -import pandas as pd - -# global variables -from synthpop import NUM_COLS_DTYPES -from synthpop.method import EMPTY_METHOD, SAMPLE_METHOD -from synthpop.method import DEFAULT_METHODS_MAP, INIT_METHODS_MAP, CONT_TO_CAT_METHODS_MAP -from synthpop.method import ALL_METHODS, INIT_METHODS, DEFAULT_METHODS, NA_METHODS -from synthpop.processor import NAN_KEY - - -INIT_STEP = 'init' -PROCESSOR_STEP = 'processor' -FIT_STEP = 'fit' -GENERATE_STEP = 'generate' - -NONE_TYPE = type(None) - -DENSITY = 'density' class Validator: - def __init__(self, spop): - self.spop = spop - self.attributes_types = {'method': (NONE_TYPE, str, list), - 'visit_sequence': (NONE_TYPE, np.ndarray, list), - # 'predictor_matrix': (NONE_TYPE,), - 'proper': (bool,), - 'cont_na': (NONE_TYPE, dict), - 'smoothing': (bool, str, dict), - 'default_method': (str,), - 'numtocat': (NONE_TYPE, list), - 'catgroups': (NONE_TYPE, int, dict), - 'seed': (NONE_TYPE, int), - 'k': (NONE_TYPE, int)} - - def check_init(self): - step = INIT_STEP - - self.default_method_validator(step=step) - self.method_validator(step=step) - self.visit_sequence_validator(step=step) - self.predictor_matrix_validator(step=step) - self.proper_validator(step=step) - self.cont_na_validator(step=step) - self.smoothing_validator(step=step) - self.numtocat_validator(step=step) - self.catgroups_validator(step=step) - self.seed_validator(step=step) - - def check_processor(self): - step = PROCESSOR_STEP - - self.visit_sequence_validator(step=step) - self.method_validator(step=step) - self.predictor_matrix_validator(step=step) - self.smoothing_validator(step=step) - - self.cont_na_validator(step=step) - self.numtocat_validator(step=step) - self.catgroups_validator(step=step) - - def check_fit(self): - step = FIT_STEP - - self.method_validator(step=step) - self.visit_sequence_validator(step=step) - self.predictor_matrix_validator(step=step) - self.smoothing_validator(step=step) - - def check_generate(self): - step = GENERATE_STEP - - self.k_validator(step=step) - - def check_valid_type(self, attribute_name, return_type=False): - attribute_type = getattr(self.spop, attribute_name) - expected_types = self.attributes_types[attribute_name] - assert isinstance(attribute_type, expected_types) - - if return_type: - return attribute_type - - def method_validator(self, 
step=None): - if step == INIT_STEP: - # validate method type is allowed - method_type = self.check_valid_type('method', return_type=True) - print(method_type) - - if isinstance(method_type, str): - # if method type is str - # validate method is in allowed init methods - print(method_type) - assert self.spop.method in INIT_METHODS - - elif isinstance(method_type, list): - # if method type is list - # validate all methods are allowed - assert all(m in ALL_METHODS for m in self.spop.method) - - if step == PROCESSOR_STEP: - first_visited_col = self.spop.visit_sequence.index[self.spop.visit_sequence == 0].values[0] - - if self.spop.method is None: - # if method is not specified - # for each column set method to default method according to its dtype (method for first visited column is sample_method) - self.spop.method = [DEFAULT_METHODS_MAP[self.spop.default_method][self.spop.df_dtypes[col]] if col != first_visited_col else SAMPLE_METHOD - for col in self.spop.df_columns] - - elif isinstance(self.spop.method, str): - # if method type is str - # for each column set method to the corresponding allowed method according to its dtype (method for first visited column is sample_method) - self.spop.method = [INIT_METHODS_MAP[self.spop.method][self.spop.df_dtypes[col]] if col != first_visited_col else SAMPLE_METHOD - for col in self.spop.df_columns] - - else: - # validate method for first visited column with non empty method is sample_method - for col, visit_order in self.spop.visit_sequence.sort_values().items(): - col_method = self.spop.method[self.spop.df_columns.index(col)] - if col_method != EMPTY_METHOD: - assert col_method == SAMPLE_METHOD - break - # assert all(self.spop.method[i] == SAMPLE_METHOD for i, col in enumerate(self.spop.df_columns) if col == first_visited_col) - - # validate all columns have specified methods - assert len(self.spop.method) == self.spop.n_df_columns - self.spop.method = pd.Series(self.spop.method, index=self.spop.df_columns) - - if step == FIT_STEP: - for col in self.spop.method.index: - if col in self.spop.numtocat: - self.spop.method[col] = CONT_TO_CAT_METHODS_MAP[self.spop.method[col]] - - elif col in self.spop.processor.processing_dict[NAN_KEY] and self.spop.df_dtypes[col] in NUM_COLS_DTYPES and self.spop.method[col] in NA_METHODS: - # TODO put in a function - nan_col_index = self.spop.method.index.get_loc(col) - index_list = self.spop.method.index.tolist() - index_list.insert(nan_col_index, self.spop.processed_df_columns[nan_col_index]) - self.spop.method = self.spop.method.reindex(index_list, fill_value=CONT_TO_CAT_METHODS_MAP[self.spop.method[col]]) - - def visit_sequence_validator(self, step=None): - if step == INIT_STEP: - print('A') - # validate visit_sequence type is allowed - visit_sequence_type = self.check_valid_type('visit_sequence', return_type=True) - - if isinstance(visit_sequence_type, np.ndarray): - # if visit_sequence type is numpy array - # transform visit_sequence into a list - self.spop.visit_sequence = [col.item() for col in self.spop.visit_sequence] - visit_sequence_type = list - - if isinstance(visit_sequence_type, list): - # if visit_sequence type is list - # validate all visits are unique - assert len(set(self.spop.visit_sequence)) == len(self.spop.visit_sequence) - # validate all visits are either type int or type str - assert all(isinstance(col, int) for col in self.spop.visit_sequence) or all(isinstance(col, str) for col in self.spop.visit_sequence) - - if step == PROCESSOR_STEP: - print('TestX') - if self.spop.visit_sequence is None: - # 
if visit_sequence is not specified - # visit all columns in a row - self.spop.visit_sequence = [col.item() for col in np.arange(self.spop.n_df_columns)] - - if isinstance(self.spop.visit_sequence[0], int): - # if visit_sequence is list of column indices - # validate every index in visit_sequence is a valid column index - assert set(self.spop.visit_sequence).issubset(set(np.arange(self.spop.n_df_columns))) - # transform visit_sequence into a list of column names - self.spop.visit_sequence = [self.spop.df_columns[i] for i in self.spop.visit_sequence] - else: - # validate every column name in visit_sequence is a valid column name - assert set(self.spop.visit_sequence).issubset(set(self.spop.df_columns)) - - self.spop.visited_columns = [col for col in self.spop.df_columns if col in self.spop.visit_sequence] - self.spop.visit_sequence = pd.Series([self.spop.visit_sequence.index(col) for col in self.spop.visited_columns], index=self.spop.visited_columns) - - if step == FIT_STEP: - for col in self.spop.visit_sequence.index: - if col in self.spop.processor.processing_dict[NAN_KEY] and self.spop.df_dtypes[col] in NUM_COLS_DTYPES and self.spop.method[col] in NA_METHODS: - visit_step = self.spop.visit_sequence[col] - self.spop.visit_sequence.loc[self.spop.visit_sequence >= visit_step] += 1 - - nan_col_index = self.spop.visit_sequence.index.get_loc(col) - index_list = self.spop.visit_sequence.index.tolist() - index_list.insert(nan_col_index, self.spop.processed_df_columns[nan_col_index]) - self.spop.visit_sequence = self.spop.visit_sequence.reindex(index_list, fill_value=visit_step) - - def predictor_matrix_validator(self, step=None): - # if step == INIT_STEP: - # # validate predictor_matrix type is allowed - # self.check_valid_type('predictor_matrix') - - if step == PROCESSOR_STEP: - # build predictor_matrix so all previously visited columns are used for the prediction of the currently visited - self.spop.predictor_matrix = np.zeros([len(self.spop.visit_sequence), len(self.spop.visit_sequence)], dtype=int) - self.spop.predictor_matrix = pd.DataFrame(self.spop.predictor_matrix, index=self.spop.visit_sequence.index, columns=self.spop.visit_sequence.index) - visited_columns = [] - for col, _ in self.spop.visit_sequence.sort_values().items(): - self.spop.predictor_matrix.loc[col, visited_columns] = 1 - visited_columns.append(col) - - if step == FIT_STEP: - for col in self.spop.predictor_matrix: - if col in self.spop.processor.processing_dict[NAN_KEY] and self.spop.df_dtypes[col] in NUM_COLS_DTYPES and self.spop.method[col] in NA_METHODS: - nan_col_index = self.spop.predictor_matrix.columns.get_loc(col) - self.spop.predictor_matrix.insert(nan_col_index, self.spop.processed_df_columns[nan_col_index], self.spop.predictor_matrix[col]) - - index_list = self.spop.predictor_matrix.index.tolist() - index_list.insert(nan_col_index, self.spop.processed_df_columns[nan_col_index]) - self.spop.predictor_matrix = self.spop.predictor_matrix.reindex(index_list, fill_value=0) - self.spop.predictor_matrix.loc[self.spop.processed_df_columns[nan_col_index]] = self.spop.predictor_matrix.loc[col] - - self.spop.predictor_matrix.loc[col, self.spop.processed_df_columns[nan_col_index]] = 1 - - def proper_validator(self, step=None): - if step == INIT_STEP: - # validate proper type is allowed - self.check_valid_type('proper') - - def cont_na_validator(self, step=None): - if step == INIT_STEP: - # validate cont_na type is allowed - self.check_valid_type('cont_na') - - if step == PROCESSOR_STEP: - if self.spop.cont_na is None: - 
self.spop.cont_na = {} - else: - # validate columns in cont_na are valid columns - assert all(col in self.spop.df_columns for col in self.spop.cont_na) - # assert all(col in self.spop.visited_columns for col in self.spop.cont_na) - # validate the type of columns in cont_na are valid types - assert all(self.spop.df_dtypes[col] in NUM_COLS_DTYPES for col in self.spop.cont_na) - self.spop.cont_na = {col: col_cont_na for col, col_cont_na in self.spop.cont_na.items() if self.spop.method[col] in NA_METHODS} - - def smoothing_validator(self, step=None): - if step == INIT_STEP: - # validate smoothing type is allowed - self.check_valid_type('smoothing') - - if step == PROCESSOR_STEP: - if self.spop.smoothing is False: - self.spop.smoothing = {col: False for col in self.spop.df_columns} - elif isinstance(self.spop.smoothing, str): - # if smoothing type is str - # validate smoothing is 'density' - assert self.spop.smoothing == DENSITY - self.spop.smoothing = {col: self.spop.df_dtypes[col] in NUM_COLS_DTYPES for col in self.spop.df_columns} - else: - # validate smoothing is 'denisty' for some/all numerical columns and False for all other columns - assert all((smoothing_method == DENSITY and self.spop.df_dtypes[col] in NUM_COLS_DTYPES) or smoothing_method is False - for col, smoothing_method in self.spop.smoothing.items()) - self.spop.smoothing = {col: (self. spop.smoothing.get(col, False) == DENSITY and self.spop.df_dtypes[col] in NUM_COLS_DTYPES) for col in self.spop.df_columns} - - if step == FIT_STEP: - for col in self.spop.processed_df_columns: - if col in self.spop.numtocat: - self.spop.smoothing[col] = False - elif col in self.spop.processor.processing_dict[NAN_KEY] and self.spop.df_dtypes[col] in NUM_COLS_DTYPES: - self.spop.smoothing[self.spop.processor.processing_dict[NAN_KEY][col]['col_nan_name']] = False - - def default_method_validator(self, step=None): - if step == INIT_STEP: - # validate default_method type is allowed - self.check_valid_type('default_method') - - # validate default_method is in allowed default methods - assert self.spop.default_method in DEFAULT_METHODS - - def numtocat_validator(self, step=None): - if step == INIT_STEP: - # validate numtocat type is allowed - self.check_valid_type('numtocat') - - if step == PROCESSOR_STEP: - if self.spop.numtocat is None: - self.spop.numtocat = [] - else: - # validate all columns in numtocat are valid columns - assert all(col in self.spop.df_columns for col in self.spop.numtocat) - # assert all(col in self.spop.visited_columns for col in self.spop.numtocat) - # validate all columns in numtocat are numerical columns - assert all(self.spop.df_dtypes[col] in NUM_COLS_DTYPES for col in self.spop.numtocat) - - def catgroups_validator(self, step=None): - if step == INIT_STEP: - # validate catgroups type is allowed - catgroups_type = self.check_valid_type('catgroups', return_type=True) - - if isinstance(catgroups_type, int): - # if catgroups type is int - # validate catgroups is more than 1 - assert self.spop.catgroups > 1 - - elif isinstance(catgroups_type, dict): - # if catgroups type is dict - # validate the keys in catgroups are the same as the columns in numtocat - assert set(self.spop.catgroups.keys()) == set(self.spop.numtocat) - # validate all values in catgroups are type int and more than 1 - assert all((isinstance(col_groups, int) and col_groups > 1) for col_groups in self.spop.catgroups.values()) - - if step == PROCESSOR_STEP: - if self.spop.catgroups is None: - self.spop.catgroups = {col: 5 for col in self.spop.numtocat} - elif 
isinstance(self.spop.catgroups, int): - self.spop.catgroups = {col: self.spop.catgroups for col in self.spop.numtocat} - - def seed_validator(self, step=None): - if step == INIT_STEP: - # validate seed type is allowed - self.check_valid_type('seed') - - def k_validator(self, step=None): - if step == GENERATE_STEP: - # validate k type is allowed - self.check_valid_type('k') - - if self.spop.k is None: - self.spop.k = self.spop.n_df_rows + def __init__(self) -> None: + pass \ No newline at end of file diff --git a/tests/test_synthpop.py b/tests/test_synthpop.py index 4604040..1fc12e5 100644 --- a/tests/test_synthpop.py +++ b/tests/test_synthpop.py @@ -1,103 +1,215 @@ -import pytest +# test_synthpop.py + +import unittest +import numpy as np import pandas as pd -from synthpop import Synthpop -from datasets.adult import df, dtypes +from synthpop.metrics import MetricsReport, EfficacyMetrics, DisclosureProtection +from synthpop.processor.data_processor import DataProcessor, InvalidDataError +from synthpop.processor.missing_data_handler import MissingDataHandler +from synthpop.method.GC import GaussianCopulaMethod -def test_synthpop_default_parameters(): - """Test Synthpop with default parameters and automatic type inference.""" - # Initialize Synthpop - spop = Synthpop() - - # Fit the model with automatic type inference - spop.fit(df) - - # Generate synthetic data - synth_df = spop.generate(len(df)) - - # Verify the synthetic dataframe has the same shape as original - assert synth_df.shape == df.shape - - # Verify the synthetic dataframe has the same columns as original - assert all(synth_df.columns == df.columns) - - # Verify inferred dtypes match expected types - assert spop.df_dtypes['age'] == 'int' - assert spop.df_dtypes['workclass'] == 'category' - assert spop.df_dtypes['education'] == 'category' - - # Verify the method attribute contains expected default values - assert isinstance(spop.method, pd.Series) - assert 'age' in spop.method.index - assert spop.method['age'] == 'sample' # age should use sample method - assert all(spop.method[spop.method != 'sample'] == 'cart') # rest should use cart - - # Verify visit sequence is properly set - assert isinstance(spop.visit_sequence, pd.Series) - assert len(spop.visit_sequence) == len(df.columns) - assert all(spop.visit_sequence.index == df.columns) - - # Verify predictor matrix is properly set - assert isinstance(spop.predictor_matrix, pd.DataFrame) - assert spop.predictor_matrix.shape == (len(df.columns), len(df.columns)) - assert all(spop.predictor_matrix.index == df.columns) - assert all(spop.predictor_matrix.columns == df.columns) +# ------------------------------- +# Tests for MetricsReport +# ------------------------------- +class TestMetricsReport(unittest.TestCase): + def setUp(self): + # Create sample real and synthetic data with various types. 
+ self.real_df = pd.DataFrame({ + "numeric_col": [1, 2, 3, 4, 5, np.nan], + "categorical_col": ["a", "b", "a", "c", "b", "b"], + "datetime_col": pd.date_range("2023-01-01", periods=6), + "boolean_col": [True, False, True, False, True, False] + }) + self.synthetic_df = pd.DataFrame({ + "numeric_col": [1.1, 2.1, 3.1, 4.0, 5.2, np.nan], + "categorical_col": ["a", "b", "b", "c", "d", "b"], + "datetime_col": pd.date_range("2023-01-01", periods=6), + "boolean_col": [True, True, True, False, True, False] + }) + self.metadata = { + "numeric_col": "numerical", + "categorical_col": "categorical", + "datetime_col": "datetime", + "boolean_col": "boolean" + } + + def test_generate_report(self): + report = MetricsReport(self.real_df, self.synthetic_df, self.metadata) + report_df = report.generate_report() + self.assertIsInstance(report_df, pd.DataFrame) + expected_cols = {"column", "type", "missing_value_similarity", "range_coverage", + "boundary_adherence", "ks_complement", "tv_complement", + "statistic_similarity", "category_coverage", "category_adherence"} + self.assertTrue(expected_cols.issubset(set(report_df.columns))) + # Check that non-applicable metrics are marked as "N/A" + num_report = report_df[report_df["type"]=="numerical"].iloc[0] + self.assertEqual(num_report["category_coverage"], "N/A") + cat_report = report_df[report_df["type"]=="categorical"].iloc[0] + self.assertEqual(cat_report["range_coverage"], "N/A") -def test_synthpop_with_manual_dtypes(): - """Test Synthpop with manually specified dtypes.""" - # Initialize Synthpop - spop = Synthpop() - - # Fit the model with explicit dtypes - spop.fit(df, dtypes) - - # Verify the dtypes were set correctly - for col, dtype in dtypes.items(): - assert spop.df_dtypes[col] == dtype - - # Generate synthetic data - synth_df = spop.generate(len(df)) - - # Verify the synthetic dataframe has the same shape and columns - assert synth_df.shape == df.shape - assert all(synth_df.columns == df.columns) +# ------------------------------- +# Tests for EfficacyMetrics +# ------------------------------- +class TestEfficacyMetrics(unittest.TestCase): + def test_regression(self): + np.random.seed(42) + real_reg = pd.DataFrame({ + "feat1": np.random.normal(0, 1, 100), + "feat2": np.random.normal(5, 2, 100), + "target": np.random.normal(10, 3, 100) + }) + synthetic_reg = pd.DataFrame({ + "feat1": np.random.normal(0, 1, 100), + "feat2": np.random.normal(5, 2, 100), + "target": np.random.normal(10, 3, 100) + }) + efficacy_reg = EfficacyMetrics(task='regression', target_column="target", random_state=42) + metrics = efficacy_reg.evaluate(real_reg, synthetic_reg) + self.assertIn("mse", metrics) + self.assertIn("mae", metrics) + self.assertIn("r2", metrics) + self.assertLessEqual(metrics["r2"], 1.0) -def test_synthpop_custom_visit_sequence(): - """Test Synthpop with custom visit sequence using Adult dataset.""" - # Define custom visit sequence - visit_sequence = [0, 1, 5, 3, 2] - - # Initialize Synthpop with custom visit sequence - spop = Synthpop(visit_sequence=visit_sequence) - - # Fit the model with automatic type inference - spop.fit(df) - - # Generate synthetic data - synth_df = spop.generate(len(df)) - - # Verify only specified columns were synthesized - expected_columns = ['age', 'workclass', 'marital.status', 'education', 'fnlwgt'] - assert len(synth_df.columns) == len(expected_columns) - assert all(col in synth_df.columns for col in expected_columns) - - # Verify visit sequence matches what was specified - assert len(spop.visit_sequence) == len(visit_sequence) - 
assert spop.visit_sequence['age'] == 0 - assert spop.visit_sequence['workclass'] == 1 - assert spop.visit_sequence['marital.status'] == 2 - assert spop.visit_sequence['education'] == 3 - assert spop.visit_sequence['fnlwgt'] == 4 - - # Verify predictor matrix has correct shape for subset of columns - assert spop.predictor_matrix.shape == (len(expected_columns), len(expected_columns)) - assert all(col in spop.predictor_matrix.columns for col in expected_columns) - assert all(col in spop.predictor_matrix.index for col in expected_columns) - - # Verify specific predictor relationships from example - pred_matrix = spop.predictor_matrix - assert pred_matrix.loc['age', 'age'] == 0 - assert pred_matrix.loc['workclass', 'age'] == 1 - assert pred_matrix.loc['workclass', 'workclass'] == 0 - assert pred_matrix.loc['fnlwgt', ['age', 'workclass', 'education', 'marital.status']].sum() == 4 - assert pred_matrix.loc['education', ['age', 'workclass', 'marital.status']].sum() == 3 - assert pred_matrix.loc['marital.status', ['age', 'workclass']].sum() == 2 + def test_classification(self): + np.random.seed(42) + real_clf = pd.DataFrame({ + "feat1": np.random.normal(0, 1, 100), + "feat2": np.random.normal(5, 2, 100), + "target": np.random.choice(["A", "B"], size=100) + }) + synthetic_clf = pd.DataFrame({ + "feat1": np.random.normal(0, 1, 100), + "feat2": np.random.normal(5, 2, 100), + "target": np.random.choice(["A", "B"], size=100) + }) + efficacy_clf = EfficacyMetrics(task='classification', target_column="target", random_state=42) + metrics = efficacy_clf.evaluate(real_clf, synthetic_clf) + self.assertIn("accuracy", metrics) + self.assertIn("f1_score", metrics) + +# ------------------------------- +# Tests for DisclosureProtection +# ------------------------------- +class TestDisclosureProtection(unittest.TestCase): + def test_score_and_report(self): + np.random.seed(42) + real_dp = pd.DataFrame({ + "f1": np.random.normal(0, 1, 100), + "f2": np.random.normal(5, 2, 100) + }) + # Create synthetic data by adding small noise + synthetic_dp = real_dp + np.random.normal(0, 0.5, real_dp.shape) + dp = DisclosureProtection(real_dp, synthetic_dp) + score = dp.score() + report = dp.report() + self.assertIsInstance(score, float) + self.assertIsInstance(report, dict) + self.assertIn("threshold", report) + self.assertIn("risk_rate", report) + self.assertIn("disclosure_protection_score", report) + +# ------------------------------- +# Tests for DataProcessor +# ------------------------------- +class TestDataProcessor(unittest.TestCase): + def setUp(self): + # Create a DataFrame with different types. + self.df = pd.DataFrame({ + "numeric": np.random.normal(10, 2, 50), + "categorical": np.random.choice(["Red", "Green", "Blue"], 50), + "boolean": np.random.choice([True, False], 50), + "datetime": pd.date_range("2023-01-01", periods=50), + "timedelta": pd.to_timedelta(np.random.randint(1, 100, 50), unit="D"), + "float": np.random.uniform(0, 1, 50) + }) + self.metadata = { + "numeric": "numerical", + "categorical": "categorical", + "boolean": "boolean", + "datetime": "datetime", + "timedelta": "timedelta", + "float": "numerical" + } + self.processor = DataProcessor(self.metadata) + + def test_preprocess_postprocess(self): + # Preprocess the data + processed = self.processor.preprocess(self.df) + self.assertIsInstance(processed, pd.DataFrame) + # Check that categorical columns are encoded (i.e. 
no string values remain) + for col, dtype in self.metadata.items(): + if dtype == "categorical": + self.assertTrue(np.issubdtype(processed[col].dtype, np.number) or col not in processed.columns) + # Simulate synthetic data as processed copy then postprocess back + synthetic_processed = processed.copy() + recovered = self.processor.postprocess(synthetic_processed) + self.assertIsInstance(recovered, pd.DataFrame) + # Check that the recovered DataFrame has the original columns order. + self.assertListEqual(list(recovered.columns), list(self.df.columns)) + + def test_validate_raises_on_missing_column(self): + # Remove one column so that validation should fail. + df_missing = self.df.drop(columns=["numeric"]) + with self.assertRaises(InvalidDataError): + self.processor.validate(df_missing) + +# ------------------------------- +# Tests for MissingDataHandler +# ------------------------------- +class TestMissingDataHandler(unittest.TestCase): + def setUp(self): + # Create a DataFrame with missing values in different types. + self.df = pd.DataFrame({ + "num": [1, 2, np.nan, 4, 5], + "cat": ["a", np.nan, "b", "a", "c"], + "bool": [True, False, np.nan, True, False], + "datetime": pd.to_datetime(["2023-01-01", np.nan, "2023-01-03", "2023-01-04", "2023-01-05"]), + "timedelta": pd.to_timedelta([1, 2, np.nan, 4, 5], unit="D") + }) + self.handler = MissingDataHandler() + + + def test_apply_imputation(self): + # First, detect missingness; we won't get perfect detection, but just test that imputation runs. + missingness = self.handler.detect_missingness(self.df) + imputed = self.handler.apply_imputation(self.df, missingness) + # Check that after imputation there are no missing values. + self.assertFalse(imputed.isna().any().any()) + +# ------------------------------- +# Tests for GaussianCopulaMethod +# ------------------------------- +class TestGaussianCopulaMethod(unittest.TestCase): + def setUp(self): + # Create a simple DataFrame with numerical and categorical columns. + self.df = pd.DataFrame({ + "numeric": np.random.normal(50, 10, 100), + "categorical": np.random.choice(["Red", "Green", "Blue"], 100) + }) + self.metadata = { + "numeric": "numerical", + "categorical": "categorical" + } + # For simplicity, we use the DataProcessor to convert data to numeric space. + self.processor = DataProcessor(self.metadata) + self.processed = self.processor.preprocess(self.df) + self.gc = GaussianCopulaMethod(self.metadata) + self.gc.fit(self.processed) + + def test_sample_shape(self): + # Use the sample method with a requested number of rows. + num_samples = 50 + synthetic = self.gc.sample(num_samples) + self.assertIsInstance(synthetic, pd.DataFrame) + self.assertEqual(len(synthetic), num_samples) + + def test_get_learned_distributions(self): + # After fitting, learned distributions should be available. + distributions = self.gc.get_learned_distributions() + self.assertIsInstance(distributions, dict) + # Check that keys correspond to columns in metadata. 
+ for col in self.metadata.keys(): + self.assertIn(col, distributions) + +if __name__ == "__main__": + unittest.main() diff --git a/tests_processing.py b/tests_processing.py deleted file mode 100644 index a01f5eb..0000000 --- a/tests_processing.py +++ /dev/null @@ -1,44 +0,0 @@ -import unittest -from synthpop import Synthpop -import pandas as pd -import numpy as np - -class TestProcessing(unittest.TestCase): - - def test_add_NaN_columns_for_numeric_columns(self): - df = pd.DataFrame({'a':[1,2,np.nan], 'b':[1,1,1], 'c':['x','y',None]}) - spop = Synthpop() - dtype_map = {'a':'float','b':'float', 'c':'categorical'} - res,dtype_res = spop.pre_preprocess(df,dtype_map,nan_fill=-8) - - self.assertTrue('a_NaN' in res,"Nan column not made") - self.assertFalse('b_NaN' in res,"Nan column should not be made if there are no NaNs") - self.assertFalse('c_NaN' in res,"Nan column should not be made for categorical columns") - self.assertTrue(res['a_NaN'][2]) - self.assertEqual(res['a'][2], -8) - self.assertEqual(dtype_res['a_NaN'],'category') - self.assertEqual(spop.map_column_to_NaN_column['a'],'a_NaN') - def test_make_visit_sequence_when_one_is_given(self): - - visit_seq = ['x','a','b'] - spop = Synthpop(visit_sequence=visit_seq) - spop.map_column_to_NaN_column = {'a':'a_NaN','c':'c_NaN'} - - spop.include_nan_columns() - - self.assertSequenceEqual(spop.visit_sequence,['x','a_NaN','a','b']) - - - def test_apply_and_remove_added_NaN_columns(self): - df = pd.DataFrame({'a':[1,2,-8],'a_NaN':[False,True,False], 'b':[1,1,1], 'c':['x','y',None]}) - - spop = Synthpop() - spop.map_column_to_NaN_column = {'a':'a_NaN'} - - res = spop.post_postprocessing(df) - self.assertTrue(np.isnan(res['a'][1]), "NaNs should be placed where indicated") - self.assertFalse('a_NaN' in res, "indicator columns should be removed") - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file
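
The tests above exercise the new preprocessing, imputation, modelling, and metrics pieces individually. As an illustration of how the components introduced in this diff could be composed end to end, here is a minimal usage sketch. It is not part of the patch: the toy DataFrame, column names, and seeds are invented for the example, and the GaussianCopulaMethod fit/sample interface is assumed to behave as exercised in tests/test_synthpop.py above; actual results depend on that method's implementation, which is not shown in this diff.

# Illustrative end-to-end sketch (assumed workflow, not part of the patch).
import numpy as np
import pandas as pd

from synthpop.processor.missing_data_handler import MissingDataHandler
from synthpop.processor.data_processor import DataProcessor
from synthpop.method.GC import GaussianCopulaMethod
from synthpop.metrics import MetricsReport

# 1. Toy data with a few missing values (invented for illustration).
rng = np.random.default_rng(0)
df = pd.DataFrame({
    "age": rng.normal(40, 10, 200),
    "income": rng.normal(50_000, 10_000, 200),
    "colour": rng.choice(["red", "green", "blue"], 200),
})
df.loc[df.sample(frac=0.05, random_state=0).index, "income"] = np.nan

# 2. Infer column types, classify the missingness, and impute before modelling.
handler = MissingDataHandler()
metadata = handler.get_column_dtypes(df)        # e.g. {"age": "numerical", "colour": "categorical", ...}
missingness = handler.detect_missingness(df)    # e.g. {"income": "MCAR"}
df_imputed = handler.apply_imputation(df, missingness)

# 3. Transform to numerical space, fit the copula, sample, and map the
#    synthetic rows back to the original dtypes and column order.
processor = DataProcessor(metadata)
processed = processor.preprocess(df_imputed)
gc = GaussianCopulaMethod(metadata)
gc.fit(processed)
synthetic = processor.postprocess(gc.sample(len(df_imputed)))

# 4. Compare real and synthetic data column by column.
report = MetricsReport(df_imputed, synthetic, metadata).generate_report()
print(report)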