Skip to content

Commit

Permalink
feat!: migrate to use microgen (#34)
Browse files Browse the repository at this point in the history
* feat!: migrate to use microgen

* Update UPGRADING.md

Co-authored-by: Bu Sun Kim <8822365+busunkim96@users.noreply.github.com>

Co-authored-by: Bu Sun Kim <8822365+busunkim96@users.noreply.github.com>
  • Loading branch information
arithmetic1728 and busunkim96 committed Aug 18, 2020
1 parent 500d18e commit 88bb66c
Show file tree
Hide file tree
Showing 18 changed files with 377 additions and 521 deletions.
98 changes: 49 additions & 49 deletions dlp/snippets/custom_infotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@

# [START dlp_omit_name_if_also_email]
def omit_name_if_also_email(
project,
content_string,
project, content_string,
):
"""Matches PERSON_NAME and EMAIL_ADDRESS, but not both.
Expand Down Expand Up @@ -51,33 +50,34 @@ def omit_name_if_also_email(
# the total number of findings when there is a large overlap between different
# infoTypes.
inspect_config = {
"info_types":
info_types_to_locate,
"rule_set": [{
"info_types": [{
"name": "PERSON_NAME"
}],
"rules": [{
"exclusion_rule": {
"exclude_info_types": {
"info_types": [{
"name": "EMAIL_ADDRESS"
}]
},
"matching_type": "MATCHING_TYPE_PARTIAL_MATCH"
}
}]
}]
"info_types": info_types_to_locate,
"rule_set": [
{
"info_types": [{"name": "PERSON_NAME"}],
"rules": [
{
"exclusion_rule": {
"exclude_info_types": {
"info_types": [{"name": "EMAIL_ADDRESS"}]
},
"matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH,
}
}
],
}
],
}

# Construct the `item`.
item = {"value": content_string}

# Convert the project id into a full resource id.
parent = dlp.project_path(project)
parent = f"projects/{project}"

# Call the API.
response = dlp.inspect_content(parent, inspect_config, item)
response = dlp.inspect_content(
request={"parent": parent, "inspect_config": inspect_config, "item": item}
)

return [f.info_type.name for f in response.result.findings]

Expand All @@ -87,9 +87,7 @@ def omit_name_if_also_email(

# [START inspect_with_person_name_w_custom_hotword]
def inspect_with_person_name_w_custom_hotword(
project,
content_string,
custom_hotword="patient"
project, content_string, custom_hotword="patient"
):
"""Uses the Data Loss Prevention API to increase the likelihood for matches on
PERSON_NAME if the user-specified custom hotword is present. Only
Expand All @@ -114,7 +112,9 @@ def inspect_with_person_name_w_custom_hotword(
# window preceding the PII finding.
hotword_rule = {
"hotword_regex": {"pattern": custom_hotword},
"likelihood_adjustment": {"fixed_likelihood": "VERY_LIKELY"},
"likelihood_adjustment": {
"fixed_likelihood": google.cloud.dlp_v2.Likelihood.VERY_LIKELY
},
"proximity": {"window_before": 50},
}

Expand All @@ -128,17 +128,19 @@ def inspect_with_person_name_w_custom_hotword(
# Construct the configuration dictionary with the hotword rule set and the minimum likelihood.
inspect_config = {
"rule_set": rule_set,
"min_likelihood": "VERY_LIKELY",
"min_likelihood": google.cloud.dlp_v2.Likelihood.VERY_LIKELY,
}

# Construct the `item`.
item = {"value": content_string}

# Convert the project id into a full resource id.
parent = dlp.project_path(project)
parent = f"projects/{project}"

# Call the API.
response = dlp.inspect_content(parent, inspect_config, item)
response = dlp.inspect_content(
request={"parent": parent, "inspect_config": inspect_config, "item": item}
)

# Print out the results.
if response.result.findings:
Expand All @@ -153,13 +155,13 @@ def inspect_with_person_name_w_custom_hotword(
else:
print("No findings.")


# [END inspect_with_person_name_w_custom_hotword]


# [START dlp_inspect_with_medical_record_number_custom_regex_detector]
def inspect_with_medical_record_number_custom_regex_detector(
project,
content_string,
project, content_string,
):
"""Uses the Data Loss Prevention API to analyze a string with a medical record
number custom regex detector
Expand All @@ -183,7 +185,7 @@ def inspect_with_medical_record_number_custom_regex_detector(
{
"info_type": {"name": "C_MRN"},
"regex": {"pattern": "[1-9]{3}-[1-9]{1}-[1-9]{5}"},
"likelihood": "POSSIBLE",
"likelihood": google.cloud.dlp_v2.Likelihood.POSSIBLE,
}
]

Expand All @@ -196,10 +198,12 @@ def inspect_with_medical_record_number_custom_regex_detector(
item = {"value": content_string}

# Convert the project id into a full resource id.
parent = dlp.project_path(project)
parent = f"projects/{project}"

# Call the API.
response = dlp.inspect_content(parent, inspect_config, item)
response = dlp.inspect_content(
request={"parent": parent, "inspect_config": inspect_config, "item": item}
)

# Print out the results.
if response.result.findings:
Expand All @@ -214,13 +218,13 @@ def inspect_with_medical_record_number_custom_regex_detector(
else:
print("No findings.")


# [END dlp_inspect_with_medical_record_number_custom_regex_detector]


# [START dlp_inspect_with_medical_record_number_w_custom_hotwords]
def inspect_with_medical_record_number_w_custom_hotwords(
project,
content_string,
project, content_string,
):
"""Uses the Data Loss Prevention API to analyze a string with a medical record
number custom regex detector, with custom hotword rules to boost finding
Expand All @@ -245,30 +249,23 @@ def inspect_with_medical_record_number_w_custom_hotwords(
{
"info_type": {"name": "C_MRN"},
"regex": {"pattern": "[1-9]{3}-[1-9]{1}-[1-9]{5}"},
"likelihood": "POSSIBLE",
"likelihood": google.cloud.dlp_v2.Likelihood.POSSIBLE,
}
]

# Construct a rule set with hotwords "mrn" and "medical", with a likelihood
# boost to VERY_LIKELY when hotwords are present within the 10-character
# window preceding the PII finding.
hotword_rule = {
"hotword_regex": {
"pattern": "(?i)(mrn|medical)(?-i)"
},
"hotword_regex": {"pattern": "(?i)(mrn|medical)(?-i)"},
"likelihood_adjustment": {
"fixed_likelihood": "VERY_LIKELY"
"fixed_likelihood": google.cloud.dlp_v2.Likelihood.VERY_LIKELY
},
"proximity": {
"window_before": 10
}
"proximity": {"window_before": 10},
}

rule_set = [
{
"info_types": [{"name": "C_MRN"}],
"rules": [{"hotword_rule": hotword_rule}],
}
{"info_types": [{"name": "C_MRN"}], "rules": [{"hotword_rule": hotword_rule}]}
]

# Construct the configuration dictionary with the custom regex info type.
Expand All @@ -281,10 +278,12 @@ def inspect_with_medical_record_number_w_custom_hotwords(
item = {"value": content_string}

# Convert the project id into a full resource id.
parent = dlp.project_path(project)
parent = f"projects/{project}"

# Call the API.
response = dlp.inspect_content(parent, inspect_config, item)
response = dlp.inspect_content(
request={"parent": parent, "inspect_config": inspect_config, "item": item}
)

# Print out the results.
if response.result.findings:
Expand All @@ -299,4 +298,5 @@ def inspect_with_medical_record_number_w_custom_hotwords(
else:
print("No findings.")


# [END dlp_inspect_with_medical_record_number_w_custom_hotwords]
21 changes: 12 additions & 9 deletions dlp/snippets/custom_infotype_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@

def test_omit_name_if_also_email(capsys):
info_types = custom_infotype.omit_name_if_also_email(
GCLOUD_PROJECT, "alice@example.com")
GCLOUD_PROJECT, "alice@example.com"
)

# Ensure we found only EMAIL_ADDRESS, and not PERSON_NAME.
assert len(info_types) == 1
Expand All @@ -30,7 +31,8 @@ def test_omit_name_if_also_email(capsys):

def test_inspect_with_person_name_w_custom_hotword(capsys):
custom_infotype.inspect_with_person_name_w_custom_hotword(
GCLOUD_PROJECT, "patient's name is John Doe.", "patient")
GCLOUD_PROJECT, "patient's name is John Doe.", "patient"
)

out, _ = capsys.readouterr()
assert "Info type: PERSON_NAME" in out
Expand All @@ -39,26 +41,27 @@ def test_inspect_with_person_name_w_custom_hotword(capsys):

def test_inspect_with_medical_record_number_custom_regex_detector(capsys):
custom_infotype.inspect_with_medical_record_number_custom_regex_detector(
GCLOUD_PROJECT, "Patients MRN 444-5-22222")
GCLOUD_PROJECT, "Patients MRN 444-5-22222"
)

out, _ = capsys.readouterr()
assert "Info type: C_MRN" in out


def test_inspect_with_medical_record_number_w_custom_hotwords_no_hotwords(
capsys):
def test_inspect_with_medical_record_number_w_custom_hotwords_no_hotwords(capsys):
custom_infotype.inspect_with_medical_record_number_w_custom_hotwords(
GCLOUD_PROJECT, "just a number 444-5-22222")
GCLOUD_PROJECT, "just a number 444-5-22222"
)

out, _ = capsys.readouterr()
assert "Info type: C_MRN" in out
assert "Likelihood: 3" in out


def test_inspect_with_medical_record_number_w_custom_hotwords_has_hotwords(
capsys):
def test_inspect_with_medical_record_number_w_custom_hotwords_has_hotwords(capsys):
custom_infotype.inspect_with_medical_record_number_w_custom_hotwords(
GCLOUD_PROJECT, "Patients MRN 444-5-22222")
GCLOUD_PROJECT, "Patients MRN 444-5-22222"
)

out, _ = capsys.readouterr()
assert "Info type: C_MRN" in out
Expand Down
Loading

0 comments on commit 88bb66c

Please sign in to comment.