# record suppression
**description:** record suppression refers to the removal of an entire record from the dataset. In contrast to most techniques, this technique affects multiple attributes at the same time.

**how to use it:** delete the entire record. "Redacting" may not be sufficient if the underlying data remains accessible.

In [1]:
from docx import Document
import re

# Maps each sensitive-data category to the regular expression that detects it.
# Raw strings keep the backslashes literal, so the patterns no longer rely on
# Python passing unknown escape sequences (e.g. "\S", "\d") through unchanged.
regex = {
    "email": r"\S+@(\S+|\.\S+)",
    "age": r"\d+\syears\sold",
    "money": r"(?!\s|,)\W\d+(,\d+)+",
    "phone number": r"(\+65|\+65\s)?(\d{4}\d{4}|\d{4}\s\d{4})",
    "name or place": r"([A-Z][a-z]+\s?)",
}
# Categories matched so far; is_sensitive() appends to this list and the main
# loop resets it after every sentence.
sensitive_types = []

# define sensitive data
def is_sensitive(sentence):
    """Report whether ``sentence`` matches any pattern in ``regex``.

    Every matching category name is appended to the module-level
    ``sensitive_types`` list, in the order the categories appear in ``regex``.

    Returns True if at least one pattern matched, otherwise False.
    """
    matched = [
        category
        for category, pattern in regex.items()
        if re.search(pattern, sentence)
    ]
    # Callers read the matched categories back from the shared list.
    sensitive_types.extend(matched)
    return bool(matched)

def sanitize(sentence, sensitive_types: list):
    """Return ``sentence`` with every match of the given categories removed.

    ``sensitive_types`` holds keys of the module-level ``regex`` mapping; each
    category's pattern is applied in turn and its matches are replaced with
    the empty string.
    """
    cleaned = sentence
    for category in sensitive_types:
        pattern = regex[category]
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned

# open document
doc = Document("test.docx")

# sanitize
sanitized_doc = Document()
output_paragraph = ""

for paragraph in doc.paragraphs:
    # Splitting on a capturing group keeps the sentence separators in the
    # result (sentences at even indices, separators at odd indices), so the
    # paragraph can be reassembled without gluing sentences together or
    # doubling the final full stop.
    pieces = re.split(r"(\.\s)", paragraph.text)

    for index in range(0, len(pieces), 2):
        if is_sensitive(pieces[index]):
            pieces[index] = sanitize(pieces[index], sensitive_types)
        # is_sensitive() accumulates matched categories in the shared list;
        # empty it so the next sentence starts from a clean slate.
        sensitive_types.clear()

    output_paragraph = "".join(pieces)
    sanitized_doc.add_paragraph(output_paragraph)

# output
sanitized_doc.save("surpressed.docx")
print("Done! check save folder")

Done! check save folder


# generalisation
**description:** a reduction in the precision of data by rephrasing it into something more vague, for example turning an age into an age range or a precise location into a district/country.

**how to use it:** design appropriate data categories and rules for translating data, and suppress records that still stand out after translation.

In [9]:
from docx import Document
import re

# define sensitive data
def is_sensitive(sentence):
    """Classify ``sentence`` by the first kind of sensitive data it contains.

    The categories are tried in a fixed order (name or place, then money,
    then age) and only the first one that matches is reported.

    Returns a ``(sensitive, category)`` tuple: ``(True, "<category>")`` when
    a pattern matches, ``(False, False)`` when none does.
    """
    # Order matters: it reproduces the priority of the original if/elif chain.
    checks = (
        ("name or place", r"([A-Z]\w+\s|[A-Z]\w+)+"),
        ("money", r"\W\d+((\W|\W\s)\d+)+"),
        ("age", r"\d+\s\S+\sold"),
    )
    for category, pattern in checks:
        if re.search(pattern, sentence):
            return True, category
    return False, False

# open document
doc = Document("test.docx")

# Patterns for the values that get generalised.
# A run of capitalised words; the whitespace after the last word is not part
# of the match, so the following word is not glued onto the replacement.
_NAME_PATTERN = re.compile(r"[A-Z]\w+(?:\s[A-Z]\w+)*")
# Group 1 is the leading non-word character (currency symbol or space),
# group 2 the digits together with their separators.
_MONEY_PATTERN = re.compile(r"(\W)(\d+(?:(?:\W|\W\s)\d+)+)")
# Group 1 is the number, group 2 the trailing text such as " years old".
_AGE_PATTERN = re.compile(r"(\d+)(\s\S+\sold)")


def _money_range(match):
    """Return the matched amount rewritten as a +/-100 range."""
    lead, amount_text = match.groups()
    # Dots, commas and whitespace are all treated as digit-group separators.
    digits = re.sub(r"[.,\s]", "", amount_text)
    if not digits.isdigit():
        # Other separators (e.g. "/" or "-") mean this is not a plain amount;
        # leave the text untouched instead of failing on float().
        return match.group(0)
    amount = float(digits)
    return f"{lead}{amount - 100} - {amount + 100}"


def _age_range(match):
    """Return the matched age rewritten as a +/-10 range, never below 0."""
    age = int(match.group(1))
    return f"{max(age - 10, 0)} - {age + 10}{match.group(2)}"


# sanitize
sanitized_doc = Document()

for paragraph in doc.paragraphs:
    original_text = paragraph.text
    # Classify once per paragraph; is_sensitive() returns (flag, category)
    # and category is False when nothing sensitive was found.
    category = is_sensitive(original_text)[1]

    if category == "name or place":
        substituted_text = _NAME_PATTERN.sub("around place", original_text)
    elif category == "money":
        # A callable replacement gives every amount its own range.
        substituted_text = _MONEY_PATTERN.sub(_money_range, original_text)
    elif category == "age":
        substituted_text = _AGE_PATTERN.sub(_age_range, original_text)
    else:
        substituted_text = original_text

    sanitized_doc.add_paragraph(substituted_text)

# output
sanitized_doc.save("generalisation.docx")
print("Done! check save folder")

Done! check save folder


# data aggregation
**description:** converting a dataset from a list of records to summarised values

**how to use it:** statistical measures can be used. typical ways include using totals or averages, etc.

In [None]:
# program here