# all technique source code
### technique authorship
*Ayrton's techniques*
1. character masking
2. synthetic data
3. data perturbation  

*Yi Thng's techniques*
1. record suppression
2. generalisation
3. data aggregation

*Ethan's techniques*
1. pseudonymisation
2. swapping
3. data aggregation

## character masking
**description:**  
**implementation:**

In [None]:
from docx import Document

def mask_data(text):
    """Return `text` with name-, email- and number-like words masked.

    The text is split on whitespace and each word goes through the first
    matching masker (name, then email, then number). Words are re-joined
    with single spaces, so the original spacing is not preserved.
    """
    pieces = []
    for token in text.split():
        if is_name(token):
            # Words starting with a capital letter are treated as names
            pieces.append(mask_name(token))
        elif is_email(token):
            pieces.append(mask_email(token))
        elif is_number(token):
            # Any word containing a digit, phone numbers included
            pieces.append(mask_number(token))
        else:
            pieces.append(token)
    return ' '.join(pieces).strip()

def is_name(word):
    """Return True when `word` starts with an uppercase letter.

    This is only a heuristic: any capitalised word (including the first word
    of a sentence) is reported as a name. An empty string is never a name.
    """
    # Slicing instead of indexing keeps an empty word from raising IndexError
    first = word[:1]
    return first.isalpha() and first.isupper()

def is_email(word):
    """Return True when `word` contains both an '@' and a '.'.

    The relative order of the two characters is not checked, so this is a
    deliberately loose test.
    """
    return all(symbol in word for symbol in ('@', '.'))

def is_number(word):
    """Return True when `word` is all digits or passes is_phone_number()."""
    if word.isdigit():
        return True
    # Fall back to the looser "contains a digit" phone-number check
    return is_phone_number(word)

def is_phone_number(word):
    """Return True when at least one character of `word` is a digit."""
    for symbol in word:
        if symbol.isdigit():
            return True
    return False

def mask_name(name):
    """Return `name` with every alphanumeric character replaced by 'X'.

    Punctuation and other non-alphanumeric characters are kept as they are.
    """
    return ''.join('X' if symbol.isalnum() else symbol for symbol in name)

def mask_email(email):
    """Return a string of 'X' characters with the same length as `email`."""
    return ''.join('X' for _ in email)

def mask_number(number):
    """Return a string of 'X' characters with the same length as `number`.

    Every character is replaced, not only the digits, so separators such as
    '+' or '-' are masked as well.
    """
    return ''.ljust(len(number), 'X')

# Open the Word document to be masked (read from the working directory)
doc = Document('test.docx')

# Mask every body paragraph in place.
# Only doc.paragraphs is iterated here.
# NOTE(review): python-docx documents that assigning paragraph.text replaces
# the paragraph's runs, so character-level formatting is presumably lost and
# text inside tables/headers is presumably not visited - confirm this is OK.
for paragraph in doc.paragraphs:
    original_text = paragraph.text
    masked_text = mask_data(original_text)
    paragraph.text = masked_text

# Save the modified document under a new name so test.docx is left untouched
doc.save('masked.docx')
print("Done! Check save folder")

## synthetic data
**description:**  
**implementation:**

In [None]:
from faker import Faker
from docx import Document

# Shared Faker generator; sanitize_text() calls faker.word() on it
faker = Faker()

# Example original document (read from the working directory)
document = Document('test.docx')

# Function to sanitize text
def sanitize_text(text):
    """Return synthetic text with one fake word per whitespace-separated word.

    Only the word count of `text` is carried over; the fake words come from
    the module-level `faker` generator and are joined with single spaces.
    """
    return ' '.join(faker.word() for _ in text.split())

# Sanitize the document: every body paragraph is rewritten in place.
# Only document.paragraphs is iterated here.
for paragraph in document.paragraphs:
    # Build replacement text with the same number of words as the original
    sanitized_text = sanitize_text(paragraph.text)
    # Replace the original text with the sanitized text
    # NOTE(review): assigning paragraph.text presumably drops run-level
    # formatting in python-docx - confirm this is acceptable.
    paragraph.text = sanitized_text

# Save the sanitized document under a new name so test.docx is left untouched
document.save('synthetic.docx')
print("Done, Check save folder")

## data perturbation
**description:**  
**implementation:**

In [None]:
from docx import Document
import random
import string

def perturb_data(text):
    """Return `text` with name-, email- and number-like words perturbed.

    The text is split on whitespace and each word goes through the first
    matching perturbation (name, then email, then number). Words are
    re-joined with single spaces, so the original spacing is not preserved.
    """
    pieces = []
    for token in text.split():
        if is_name(token):
            # Words starting with a capital letter are treated as names
            pieces.append(perturb_name(token))
        elif is_email(token):
            pieces.append(perturb_email(token))
        elif is_number(token):
            # Any word containing a digit, phone numbers included
            pieces.append(perturb_number(token))
        else:
            pieces.append(token)
    return ' '.join(pieces).strip()

def is_name(word):
    """Return True when `word` starts with an uppercase letter.

    This is only a heuristic: any capitalised word (including the first word
    of a sentence) is reported as a name. An empty string is never a name.
    """
    # Slicing instead of indexing keeps an empty word from raising IndexError
    first = word[:1]
    return first.isalpha() and first.isupper()

def is_email(word):
    """Return True when `word` has an '@' that comes before its last '.'.

    The first '@' is compared with the last '.', so both characters must be
    present and the dot must sit somewhere after the '@'.
    """
    at_pos = word.find('@')
    dot_pos = word.rfind('.')
    # find/rfind give -1 when the character is missing, which fails the chain
    return 0 <= at_pos < dot_pos

def is_number(word):
    """Return True when `word` is all digits or passes is_phone_number()."""
    if word.isdigit():
        return True
    # Fall back to the looser "contains a digit" phone-number check
    return is_phone_number(word)

def is_phone_number(word):
    """Return True when at least one character of `word` is a digit."""
    for symbol in word:
        if symbol.isdigit():
            return True
    return False

def perturb_name(name):
    """Return `name` with each alphanumeric character swapped for a random letter.

    Replacements are drawn from string.ascii_letters, so digits also become
    letters. Non-alphanumeric characters are kept in place.
    """
    return ''.join(
        random.choice(string.ascii_letters) if symbol.isalnum() else symbol
        for symbol in name
    )

def perturb_email(email):
    """Return `email` with the part before the last '@' randomly perturbed.

    The domain (everything after the last '@') is left unchanged.

    Raises:
        ValueError: if `email` contains no '@'.
    """
    # Split on the LAST '@' only: is_email() accepts words with several '@'
    # characters, and an unbounded split would fail to unpack on those.
    username, domain = email.rsplit('@', 1)
    perturbed_username = perturb_name(username)
    return perturbed_username + '@' + domain

def perturb_number(number):
    """Return `number` with every digit replaced by a random digit.

    Non-digit characters such as '+', '-' or spaces are kept in place.
    """
    return ''.join(
        random.choice(string.digits) if symbol.isdigit() else symbol
        for symbol in number
    )

# Open the Word document to be perturbed (read from the working directory)
doc = Document('test.docx')

# Perturb every body paragraph in place.
# Only doc.paragraphs is iterated here.
# NOTE(review): python-docx documents that assigning paragraph.text replaces
# the paragraph's runs, so character-level formatting is presumably lost and
# text inside tables/headers is presumably not visited - confirm this is OK.
for paragraph in doc.paragraphs:
    original_text = paragraph.text
    perturbed_text = perturb_data(original_text)
    paragraph.text = perturbed_text

# Save the modified document under a new name so test.docx is left untouched
doc.save('perturbed.docx')
print("Done! Check save folder")

## record suppression
**description:** Record suppression refers to the removal of an entire record from the dataset. In contrast to most techniques, it affects multiple attributes at the same time.  
**implementation:** Delete the entire record. "Redacting" may not be sufficient if the underlying data remains accessible.

In [None]:
from docx import Document
import re

# Patterns for each kind of sensitive data, keyed by a human-readable label.
# Raw strings are used so the backslashes reach the regex engine without
# triggering Python's invalid-escape-sequence warning.
regex={
    "email": r"\S+@(\S+|\.\S+)",
    "age": r"\d+\syears\sold",
    # Digits, optional comma-separated groups, optional decimals; a single
    # digit such as "$5" is a valid amount.
    "money": r"\$\d+(?:,\d+)*(?:\.\d+)?",
    "phone number": r"(?:\+65|\+65\s)?\d{4}\s?\d{4}",
    # Any capitalised word, optionally followed by one whitespace character
    "name or place": r"[A-Z][a-z]+\s?"
}

# Leftovers that removal tends to create: punctuation/whitespace-only text,
# runs of punctuation, and runs of whitespace
poor_punctuation=r"^(?:[^\w\s]+|\s+)+$|(?:[^\w\s](?:\s+)?){2,}|[^\w\s]{2,}|\s{2,}"
# A whitespace character left directly in front of a period
sanitization_whitespace_remnant=r"\s\."

# Labels matched by the most recent is_sensitive() call(s); the driver loop
# resets it after each sentence
sensitive_types=[]

# define sensitive data
def is_sensitive(sentence):
    """Return True when any pattern in `regex` matches `sentence`.

    Side effect: the label of every matching pattern is appended to the
    module-level `sensitive_types` list, in the order of the `regex` dict.
    """
    hits = [label for label, pattern in regex.items() if re.search(pattern, sentence)]
    sensitive_types.extend(hits)
    return bool(hits)

def sanitize(sentence, sensitive_types:list):
    """Return `sentence` with every match of the listed patterns removed.

    `sensitive_types` holds keys of the module-level `regex` dict; the
    patterns are applied one after another in list order.
    """
    cleaned = sentence
    for label in sensitive_types:
        pattern = regex[label]
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned

# sanitize: read test.docx and build a new document from the cleaned paragraphs
doc = Document("test.docx")
sanitized_doc=Document()
output_paragraph=""

for paragraph in doc.paragraphs:
    original_text=paragraph.text
    # The split consumes the ". " between sentences; it is restored by the
    # join below so that untouched sentences are not glued to their neighbours.
    sentences=re.split(r"\.\s", original_text)
    processed_sentences=[]
    for sentence in sentences:
        if is_sensitive(sentence):
            processed_sentences.append(sanitize(sentence, sensitive_types))
        else:
            processed_sentences.append(sentence)
        # is_sensitive() accumulates labels in this module-level list,
        # so start from an empty list for the next sentence
        sensitive_types=[]

    # Re-insert exactly one separator between sentences; the last sentence
    # keeps whatever terminator it already had.
    output_paragraph=". ".join(processed_sentences)
    # Tidy punctuation/whitespace debris left behind by the removals
    output_paragraph=re.sub(poor_punctuation, "", output_paragraph)
    output_paragraph=re.sub(sanitization_whitespace_remnant, ".", output_paragraph)
    # Paragraphs that end up empty are dropped from the output document
    if output_paragraph:
        sanitized_doc.add_paragraph(output_paragraph)
    output_paragraph=""

# output
sanitized_doc.save("surpressed.docx")
print("Done! check save folder")

## generalisation
**description:** A reduction in the precision of data by rephrasing it into something more vague, for example turning an age into an age range or a precise location into a district/country.  
**implementation:** Design appropriate data categories and rules for translating the data, and suppress records that still stand out after translation.

In [24]:
from docx import Document
import re

# Patterns for each kind of sensitive data, keyed by a human-readable label.
# Raw strings are used so the backslashes reach the regex engine without
# triggering Python's invalid-escape-sequence warning.
regex={
    "age": r"\d+\syears\sold",
    # Digits, optional comma-separated groups, optional decimals; a single
    # digit such as "$5" is a valid amount.
    "money": r"\$\d+(?:,\d+)*(?:\.\d+)?",
    # Any capitalised word, optionally followed by one whitespace character
    "name or place": r"[A-Z][a-z]+\s?"
}

# Leftovers that rewriting tends to create: punctuation/whitespace-only text,
# runs of punctuation, and runs of whitespace
poor_punctuation=r"^(?:[^\w\s]+|\s+)+$|(?:[^\w\s](?:\s+)?){2,}|[^\w\s]{2,}|\s{2,}"
# A whitespace character left directly in front of a period
sanitization_whitespace_remnant=r"\s\."

# Labels matched by the most recent is_sensitive() call(s); the driver loop
# resets it after each sentence
sensitive_types=[]

# define sensitive data
def is_sensitive(sentence):
    """Return True when any pattern in `regex` matches `sentence`.

    Side effect: the label of every matching pattern is appended to the
    module-level `sensitive_types` list, in the order of the `regex` dict.
    """
    hits = [label for label, pattern in regex.items() if re.search(pattern, sentence)]
    sensitive_types.extend(hits)
    return bool(hits)

def sanitize(sentence, sensitive_types:list):
    """Return `sentence` with ages and dollar amounts generalised into ranges.

    `sensitive_types` lists the labels (keys of the module-level `regex`
    dict) that were detected in the sentence:

    * "age": "<n> years old" becomes "<n-10> - <n+10>", with the lower bound
      clamped at 0.
    * "money": a dollar amount becomes "<80%> - <120%>" of its value.
    * "name or place": not generalised yet; the text is left as it is.
    """
    if "age" in sensitive_types:
        def _age_range(match):
            age=int(re.search(r"\d+", match.group()).group())
            # Clamp so that ages under 10 never yield a negative lower bound
            return str(max(age-10, 0))+" - "+str(age+10)

        # Substituting per match (instead of re-searching for each matched
        # text) stops "5 years old" from also rewriting part of "25 years old"
        sentence=re.sub(regex["age"], _age_range, sentence)

    if "money" in sensitive_types:
        def _money_range(match):
            money=float(re.sub(r"\$|,", "", match.group()))
            return str(money-money*0.2)+" - "+str(money+money*0.2)

        # Same per-match substitution, so "$100" cannot corrupt "$1000"
        sentence=re.sub(regex["money"], _money_range, sentence)

    if "name or place" in sensitive_types:
        # No generalisation rule for names/places has been written yet
        pass

    return sentence

# sanitize: read test.docx and build a new document from the generalised paragraphs
doc = Document("test.docx")
sanitized_doc=Document()
output_paragraph=""

for paragraph in doc.paragraphs:
    original_text=paragraph.text
    # The split consumes the ". " between sentences; it is restored by the
    # join below so that untouched sentences are not glued to their neighbours.
    sentences=re.split(r"\.\s", original_text)
    processed_sentences=[]
    for sentence in sentences:
        if is_sensitive(sentence):
            processed_sentences.append(sanitize(sentence, sensitive_types))
        else:
            processed_sentences.append(sentence)
        # is_sensitive() accumulates labels in this module-level list,
        # so start from an empty list for the next sentence
        sensitive_types=[]

    # Re-insert exactly one separator between sentences; the last sentence
    # keeps whatever terminator it already had.
    output_paragraph=". ".join(processed_sentences)
    # Tidy punctuation/whitespace debris left behind by the rewriting
    output_paragraph=re.sub(poor_punctuation, "", output_paragraph)
    output_paragraph=re.sub(sanitization_whitespace_remnant, ".", output_paragraph)
    # Paragraphs that end up empty are dropped from the output document
    if output_paragraph:
        sanitized_doc.add_paragraph(output_paragraph)
    output_paragraph=""

# output
sanitized_doc.save("generalized.docx")
print("Done! check save folder")

Done! check save folder


## data aggregation
**description:** converting a dataset from a list of records to summarised values  
**implementation:** statistical measures can be used. typical ways include using totals or averages, etc.

In [None]:
# WIP