# all technique source code
### technique authorship
*Ayrton's techniques*
1. character masking
2. synthetic data
3. data perturbation  

*Yi Thng's techniques*
1. record suppression
2. generalisation
3. data aggregation

*Ethan's techniques*
1. pseudonymisation
2. swapping
3. attribute suppression

## character masking

### for word documents
**description:**  
**implementation:**

In [None]:
from docx import Document

def mask_data(text):
    """Mask every whitespace-separated token of ``text`` that looks sensitive.

    Each token is classified in order as a name, an email address or a
    number; the first category that matches decides which masking helper is
    applied. Tokens matching none of them are kept unchanged. The tokens are
    re-joined with single spaces, so the original spacing is not preserved.
    """
    pieces = []
    for token in text.split():
        if is_name(token):
            pieces.append(mask_name(token))
        elif is_email(token):
            pieces.append(mask_email(token))
        elif is_number(token):
            pieces.append(mask_number(token))
        else:
            pieces.append(token)
    return ' '.join(pieces).strip()

def is_name(word):
    """Return True when ``word`` starts with an uppercase letter.

    This is only a capitalisation heuristic: any capitalised token counts as
    a name. An empty string yields False instead of raising IndexError.
    """
    # Slicing (rather than indexing) keeps the empty string safe.
    initial = word[:1]
    return initial.isalpha() and initial.isupper()

def is_email(word):
    """Return True when ``word`` contains both an '@' and a '.' character."""
    required_symbols = ('@', '.')
    return all(symbol in word for symbol in required_symbols)

def is_number(word):
    """Return True when ``word`` is all digits or passes is_phone_number()."""
    if word.isdigit():
        return True
    return is_phone_number(word)

def is_phone_number(word):
    """Return True when ``word`` contains at least one digit character."""
    for char in word:
        if char.isdigit():
            return True
    return False

def mask_name(name):
    """Return ``name`` with every alphanumeric character replaced by 'X'.

    Punctuation and other non-alphanumeric characters are kept in place.
    """
    return ''.join('X' if ch.isalnum() else ch for ch in name)

def mask_email(email):
    """Return a run of 'X' characters with the same length as ``email``."""
    return ''.join('X' for _ in email)

def mask_number(number):
    """Return a run of 'X' characters with the same length as ``number``.

    Every character is replaced, including separators such as '+' or '-'.
    """
    return ''.join(['X'] * len(number))

# Driver for this cell: read 'test.docx', mask the text of each paragraph
# with mask_data(), and write the result to 'masked.docx'.
# Open the Word document
doc = Document('test.docx')

# Process each paragraph in the document
# NOTE(review): only doc.paragraphs is visited; text stored elsewhere in the
# document (e.g. tables or headers) is presumably left unmasked - confirm.
for paragraph in doc.paragraphs:
    original_text = paragraph.text
    masked_text = mask_data(original_text)
    # Overwrite the paragraph content with the masked string.
    # NOTE(review): assigning .text likely drops run-level formatting - confirm.
    paragraph.text = masked_text

# Save the modified document
doc.save('masked.docx')
print("Done! Check save folder")

### for excel spreadsheets
**description:**  
**implementation:**

In [None]:
from openpyxl import load_workbook


def mask_string(string):
    """Return a run of '*' characters with the same length as ``string``."""
    return ''.join('*' for _ in string)


def mask_names(sheet):
    """Mask every word of every string cell in ``sheet`` in place.

    Each whitespace-separated word is passed through mask_string() and the
    words are re-joined with single spaces. Non-string cells are untouched.
    """
    for cells in sheet.iter_rows():
        for cell in cells:
            content = cell.value
            if not isinstance(content, str):
                continue
            cell.value = ' '.join(mask_string(token) for token in content.split())


def mask_numbers(sheet):
    """Replace every int/float cell in ``sheet`` with its masked text form.

    The number is converted with str() and masked by mask_string(), so the
    cell ends up holding a string of the same length as that text.
    """
    for cells in sheet.iter_rows():
        for cell in cells:
            if not isinstance(cell.value, (int, float)):
                continue
            as_text = str(cell.value)
            cell.value = mask_string(as_text)


def mask_phone_numbers(sheet):
    """Mask the digits of every string cell in ``sheet`` that contains one.

    Only digit characters are passed through mask_string(); all other
    characters of the cell keep their position and value.
    """
    for cells in sheet.iter_rows():
        for cell in cells:
            content = cell.value
            if not isinstance(content, str):
                continue
            if not any(ch.isdigit() for ch in content):
                continue
            masked_chars = []
            for ch in content:
                masked_chars.append(mask_string(ch) if ch.isdigit() else ch)
            cell.value = ''.join(masked_chars)


def mask_email_addresses(sheet):
    """Mask the part before the first '@' in every string cell containing one.

    The text up to the first '@' is passed through mask_string(); everything
    after it is kept verbatim. Splitting once (instead of on every '@')
    ensures text after a second '@' is no longer silently dropped.
    """
    for row in sheet.iter_rows():
        for cell in row:
            value = cell.value
            if isinstance(value, str) and '@' in value:
                # partition() splits on the first '@' only.
                username, _, remainder = value.partition('@')
                cell.value = mask_string(username) + '@' + remainder


def mask_xlsx_file(filename):
    """Mask every sheet of the workbook at ``filename`` and save a copy.

    The masking passes run in the order names, numbers, phone numbers,
    email addresses. The result is written next to the input as
    ``masked_<original file name>``; the prefix is applied to the file name
    only, so a ``filename`` that includes a directory still resolves.

    NOTE(review): mask_names() replaces every character of every string
    cell, so the later string-based passes appear to have nothing left to
    match - confirm the intended order.
    """
    from pathlib import Path

    workbook = load_workbook(filename)
    for sheet in workbook:
        mask_names(sheet)
        mask_numbers(sheet)
        mask_phone_numbers(sheet)
        mask_email_addresses(sheet)
    source = Path(filename)
    workbook.save(str(source.with_name(f"masked_{source.name}")))


# Usage example
# Masks 'test.xlsx' from the working directory via mask_xlsx_file().
filename = "test.xlsx"
mask_xlsx_file(filename)
print("Done")

## synthetic data

### for word documents
**description:**  
**implementation:**

In [None]:
from faker import Faker
from docx import Document

# Shared Faker generator used by sanitize_text() below.
faker = Faker()

# Example original document
# Loaded from 'test.docx' in the working directory.
document = Document('test.docx')

# Function to sanitize text
def sanitize_text(text):
    """Return synthetic text with the same word count as ``text``.

    One word is drawn from the module-level ``faker`` for each
    whitespace-separated word of the input; the words are joined with
    single spaces. None of the original content is kept.
    """
    word_count = len(text.split())
    return ' '.join(faker.word() for _ in range(word_count))

# Sanitize the document
# Every paragraph's text is replaced by synthetic words from sanitize_text().
for paragraph in document.paragraphs:
    # Sanitize the text in the paragraph
    sanitized_text = sanitize_text(paragraph.text)
    # Replace the original text with the sanitized text
    paragraph.text = sanitized_text

# Save the sanitized document
# The input file is left untouched; output goes to 'synthetic.docx'.
document.save('synthetic.docx')
print("Done, Check save folder")

### for excel spreadsheets
**description:**  
**implementation:**

In [None]:
from faker import Faker
from openpyxl import load_workbook

# Shared Faker generator used by sanitize_data() below.
faker = Faker()

# Example original XLSX file
# Also reused further down to build the output file name.
filename = 'test.xlsx'

# Function to sanitize data
def sanitize_data(data):
    """Return a synthetic replacement for ``data`` chosen by its type.

    str   -> ``faker.word()``
    int   -> ``faker.random_number()``
    float -> ``faker.random_number(digits=2)``
    Any other value (e.g. None) is returned unchanged.
    """
    if isinstance(data, str):
        replacement = faker.word()
    elif isinstance(data, int):
        replacement = faker.random_number()
    elif isinstance(data, float):
        replacement = faker.random_number(digits=2)
    else:
        replacement = data
    return replacement

# Load the workbook
wb = load_workbook(filename)

# Iterate over each sheet in the workbook
for sheet in wb.sheetnames:
    # `sheet` is the sheet name here; look the worksheet up by that name.
    ws = wb[sheet]

    # Iterate over each cell in the sheet
    for row in ws.iter_rows():
        for cell in row:
            # Sanitize the cell value
            cell.value = sanitize_data(cell.value)

# Save the sanitized workbook
# The output name is the input name prefixed with 'sanitized_'.
sanitized_filename = 'sanitized_' + filename
wb.save(sanitized_filename)
print("Done")

## data perturbation

### for word documents
**description:**  
**implementation:**

In [None]:
from docx import Document
import random
import string

def perturb_data(text):
    """Perturb every whitespace-separated token of ``text`` that looks sensitive.

    Tokens are classified in order as name, email address or number and
    handed to the matching perturb_* helper; all other tokens are kept.
    The tokens are re-joined with single spaces.
    """
    pieces = []
    for token in text.split():
        if is_name(token):
            pieces.append(perturb_name(token))
        elif is_email(token):
            pieces.append(perturb_email(token))
        elif is_number(token):
            pieces.append(perturb_number(token))
        else:
            pieces.append(token)
    return ' '.join(pieces).strip()

def is_name(word):
    """Return True when ``word`` starts with an uppercase letter.

    This is only a capitalisation heuristic: any capitalised token counts as
    a name. An empty string yields False instead of raising IndexError.
    """
    # Slicing (rather than indexing) keeps the empty string safe.
    initial = word[:1]
    return initial.isalpha() and initial.isupper()

def is_email(word):
    """Return True when ``word`` has an '@' whose first occurrence precedes the last '.'."""
    at_position = word.find('@')
    if at_position == -1:
        return False
    # rfind() gives -1 when there is no '.', which can never exceed at_position.
    return at_position < word.rfind('.')

def is_number(word):
    """Return True when ``word`` is all digits or passes is_phone_number()."""
    return True if word.isdigit() else is_phone_number(word)

def is_phone_number(word):
    """Return True when ``word`` contains at least one digit character."""
    for char in word:
        if char.isdigit():
            return True
    return False

def perturb_name(name):
    """Return ``name`` with each alphanumeric character swapped for a random ASCII letter.

    Non-alphanumeric characters keep their position, and the length of the
    result always equals the length of the input.
    """
    letters = string.ascii_letters
    return ''.join(random.choice(letters) if ch.isalnum() else ch for ch in name)

def perturb_email(email):
    """Perturb the part of ``email`` before its first '@'.

    The leading part is passed through perturb_name(); the '@' and all text
    after it are kept verbatim. Splitting once on the first '@' avoids the
    ValueError that unpacking ``email.split('@')`` raised for tokens holding
    more than one '@'.
    """
    # `separator` is '' when no '@' is present, so nothing is invented.
    username, separator, domain = email.partition('@')
    return perturb_name(username) + separator + domain

def perturb_number(number):
    """Return ``number`` with each digit replaced by a random ASCII digit.

    Separators and other non-digit characters keep their position.
    """
    digits = string.digits
    return ''.join(random.choice(digits) if ch.isdigit() else ch for ch in number)

# Driver for this cell: read 'test.docx', perturb each paragraph with
# perturb_data(), and write the result to 'perturbed.docx'.
# Open the Word document
doc = Document('test.docx')

# Process each paragraph in the document
# NOTE(review): only doc.paragraphs is visited; text stored elsewhere in the
# document (e.g. tables or headers) is presumably left unchanged - confirm.
for paragraph in doc.paragraphs:
    original_text = paragraph.text
    perturbed_text = perturb_data(original_text)
    # Overwrite the paragraph content with the perturbed string.
    paragraph.text = perturbed_text

# Save the modified document
doc.save('perturbed.docx')
print("Done! Check save folder")

### for excel spreadsheet
**description:**  
**implementation:**

In [None]:
import random
from openpyxl import load_workbook


def perturb_string(string):
    """Return ``string`` with its letters and decimal digits randomly shifted.

    * ASCII letters move forward by 1-5 positions, wrapping inside their own
      case ('z' -> 'a'...), so a letter always stays a letter. The previous
      plain code-point shift turned letters near the end of the alphabet
      into punctuation or the DEL control character.
    * Other alphabetic characters keep the plain code-point shift, which
      may still leave the alphabet.
    * Decimal digits move forward by 1-5 modulo 10. ``isdecimal()`` is used
      because ``isdigit()`` also accepts characters such as '²' that
      ``int()`` cannot convert.
    * Every other character is kept unchanged.
    """
    perturbed_chars = []
    for char in string:
        if char.isalpha():
            step = random.randint(1, 5)
            if 'a' <= char <= 'z':
                base = ord('a')
                perturbed_chars.append(chr((ord(char) - base + step) % 26 + base))
            elif 'A' <= char <= 'Z':
                base = ord('A')
                perturbed_chars.append(chr((ord(char) - base + step) % 26 + base))
            else:
                perturbed_chars.append(chr(ord(char) + step))
        elif char.isdecimal():
            perturbed_chars.append(str((int(char) + random.randint(1, 5)) % 10))
        else:
            perturbed_chars.append(char)
    return ''.join(perturbed_chars)


def perturb_names(sheet):
    """Perturb every word of every string cell in ``sheet`` in place.

    Each whitespace-separated word goes through perturb_string() and the
    words are re-joined with single spaces. Non-string cells are untouched.
    """
    for cells in sheet.iter_rows():
        for cell in cells:
            content = cell.value
            if not isinstance(content, str):
                continue
            cell.value = ' '.join(perturb_string(token) for token in content.split())


def perturb_numbers(sheet):
    """Replace every int/float cell in ``sheet`` with a perturbed text form.

    The number is converted with str() and passed through perturb_string(),
    so the cell ends up holding a string rather than a number.
    """
    for cells in sheet.iter_rows():
        for cell in cells:
            if not isinstance(cell.value, (int, float)):
                continue
            as_text = str(cell.value)
            cell.value = perturb_string(as_text)


def perturb_phone_numbers(sheet):
    """Perturb the digits of every string cell in ``sheet`` that contains one.

    Only digit characters are passed through perturb_string(); all other
    characters of the cell keep their position and value.
    """
    for cells in sheet.iter_rows():
        for cell in cells:
            content = cell.value
            if not isinstance(content, str):
                continue
            if not any(ch.isdigit() for ch in content):
                continue
            new_chars = []
            for ch in content:
                new_chars.append(perturb_string(ch) if ch.isdigit() else ch)
            cell.value = ''.join(new_chars)


def perturb_email_addresses(sheet):
    """Perturb the part before the first '@' in every string cell containing one.

    The text up to the first '@' is passed through perturb_string();
    everything after it is kept verbatim. Splitting once (instead of on
    every '@') ensures text after a second '@' is no longer silently dropped.
    """
    for row in sheet.iter_rows():
        for cell in row:
            value = cell.value
            if isinstance(value, str) and '@' in value:
                # partition() splits on the first '@' only.
                username, _, remainder = value.partition('@')
                cell.value = perturb_string(username) + '@' + remainder


def perturb_xlsx_file(filename):
    """Perturb every sheet of the workbook at ``filename`` and save a copy.

    The passes run in the order names, numbers, phone numbers, email
    addresses, so a cell may be perturbed by more than one of them. The
    result is written next to the input as ``perturbed_<original file
    name>``; the prefix is applied to the file name only, so a ``filename``
    that includes a directory still resolves.
    """
    from pathlib import Path

    workbook = load_workbook(filename)
    for sheet in workbook:
        perturb_names(sheet)
        perturb_numbers(sheet)
        perturb_phone_numbers(sheet)
        perturb_email_addresses(sheet)
    source = Path(filename)
    workbook.save(str(source.with_name(f"perturbed_{source.name}")))


# Usage example
# Perturbs 'test.xlsx' from the working directory via perturb_xlsx_file().
filename = "test.xlsx"
perturb_xlsx_file(filename)
print("Done")

## record suppression

### for word documents
**description:** record suppression refers to the removal of an entire record from the dataset. In contrast to most techniques, this technique affects multiple attributes at the same time.  
**implementation:** delete an entire record. "Redacting" may not be sufficient if the underlying data remains accessible

In [None]:
from docx import Document
import re

# Patterns for the kinds of sensitive data looked for in each sentence.
# Raw strings are used so regex escapes such as \S, \d and \$ are not parsed
# as (invalid) Python string escapes; the pattern text itself is unchanged.
regex={
    # run of non-space characters, '@', run of non-space characters
    "email": r"\S+@(\S+|\.\S+)",
    # e.g. "37 years old"
    "age": r"\d+\syears\sold",
    # '$' followed by at least two digits, optional comma groups and decimals
    "money": r"\$\d+(?:\,\d+|\d+)+(?:\.\d+)?",
    # optional "+65" prefix, then eight digits optionally split 4-4
    "phone number": r"(?:\+65|\+65\s)?\d{4}\s?\d{4}",
    # any capitalised word, plus one optional trailing whitespace character
    "name or place": r"[A-Z][a-z]+\s?"
}

# Leftovers after removal: punctuation/whitespace-only text, runs of two or
# more punctuation marks (optionally space-separated), or repeated whitespace.
poor_punctuation=r"^(?:[^\w\s]+|\s+)+$|(?:[^\w\s](?:\s+)?){2,}|[^\w\s]{2,}|\s{2,}"
# A whitespace character left directly in front of a period.
sanitization_whitespace_remnant=r"\s\."

# Filled by is_sensitive() with the keys of the patterns that matched.
sensitive_types=[]

# define sensitive data
def is_sensitive(sentence):
    """Report whether ``sentence`` matches any pattern in ``regex``.

    Side effect: the key of every matching pattern is appended to the
    module-level ``sensitive_types`` list, which the caller is expected to
    reset between sentences.
    """
    matched_keys = [key for key, pattern in regex.items() if re.search(pattern, sentence)]
    sensitive_types.extend(matched_keys)
    return bool(matched_keys)

def sanitize(sentence, sensitive_types:list):
    """Delete every match of the listed pattern kinds from ``sentence``.

    ``sensitive_types`` holds keys of the module-level ``regex`` table; the
    patterns are applied in list order and each match is replaced by ''.
    """
    cleaned = sentence
    for kind in sensitive_types:
        pattern = regex[kind]
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned

# sanitize
doc = Document("test.docx")
sanitized_doc=Document()
output_paragraph=""

for paragraph in doc.paragraphs:
    original_text=paragraph.text
    sentences=re.split("\.\s", original_text)
    for sentence in sentences:
        if is_sensitive(sentence):
            output_paragraph+=sanitize(sentence, sensitive_types)+". "
        else:
            output_paragraph+=sentence
        sensitive_types=[]

    output_paragraph=re.sub(poor_punctuation, "", output_paragraph)
    output_paragraph=re.sub(sanitization_whitespace_remnant, ".", output_paragraph)
    if len(output_paragraph)>0:
        sanitized_doc.add_paragraph(output_paragraph)
    output_paragraph=""

# output
sanitized_doc.save("surpressed.docx")
print("Done! check save folder")

### for excel spreadsheets
**description:**  
**implementation:**

In [None]:
# WIP

## generalisation

### for word documents
**description:** a reduction in the precision of data by rephrasing it into something more vague, for example turning an age into an age range or a precise location into a district/country.  
**implementation:** design appropriate data categories and rules for translating data, and suppress records that still stand out after translation

In [None]:
from docx import Document
import re

# Patterns for the kinds of data this cell looks for in each sentence.
# Raw strings are used so regex escapes such as \d and \$ are not parsed as
# (invalid) Python string escapes; the pattern text itself is unchanged.
regex={
    # e.g. "37 years old"
    "age": r"\d+\syears\sold",
    # '$' followed by at least two digits, optional comma groups and decimals
    "money": r"\$\d+(?:\,\d+|\d+)+(?:\.\d+)?",
    # any capitalised word, plus one optional trailing whitespace character
    "name or place": r"[A-Z][a-z]+\s?"
}

# Leftovers after rewriting: punctuation/whitespace-only text, runs of two or
# more punctuation marks (optionally space-separated), or repeated whitespace.
poor_punctuation=r"^(?:[^\w\s]+|\s+)+$|(?:[^\w\s](?:\s+)?){2,}|[^\w\s]{2,}|\s{2,}"
# A whitespace character left directly in front of a period.
sanitization_whitespace_remnant=r"\s\."

# Filled by is_sensitive() with the keys of the patterns that matched.
sensitive_types=[]

# define sensitive data
def is_sensitive(sentence):
    """Report whether ``sentence`` matches any pattern in ``regex``.

    Side effect: the key of every matching pattern is appended to the
    module-level ``sensitive_types`` list, which the caller is expected to
    reset between sentences.
    """
    matched_keys = [key for key, pattern in regex.items() if re.search(pattern, sentence)]
    sensitive_types.extend(matched_keys)
    return bool(matched_keys)

def sanitize(sentence, sensitive_types:list):
    """Generalise the ages and money amounts found in ``sentence``.

    ``sensitive_types`` holds keys of the module-level ``regex`` table.

    * "age": each "<n> years old" becomes "<n-10> - <n+10>", with the lower
      bound clamped at 0 so young ages no longer give a negative bound.
    * "money": each "$<x>" becomes "<0.8x> - <1.2x>", printed as floats.
    * "name or place": not generalised; no rule exists for it yet.

    Each match is rewritten where it was found. The earlier approach
    re-searched the sentence for the matched text, which corrupted a longer
    amount sharing a prefix with a shorter one (e.g. "$10" inside "$100").
    """
    def _age_range(match):
        age=int(re.search(r"\d+", match.group()).group())
        return str(max(age-10, 0))+" - "+str(age+10)

    def _money_range(match):
        # Strip the currency sign and thousands separators before parsing.
        money=float(re.sub(r"\$|,", "", match.group()))
        return str(money-money*0.2)+" - "+str(money+money*0.2)

    if "age" in sensitive_types:
        sentence=re.sub(regex["age"], _age_range, sentence)

    if "money" in sensitive_types:
        sentence=re.sub(regex["money"], _money_range, sentence)

    return sentence

# sanitize
doc = Document("test.docx")
sanitized_doc=Document()
output_paragraph=""

for paragraph in doc.paragraphs:
    original_text=paragraph.text
    sentences=re.split("\.\s", original_text)
    for sentence in sentences:
        if is_sensitive(sentence):
            output_paragraph+=sanitize(sentence, sensitive_types)+". "
        else:
            output_paragraph+=sentence
        sensitive_types=[]

    output_paragraph=re.sub(poor_punctuation, "", output_paragraph)
    output_paragraph=re.sub(sanitization_whitespace_remnant, ".", output_paragraph)
    if len(output_paragraph)>0:
        sanitized_doc.add_paragraph(output_paragraph)
    output_paragraph=""

# output
sanitized_doc.save("generalized.docx")
print("Done! check save folder")

### for excel spreadsheets
**description:**  
**implementation:**

In [None]:
# WIP

## data aggregation

### for excel spreadsheets
**description:** converting a dataset from a list of records to summarised values  
**implementation:** statistical measures can be used. typical ways include using totals or averages, etc.

In [None]:
# WIP

## pseudonymization

### for word documents
**description:**  
**implementation:**

In [None]:
import string
import random
from docx import Document

def mask_data(text):
    """Replace every sensitive-looking token of ``text`` with a pseudonym.

    A token is replaced via generate_pseudonym() when it is classified as a
    name, an email address or a number (checked in that order); all other
    tokens are kept. The tokens are re-joined with single spaces.
    """
    pieces = []
    for token in text.split():
        if is_name(token):
            pieces.append(generate_pseudonym(token))
        elif is_email(token):
            pieces.append(generate_pseudonym(token))
        elif is_number(token):
            pieces.append(generate_pseudonym(token))
        else:
            pieces.append(token)
    return ' '.join(pieces).strip()

def is_name(word):
    """Return True when ``word`` starts with an uppercase letter.

    This is only a capitalisation heuristic: any capitalised token counts as
    a name. An empty string yields False instead of raising IndexError.
    """
    # Slicing (rather than indexing) keeps the empty string safe.
    initial = word[:1]
    return initial.isalpha() and initial.isupper()

def is_email(word):
    """Return True when ``word`` contains both an '@' and a '.' character."""
    if '@' not in word:
        return False
    return '.' in word

def is_number(word):
    """Return True when ``word`` is all digits or passes is_phone_number()."""
    if word.isdigit():
        return True
    return is_phone_number(word)

def is_phone_number(word):
    """Return True when ``word`` contains at least one digit character."""
    for char in word:
        if char.isdigit():
            return True
    return False

def generate_pseudonym(word):
    """Return a random string of ASCII letters as long as ``word``.

    The result is drawn afresh on every call, so the same input does not
    map to the same pseudonym twice.
    """
    letters = [random.choice(string.ascii_letters) for _ in word]
    return ''.join(letters)

# Driver for this cell: read 'test.docx', pseudonymise each paragraph with
# mask_data(), and write the result to 'pseudonyms.docx'.
# Open the Word document
doc = Document('test.docx')

# Process each paragraph in the document
# NOTE(review): only doc.paragraphs is visited; text stored elsewhere in the
# document (e.g. tables or headers) is presumably left unchanged - confirm.
for paragraph in doc.paragraphs:
    original_text = paragraph.text
    masked_text = mask_data(original_text)
    # Overwrite the paragraph content with the pseudonymised string.
    paragraph.text = masked_text

# Save the modified document
doc.save('pseudonyms.docx')
print("Done! Check save folder.")

### for excel spreadsheet
**description:**  
**implementation:**

note: example dataset used can be downloaded from the following link
https://www.kaggle.com/datasets/thedevastator/demographical-shopping-purchases-data

The example data is very large (80 thousand rows, 9 columns), so the code can take up to 14 minutes 30 seconds to run. At its slowest the algorithm needs about 10.875 milliseconds per row (870 s / 80,000 rows).

In [None]:
import pandas as pd
import re
df=pd.read_csv("Demographic_Data_Orig.csv")

# NOTE: despite its name, this pattern matches IPv4-style dotted quads
# (four groups of 1-3 digits separated by dots), not ages.
age_regex=r"\d{1,3}(?:\.\d{1,3}){3}"
ip_pattern=re.compile(age_regex)

# Collect the sanitized rows in a list and build the DataFrame once.
# Concatenating one row at a time copies the whole frame on every iteration,
# which is quadratic in the number of rows.
sanitized_rows=[]
for rowIndex, row in df.iterrows():
    # Every address found in a row is replaced by the same per-row pseudonym.
    pseudonym="IP"+str(rowIndex)
    sanitized_rows.append({
        # All values are stringified; cells without a match stay as that text.
        str(columnIndex): ip_pattern.sub(pseudonym, str(value))
        for columnIndex, value in row.items()
    })

sanitized_df=pd.DataFrame(sanitized_rows, columns=[str(column) for column in df.columns])

display(sanitized_df)

sanitized_df.to_excel("sanitized_output.xlsx")

## swapping

### for word documents
**description:**  
**implementation:**

In [None]:
import random
from docx import Document

def swap_data(text):
    """Shuffle the characters of every sensitive-looking token in ``text``.

    Tokens are classified in order as name, email address or number and
    handed to the matching swap_* helper; all other tokens are kept. The
    tokens are re-joined with single spaces.
    """
    def _swap(token):
        if is_name(token):
            return swap_name(token)
        if is_email(token):
            return swap_email(token)
        if is_number(token):
            return swap_number(token)
        return token

    return ' '.join([_swap(token) for token in text.split()])

def is_name(word):
    """Return True when ``word`` starts with an uppercase letter.

    This is only a capitalisation heuristic: any capitalised token counts as
    a name. An empty string yields False instead of raising IndexError.
    """
    # Slicing (rather than indexing) keeps the empty string safe.
    initial = word[:1]
    return initial.isalpha() and initial.isupper()

def is_email(word):
    """Return True when ``word`` contains both an '@' and a '.' character."""
    required_symbols = ('@', '.')
    return all(symbol in word for symbol in required_symbols)

def is_number(word):
    """Return True when ``word`` is all digits or passes is_phone_number()."""
    return True if word.isdigit() else is_phone_number(word)

def is_phone_number(word):
    """Return True when ``word`` contains at least one digit character."""
    for char in word:
        if char.isdigit():
            return True
    return False

def swap_name(name):
    """Return the characters of ``name`` in a random order."""
    shuffled = [*name]
    random.shuffle(shuffled)
    return ''.join(shuffled)

def swap_email(email):
    """Shuffle the characters of ``email`` that come before its first '@'.

    The '@' and everything after it are kept verbatim. Splitting once on
    the first '@' fixes the earlier behaviour of discarding all text after
    a second '@'. Input without any '@' is shuffled as a whole instead of
    raising IndexError.
    """
    # `separator` is '' when no '@' is present, so nothing is invented.
    username, separator, domain = email.partition('@')
    username_characters = list(username)
    random.shuffle(username_characters)
    return ''.join(username_characters) + separator + domain

def swap_number(number):
    """Return the characters of ``number`` in a random order.

    Every character takes part in the shuffle, including separators such as
    '+' or '-'.
    """
    shuffled = [*number]
    random.shuffle(shuffled)
    return ''.join(shuffled)

# Driver for this cell: read 'test.docx', shuffle sensitive tokens in each
# paragraph with swap_data(), and write the result to 'swapped.docx'.
# Open the Word document
doc = Document('test.docx')

# Process each paragraph in the document
# NOTE(review): only doc.paragraphs is visited; text stored elsewhere in the
# document (e.g. tables or headers) is presumably left unchanged - confirm.
for paragraph in doc.paragraphs:
    original_text = paragraph.text
    swapped_text = swap_data(original_text)
    # Overwrite the paragraph content with the swapped string.
    paragraph.text = swapped_text

# Save the modified document
doc.save('swapped.docx')
print("Done! Check save folder.")


### for excel spreadsheets
**description:**  
**implementation:**

note: example dataset used can be downloaded from the following link
https://www.kaggle.com/datasets/thedevastator/demographical-shopping-purchases-data

In [None]:
import pandas as pd
import re
df=pd.read_csv("Demographic_Data_Orig.csv")

# Columns whose name starts with "old" or "age" (re.match anchors at the
# start of the name and is case-sensitive) are treated as age columns.
age_regex="old|age"
all_sensitive_columns=[column for column in df if re.match(age_regex, column)]

def _to_age_range(value):
    """Return the age range text around ``value`` rounded to the nearest ten.

    Missing values are passed through unchanged. The lower bound is clamped
    at 0 so very young ages do not produce a negative bound.
    """
    if pd.isna(value):
        return value
    rounded=int(round(float(value), -1))
    return str(max(rounded-10, 0))+"-"+str(rounded+10)

for sensitive_column in all_sensitive_columns:
    # Compute the range from the numeric value directly. The earlier
    # digit-by-digit regex substitution repeated the range for values with
    # three digits or a decimal part, and wrote strings element-by-element
    # into the numeric column.
    df[sensitive_column]=df[sensitive_column].map(_to_age_range)

display(df)
# Save the frame processed in this cell; the earlier code saved
# `sanitized_df`, a variable this cell never defines.
df.to_excel("sanitized_output.xlsx")

## attribute suppression

### for word documents
**description:**  
**implementation:**

In [None]:
from docx import Document

def suppress_data(text):
    """Return ``text`` with every sensitive-looking token removed.

    A token is dropped when it is classified as a name, an email address or
    a number (checked in that order). The remaining tokens are joined with
    single spaces.
    """
    kept = [
        token
        for token in text.split()
        if not (is_name(token) or is_email(token) or is_number(token))
    ]
    return ' '.join(kept).strip()

def is_name(word):
    """Return True when ``word`` starts with an uppercase letter.

    This is only a capitalisation heuristic: any capitalised token counts as
    a name. An empty string yields False instead of raising IndexError.
    """
    # Slicing (rather than indexing) keeps the empty string safe.
    initial = word[:1]
    return initial.isalpha() and initial.isupper()

def is_email(word):
    """Return True when ``word`` contains both an '@' and a '.' character."""
    if '@' not in word:
        return False
    return '.' in word

def is_number(word):
    """Return True when ``word`` is all digits or passes is_phone_number()."""
    if word.isdigit():
        return True
    return is_phone_number(word)

def is_phone_number(word):
    """Return True when ``word`` contains at least one digit character."""
    for char in word:
        if char.isdigit():
            return True
    return False

# Driver for this cell: read 'test.docx', drop sensitive tokens from each
# paragraph with suppress_data(), and write the result to 'suppress.docx'.
# Open the Word document
doc = Document('test.docx')

# Process each paragraph in the document
# NOTE(review): only doc.paragraphs is visited; text stored elsewhere in the
# document (e.g. tables or headers) is presumably left unchanged - confirm.
for paragraph in doc.paragraphs:
    original_text = paragraph.text
    suppressed_text = suppress_data(original_text)
    # Overwrite the paragraph content with the suppressed string.
    paragraph.text = suppressed_text

# Save the modified document
doc.save('suppress.docx')
print("Done! Check save folder.")


### for excel spreadsheets
**description:**  
**implementation:**

In [None]:
import random
from openpyxl import load_workbook


def suppress_names(sheet):
    """Replace purely alphabetic words in the string cells of ``sheet`` with 'X' runs.

    Each string cell is split on whitespace; words made only of letters are
    replaced by as many 'X' characters, other words are kept, and the words
    are re-joined with single spaces. Non-string cells are untouched.
    """
    for cells in sheet.iter_rows():
        for cell in cells:
            content = cell.value
            if not isinstance(content, str):
                continue
            rewritten = []
            for token in content.split():
                rewritten.append('X' * len(token) if token.isalpha() else token)
            cell.value = ' '.join(rewritten)


def suppress_numbers(sheet):
    """Replace every int/float cell value in ``sheet`` with the text 'N/A'."""
    for cells in sheet.iter_rows():
        for cell in cells:
            if not isinstance(cell.value, (int, float)):
                continue
            cell.value = 'N/A'


def suppress_phone_numbers(sheet):
    """Replace each digit with 'X' in every string cell of ``sheet`` that contains one.

    Non-digit characters keep their position; cells without digits and
    non-string cells are untouched.
    """
    for cells in sheet.iter_rows():
        for cell in cells:
            content = cell.value
            if not isinstance(content, str):
                continue
            if not any(ch.isdigit() for ch in content):
                continue
            cell.value = ''.join(['X' if ch.isdigit() else ch for ch in content])


def suppress_email_addresses(sheet):
    """Overwrite every string cell of ``sheet`` that contains '@' with 'X' characters.

    The whole cell text is replaced, keeping only its length.
    """
    for cells in sheet.iter_rows():
        for cell in cells:
            content = cell.value
            if not isinstance(content, str):
                continue
            if '@' not in content:
                continue
            cell.value = 'X' * len(content)


def suppress_attributes_xlsx_file(filename):
    """Suppress sensitive values in every sheet of ``filename`` and save a copy.

    The passes run in the order names, numbers, phone numbers, email
    addresses. The result is written next to the input as
    ``suppressed_<original file name>``; the prefix is applied to the file
    name only, so a ``filename`` that includes a directory still resolves.
    """
    from pathlib import Path

    workbook = load_workbook(filename)
    for sheet in workbook:
        suppress_names(sheet)
        suppress_numbers(sheet)
        suppress_phone_numbers(sheet)
        suppress_email_addresses(sheet)
    source = Path(filename)
    workbook.save(str(source.with_name(f"suppressed_{source.name}")))


# Usage example
# Processes 'test.xlsx' from the working directory via
# suppress_attributes_xlsx_file().
filename = "test.xlsx"
suppress_attributes_xlsx_file(filename)
print("Done")