In [None]:
# Install required packages

!pip install pandas
!pip install cryptography
!pip install datetime
!pip install python-dateutil
!pip install faker

In [None]:
# Import required libraries

import pandas as pd
from cryptography.fernet import Fernet
from datetime import datetime
from dateutil.relativedelta import relativedelta
import numpy as np
from faker import Faker

# Section 1: Setup

In [None]:
# Load the HR employee CSV file into a DataFrame.
# SSN is read as text so that identifiers made only of digits are not parsed
# as numbers (which would silently drop leading zeros before masking).
employees = pd.read_csv(
    "HR_employee_data.csv",
    parse_dates=['DOB'],
    dtype={'SSN': str},
)

# Set display options to show all columns
pd.set_option('display.max_columns', None)

# Display the first 5 rows
employees.head()

# Section 2: Masking

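Data masking is the process of modifying sensitive data so that the real values are hidden while the data stays usable, for example by creating fake but realistic versions of it or, as below, by replacing all but the last four characters with asterisks.  It's great to use on Social Security numbers!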

In [None]:
# Mask an SSN so that only its last four characters remain readable
def mask_ssn(ssn):
    """Return `ssn` with every character except the final four replaced by '*'.

    Strings of four characters or fewer are returned unchanged.
    """
    visible_tail = ssn[-4:]
    # Left-pad the visible tail with '*' back up to the original length.
    return visible_tail.rjust(len(ssn), '*')

# Overwrite the SSN column with its masked form (values are cast to text first)
ssn_as_text = employees['SSN'].astype(str)
employees['SSN'] = ssn_as_text.apply(mask_ssn)

# Show the first 5 rows
employees.head()

# Section 3: Encryption

Encryption passes data through a cipher that uses a secret key to produce an unreadable string of characters as a result.  Unlike a hash function, it is reversible: anyone holding the key can decrypt the data.  This is great to use on emails!

In [None]:
# Build a Fernet cipher from a freshly generated random secret key.
# Nothing in this cell stores the key, so it exists only in this session.
key = Fernet.generate_key()
cipher = Fernet(key)

# Make sure the email column holds strings
employees['ADEmail'] = employees['ADEmail'].astype(str)

# Encrypt a single email address with the notebook's Fernet cipher
def encrypt_email(email):
    """Encode `email` to bytes and return the result of `cipher.encrypt` on it."""
    return cipher.encrypt(email.encode())

# Encrypt the existing email addresses in the DataFrame.
# encrypt_email returns bytes; each token is decoded to text so the column
# holds plain strings and is exported to CSV as the token itself rather than
# as a b'...' bytes literal.
employees['ADEmail'] = employees['ADEmail'].apply(
    lambda email: encrypt_email(email).decode('ascii')
)

# Display the first 5 rows
employees.head()

# Section 4: Generalization

Generalization is the deliberate removal of some of the data in order to make it less identifiable.  This works well with dates of birth!

In [None]:
# Make sure the 'DOB' column has a datetime dtype
dob_as_datetime = pd.to_datetime(employees['DOB'])
employees['DOB'] = dob_as_datetime

# Calculate age from date of birth
def calculate_age(dob, today=None):
    """Return the age in completed years for a date of birth.

    Parameters
    ----------
    dob : object exposing ``year``, ``month`` and ``day`` attributes
        The date of birth.
    today : object exposing ``year``, ``month`` and ``day``, optional
        Reference date to measure the age against. Defaults to
        ``datetime.today()``; pass a fixed date for reproducible results.

    Returns
    -------
    The difference in calendar years, minus one if the birthday has not yet
    occurred in the reference year.
    """
    if today is None:
        today = datetime.today()
    # The tuple comparison is True (counts as 1) when this year's birthday is
    # still ahead of the reference date, False (0) otherwise.
    birthday_pending = (today.month, today.day) < (dob.month, dob.day)
    return today.year - dob.year - birthday_pending

# Derive each employee's age from DOB and store it in a new 'Age' column
ages = employees['DOB'].apply(calculate_age)
employees['Age'] = ages

# Show the first 5 rows
employees.head()

In [None]:
# Define age ranges and alias for generalization
age_ranges = {
    (20, 29): '20s',
    (30, 39): '30s',
    (40, 49): '40s',
    (50, 59): '50s',
    (60, 69): '60s',
    (70, 79): '70s',
    (80, 89): '80s'
}

# Apply generalization to the 'Age' column
def generalize_age(age):
    """Map an age to its label in ``age_ranges``.

    Parameters
    ----------
    age : number
        Age in years. Fractional ages are truncated to whole years first.

    Returns
    -------
    str or None
        The matching label from ``age_ranges``; ``'Under <lowest bound>'`` or
        ``'Over <highest bound>'`` for ages outside every configured range
        (instead of silently producing a missing value); ``None`` when the
        age itself is missing.
    """
    if pd.isna(age):
        return None

    # Work in whole years so a value such as 29.5 still lands in a bucket.
    whole_years = int(age)
    for (lower, upper), category in age_ranges.items():
        if lower <= whole_years <= upper:
            return category

    # No configured range matched: report which side of the table it fell on.
    youngest = min(lower for lower, _ in age_ranges)
    oldest = max(upper for _, upper in age_ranges)
    if whole_years < youngest:
        return f'Under {youngest}'
    if whole_years > oldest:
        return f'Over {oldest}'
    # Only reachable if age_ranges is edited to contain a gap between ranges.
    return None

# Store the generalized age bracket in a new 'AgeRange' column
age_brackets = employees['Age'].apply(generalize_age)
employees['AgeRange'] = age_brackets

# Remove the precise DOB and Age columns
employees.drop(columns=['DOB', 'Age'], inplace=True)

# Show the first 5 rows
employees.head()

# Section 5: Perturbation

Perturbation applies round-numbering methods and adds random noise.  It's great on data like salary!

In [None]:
# Set a seed for reproducibility
np.random.seed(42)  # You can use any integer value as the seed, but we will use 42 since it's the meaning of life

# Define perturbation method for salary column (adding random noise)
def perturb_salary(salary, scale=100000):
    """Return `salary` plus zero-mean Gaussian noise, rounded to a whole number.

    Parameters
    ----------
    salary : number
        The original salary value.
    scale : number, optional
        Standard deviation of the noise, in the same units as `salary`.
        Defaults to 100000, the value this notebook has always used.

    Returns
    -------
    The rounded, perturbed salary. A missing salary is returned unchanged,
    because rounding a NaN would raise an error.
    """
    if pd.isna(salary):
        return salary
    # Draw from the global NumPy random state, which np.random.seed controls.
    noise = np.random.normal(loc=0, scale=scale)
    return round(salary + noise)

# Replace each value in 'Salary' with its noise-perturbed counterpart
perturbed_salaries = employees['Salary'].apply(perturb_salary)
employees['Salary'] = perturbed_salaries

# Show the first 5 rows
employees.head()

# Section 6: Pseudonymization

Pseudonymization replaces data with pseudonyms, or placeholder values.  Names are great to use for this technique!

In [None]:
# Seed Faker so the generated names are reproducible
Faker.seed(42)  # any integer works as the seed; 42 is used throughout this notebook

# Faker instance used to produce the synthetic names
faker = Faker()

# One replacement first name per row, then one replacement last name per row
row_count = len(employees)
replacement_first_names = [faker.first_name() for _ in range(row_count)]
replacement_last_names = [faker.last_name() for _ in range(row_count)]

# Map each (FirstName, LastName) pair to its (first, last) replacement pair;
# when a pair occurs more than once, the replacement from its last row is kept
name_mapping = {
    (first, last): (new_first, new_last)
    for first, last, new_first, new_last in zip(
        employees['FirstName'],
        employees['LastName'],
        replacement_first_names,
        replacement_last_names,
    )
}

# Look up the replacement names for one row of the DataFrame
def pseudonymize_names(row):
    """Return the `name_mapping` entry for the row's (FirstName, LastName) pair."""
    original_name = (row['FirstName'], row['LastName'])
    return name_mapping[original_name]

# Pseudonymize names by replacing with replacement names
employees[['Pseudonym_First_Name', 'Pseudonym_Last_Name']] = employees.apply(pseudonymize_names, axis=1, result_type='expand')

# Drop the original 'FirstName' and 'LastName' columns BEFORE saving, so the
# exported file does not contain the real names next to their pseudonyms
employees.drop(['FirstName', 'LastName'], axis=1, inplace=True)

# Save the pseudonymized dataset
employees.to_csv('pseudonymized_employee_data.csv', index=False)

# Display the first 5 rows
employees.head()

# Conclusion

Now all of our sensitive data is anonymized, YAY!