# Data Anonymization techniques

- Masking: Hiding part of sensitive data (e.g., email, SSN, phone).
- Hashing: Replacing data with irreversible hashed values (ideal for IDs).
- Tokenization/Pseudonymization: Replacing sensitive values with random tokens that can be reversed later using a lookup table.
- Redaction/Removal: Completely removing sensitive columns or rows.
- Generalization: Reducing precision to hide exact values (e.g., exact age → age range).
- Noise Addition: Altering values slightly to hide exact data (esp. for analytics).
- NER (Named Entity Recognition): Automatically detecting sensitive entities (names, emails, SSNs, ...) so they can be anonymized.

[Microsoft Presidio](https://microsoft.github.io/presidio/)

In [0]:
# setup - optional

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Configure the application name first, then obtain the session from the
# builder via getOrCreate().
session_builder = SparkSession.builder.appName("UsersAnynomization")
spark = session_builder.getOrCreate()



In [0]:
# load the user data
from pyspark.sql.functions import *

# Read the users CSV: the first row is treated as the header and column
# types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
users_df = csv_reader.csv("/Volumes/workspace/2235-wk3/optional")


In [0]:
# Masking: replace the local part of the e-mail (everything before "@")
# with a fixed "*****" mask, keep the result as "email_masked", and drop
# the raw "email" column.

masked_email = regexp_replace("email", r"(^[^@]+)", "*****")
df_anonymous = users_df.withColumn("email_masked", masked_email)
df_anonymous = df_anonymous.drop("email")

# Preview the first 9 rows.
df_anonymous.show(9)


In [0]:
# Hashing: replace the SSN and the credit-card number with their SHA-256
# digests, then drop the raw columns.
# NOTE(review): these are plain, unsalted digests; values from a small input
# space (such as SSNs) may be recoverable by brute force - confirm this is
# acceptable for the intended use.

ssn_digest = sha2(col("ssn"), 256)
# The credit-card column is cast to string before hashing.
card_digest = sha2(col("credit_card").cast("string"), 256)

df_anonymous = (
    df_anonymous
    .withColumn("ssn_hashed", ssn_digest)
    .withColumn("credit_card_hashed", card_digest)
    .drop("ssn", "credit_card")
)

# Preview the first 9 rows.
df_anonymous.show(9)




In [0]:
# Generalization: bucket the exact age into a coarse range ("age_range"),
# then drop the precise "age" and "dob" columns.
#
# The last bucket is an explicit `>= 50` condition instead of
# otherwise("50+"): a comparison against a NULL age evaluates to NULL, so
# with otherwise() every row with a missing age was labelled "50+". With no
# otherwise() clause, rows that match no condition (NULL age) get a NULL
# age_range instead of a fabricated bucket.

age_col = col("age")
age_bucket = (
    when(age_col < 18, "<18")
    .when(age_col < 30, "18-29")
    .when(age_col < 50, "30-49")
    .when(age_col >= 50, "50+")
)

df_anonymous = df_anonymous.withColumn("age_range", age_bucket).drop("age", "dob")




In [0]:
# Noise addition: perturb each salary with zero-centred uniform noise, store
# it as "salary_noise" (double), and drop the exact "salary" column.
#
# rand() is uniform in [0, 1), so `rand() * (2 * w) - w` is uniform in
# [-w, w). The previous expression, `rand() * 100000 - 500`, produced noise
# in [-500, 99500): it was almost always positive and inflated salaries by
# roughly 50k on average instead of merely blurring them.

SALARY_NOISE_HALF_WIDTH = 500  # maximum absolute perturbation per salary

salary_jitter = rand() * (2 * SALARY_NOISE_HALF_WIDTH) - SALARY_NOISE_HALF_WIDTH
noisy_salary = (col("salary") + salary_jitter).cast("double")

df_anonymous = df_anonymous.withColumn("salary_noise", noisy_salary).drop("salary")

# Preview the first 9 rows.
df_anonymous.show(9)



In [0]:
# Persist the anonymized DataFrame as the table "users_cleaned".
# No save mode is set here, so the writer's default mode applies.
# NOTE(review): presumably a re-run fails if the table already exists -
# confirm whether re-runs are expected to overwrite.

table_writer = df_anonymous.write
table_writer.saveAsTable("users_cleaned")


In [0]:
# Install the Presidio packages, Faker and pandas; if that succeeds (&&),
# download the spaCy "en_core_web_lg" model.
! pip install presidio-analyzer presidio_structured presidio-anonymizer faker pandas && python -m spacy download en_core_web_lg


In [0]:
# NER

import pandas as pd
from faker import Faker
from presidio_structured import StructuredEngine, PandasAnalysisBuilder
from presidio_anonymizer.entities import OperatorConfig
from datetime import datetime


# Convert the original (non-anonymized) Spark DataFrame to a pandas
# DataFrame for the Presidio structured engine used below.
pandas_users_df = users_df.toPandas()


In [0]:

# Cast every column to string.
# NOTE(review): presumably so the analyzer sees text in every column -
# confirm how missing values should be represented after the cast.
pandas_users_df = pandas_users_df.astype(str)

In [0]:
# Engine that applies the anonymization operators to the pandas DataFrame.
pandas_engine = StructuredEngine()

# Analyze the DataFrame; the resulting analysis is passed to
# pandas_engine.anonymize() together with the operators.
analysis_builder = PandasAnalysisBuilder()
tabular_analysis = analysis_builder.generate_analysis(pandas_users_df)

In [0]:
# Faker instance used to generate replacement values.
fake = Faker()

# Anonymization operator per detected entity type:
#   - PERSON / US_BANK_NUMBER: replaced with the fixed string "REDACTED".
#   - EMAIL_ADDRESS / US_SSN / DATE_TIME: replaced with a freshly generated
#     fake value (the original value `x` is ignored).
# Every "custom" lambda must return a str. date_between_dates() returns a
# date object, so it is wrapped in str() to yield "YYYY-MM-DD"; returning the
# raw date object is rejected by the custom operator's validation.
operators = {
    "PERSON": OperatorConfig("replace", {"new_value": "REDACTED"}),
    "EMAIL_ADDRESS": OperatorConfig("custom", {"lambda": lambda x: fake.safe_email()}),
    "US_SSN": OperatorConfig("custom", {"lambda": lambda x: fake.ssn()}),
    "DATE_TIME": OperatorConfig(
        "custom",
        {
            "lambda": lambda x: str(
                fake.date_between_dates(
                    date_start=datetime(1940, 1, 1),
                    date_end=datetime(2010, 1, 1),
                )
            )
        },
    ),
    "US_BANK_NUMBER": OperatorConfig("replace", {"new_value": "REDACTED"}),
}

In [0]:
# Apply the per-entity operators to the pandas DataFrame, guided by the
# analysis generated earlier; the result is stored in anonymized_pd_df.
anonymized_pd_df = pandas_engine.anonymize(pandas_users_df, tabular_analysis, operators=operators)



In [0]:
# Bring the Presidio-anonymized pandas DataFrame back into Spark.
anonymized_users_df = spark.createDataFrame(anonymized_pd_df)

# Display the result without the "age" column (the stored DataFrame keeps it).
preview_df = anonymized_users_df.drop("age")
preview_df.show()