### Step-by-step anonymisation of MVR data

#### TODOS:

- Hash number plate, look at: https://towardsdatascience.com/anonymise-sensitive-data-in-a-pandas-dataframe-column-with-hashlib-8e7ef397d91f
- Apply k-anonymity (e.g., Mondrian) to selected columns, such as Age, Year, or
- Put Age into pre-defined groups, eg., 0-14, 15-24, 25 - 44, 45-64, 65+
- Apply Faker on names
- Create an abstract ColumnAnonymiser class, takes column data type, and target policy as inputs. Implement AgeAnonymiser, NZNumberPlateAnonymiser

In [14]:
import pandas as pd
import csv
import re

from collections import defaultdict

In [15]:
df = pd.read_csv('mvr_synthetic_data.csv')

In [16]:
df.head()

Unnamed: 0,Name,CarMake,CarModel,Year,NumberPlate,Gender,Age
0,John Smith,Toyota,Corolla,2017,ABC-1234,Male,34
1,Jane Doe,Honda,Civic,2019,XYZ-5678,Female,28
2,Michael Johnson,Ford,Focus,2018,JKL-4321,Male,45
3,Emily Brown,Nissan,Altima,2016,MNO-9876,Female,31
4,Daniel Davis,Chevrolet,Impala,2020,PQR-6543,Male,52


In [17]:
def anonymise_car_number_plates(text):
    def replace_alphabets_numbers(match):
        plate = match.group()
        anonymized_plate = re.sub(r'[A-Za-z]', 'X', plate)
        anonymized_plate = re.sub(r'\d', '0', anonymized_plate)
        return anonymized_plate

    # Define the regular expression pattern to match car number plates
    number_plate_pattern = r'\b[A-Za-z]{3}-\d{4}\b'

    # Replace all occurrences of number plates with the anonymized string
    anonymized_text = re.sub(number_plate_pattern, replace_alphabets_numbers, text)

    return anonymized_text

# Example usage
text = "ABC-1234"
anonymized_text = anonymise_car_number_plates(text)
print(anonymized_text)

XXX-0000


In [18]:
df['anonymised_NumberPlate'] = df['NumberPlate'].apply(lambda x: anonymise_car_number_plates(x))

In [19]:
df.head()

Unnamed: 0,Name,CarMake,CarModel,Year,NumberPlate,Gender,Age,anonymised_NumberPlate
0,John Smith,Toyota,Corolla,2017,ABC-1234,Male,34,XXX-0000
1,Jane Doe,Honda,Civic,2019,XYZ-5678,Female,28,XXX-0000
2,Michael Johnson,Ford,Focus,2018,JKL-4321,Male,45,XXX-0000
3,Emily Brown,Nissan,Altima,2016,MNO-9876,Female,31,XXX-0000
4,Daniel Davis,Chevrolet,Impala,2020,PQR-6543,Male,52,XXX-0000


In [27]:
## use faker_vehicle to anonymise number plate
from faker import Faker
from faker_vehicle import VehicleProvider

# Anonymise the NumberPlate column
df['NumberPlateFaked'] = df['NumberPlate'].apply(lambda x: vehicle_provider.numerify('???-####'))

df.head()

Unnamed: 0,Name,CarMake,CarModel,Year,NumberPlate,Gender,Age,NumberPlateFaked
0,John Smith,Toyota,Corolla,2017,XXX-234,Male,25-44,???-0191
1,Jane Doe,Honda,Civic,2019,XXX-678,Female,15-24,???-0007
2,Michael Johnson,Ford,Focus,2018,XXX-321,Male,25-44,???-2184
3,Emily Brown,Nissan,Altima,2016,XXX-876,Female,25-44,???-6326
4,Daniel Davis,Chevrolet,Impala,2020,XXX-543,Male,45-64,???-5483


In [20]:
def compute_equivalence_classes(df, quasi_identifiers):
    eq_classes = defaultdict(int)
    
    for index, row in df.iterrows():
        key = tuple(row[qi] for qi in quasi_identifiers)
        eq_classes[key] += 1
    
    return eq_classes

def is_k_anonymous(dataset, quasi_identifiers, k):
    eq_classes = compute_equivalence_classes(dataset, quasi_identifiers)
    
    for count in eq_classes.values():
        if count < k:
            return False
            
    return True

In [21]:
# Define the list of quasi-identifiers
# quasi_identifiers = ['CarMake', 'CarModel', 'Year']

# quasi_identifiers = ['anonymised_NumberPlate']

# quasi_identifiers = ['Age', 'Gender']

quasi_identifiers = ['Gender', 'anonymised_NumberPlate']


# Define the k value
k = 3

# Check if the dataset is k-anonymous
k_anonymous = is_k_anonymous(df, quasi_identifiers, k)
print(f"The dataset is {k}-anonymous: {k_anonymous}")

The dataset is 3-anonymous: True


### 2. Apply k-anonymity (e.g., Mondrian) to selected columns, such as Age, Year, or

In [22]:
# Define the list of quasi-identifiers
quasi_identifiers = [
    ['CarMake', 'CarModel', 'Year'],
    ['anonymised_NumberPlate'],
    ['Age', 'Gender'],
    ['Gender', 'anonymised_NumberPlate']
]

# Apply is_k_anonymous to each element in the big list
k = 3  # Specify the desired k value for k-anonymity
for qi in quasi_identifiers:
    is_k = is_k_anonymous(df, qi, k)
    print(f"{qi} is k-anonymous: {is_k}")

['CarMake', 'CarModel', 'Year'] is k-anonymous: False
['anonymised_NumberPlate'] is k-anonymous: True
['Age', 'Gender'] is k-anonymous: False
['Gender', 'anonymised_NumberPlate'] is k-anonymous: True


### 3. Put Age into pre-defined groups, eg., 0-14, 15-24, 25 - 44, 45-64, 65+

In [23]:
# Define age groups
age_groups = {
    '0-14': range(0, 15),
    '15-24': range(15, 25),
    '25-44': range(25, 45),
    '45-64': range(45, 65),
    '65+': range(65, 110)
}

# Define a function to map age to age group
def map_age_to_group(age):
    for group, age_range in age_groups.items():
        if age in age_range:
            return group
    return None  # Return None if age is not in any of the defined age groups

# Apply the age grouping function to the Age column
df['AgeGroup'] = df['Age'].apply(map_age_to_group)
df.head()

Unnamed: 0,Name,CarMake,CarModel,Year,NumberPlate,Gender,Age,anonymised_NumberPlate,AgeGroup
0,John Smith,Toyota,Corolla,2017,ABC-1234,Male,34,XXX-0000,25-44
1,Jane Doe,Honda,Civic,2019,XYZ-5678,Female,28,XXX-0000,25-44
2,Michael Johnson,Ford,Focus,2018,JKL-4321,Male,45,XXX-0000,45-64
3,Emily Brown,Nissan,Altima,2016,MNO-9876,Female,31,XXX-0000,25-44
4,Daniel Davis,Chevrolet,Impala,2020,PQR-6543,Male,52,XXX-0000,45-64


### 3. Apply Faker on names

In [24]:
from faker import Faker
# to create and initialize a faker generator, which can generate data by 
# accessing properties named after the type of data you want.
fake = Faker()

# Apply the fake name generator to the Name column
df['NameFaked'] = df['Name'].apply(lambda x: fake.name())
df.head()

Unnamed: 0,Name,CarMake,CarModel,Year,NumberPlate,Gender,Age,anonymised_NumberPlate,AgeGroup,NameFaked
0,John Smith,Toyota,Corolla,2017,ABC-1234,Male,34,XXX-0000,25-44,Elizabeth Dickson
1,Jane Doe,Honda,Civic,2019,XYZ-5678,Female,28,XXX-0000,25-44,Dr. Marie Potts MD
2,Michael Johnson,Ford,Focus,2018,JKL-4321,Male,45,XXX-0000,45-64,Jeanette Smith
3,Emily Brown,Nissan,Altima,2016,MNO-9876,Female,31,XXX-0000,25-44,Lisa Adams
4,Daniel Davis,Chevrolet,Impala,2020,PQR-6543,Male,52,XXX-0000,45-64,Kyle Garcia


### 4. Create an abstract ColumnAnonymiser class, takes column data type, and target policy as inputs. Implement AgeAnonymiser, NZNumberPlateAnonymiser

In [25]:
from abc import ABC, abstractmethod

class ColumnAnonymiser(ABC):
    def __init__(self, column_data_type, target_policy):
        self.column_data_type = column_data_type
        self.target_policy = target_policy

    @abstractmethod
    def anonymise(self, column_data):
        pass

class AgeAnonymiser(ColumnAnonymiser):
    def anonymise(self, column_data):
        # Implement age anonymisation logic according to the target policy
        if self.target_policy == 'age_group':
            age_groups = {'0-14': range(0, 15), '15-24': range(15, 25), '25-44': range(25, 45), '45-64': range(45, 65), '65+': range(65, 200)}
            age_group_map = {age_range: age_group for age_group, age_ranges in age_groups.items() for age_range in age_ranges}
            return column_data.apply(lambda x: age_group_map.get(x // 10 * 10, 'Unknown'))
        else:
            return column_data

class NZNumberPlateAnonymiser(ColumnAnonymiser):
    def anonymise(self, column_data):
        # Implement New Zealand number plate anonymisation logic according to the target policy
        if self.target_policy == 'faker_vehicle_number_plate':
            return column_data.apply(lambda x: f'XXX-{str(x)[-3:]}') # anonymises by replacing the last three digits with XXX
        
        else:
            return column_data


In [28]:
import pandas as pd
from faker import Faker
from faker_vehicle import VehicleProvider

# new df 
df = pd.read_csv('mvr_synthetic_data.csv')

# Create a fake dataframe
fake = Faker()
vehicle_provider = VehicleProvider(fake)

# Anonymise the Age and NumberPlate columns
age_anonymiser = AgeAnonymiser(column_data_type='age', target_policy='age_group')
df['Age'] = age_anonymiser.anonymise(df['Age'])

number_plate_anonymiser = NZNumberPlateAnonymiser(column_data_type='number_plate', target_policy='last_three_digits')
df['NumberPlate'] = number_plate_anonymiser.anonymise(df['NumberPlate'])

df.head()


Unnamed: 0,Name,CarMake,CarModel,Year,NumberPlate,Gender,Age
0,John Smith,Toyota,Corolla,2017,XXX-234,Male,25-44
1,Jane Doe,Honda,Civic,2019,XXX-678,Female,15-24
2,Michael Johnson,Ford,Focus,2018,XXX-321,Male,25-44
3,Emily Brown,Nissan,Altima,2016,XXX-876,Female,25-44
4,Daniel Davis,Chevrolet,Impala,2020,XXX-543,Male,45-64
