### Step-by-step anonymisation of MVR data

#### TODOS:

- Hash the number plates (e.g. with `hashlib`); see: https://towardsdatascience.com/anonymise-sensitive-data-in-a-pandas-dataframe-column-with-hashlib-8e7ef397d91f
- Apply k-anonymity (e.g., Mondrian) to selected columns, such as `Age` and `Year`
- Replace names with synthetic ones using Faker

In [1]:
import pandas as pd
import csv
import re

from collections import defaultdict

In [2]:
# Load the dataset from a CSV file in the current working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests
# the CSV was written together with its index -- consider index_col=0; confirm
# before changing, since it alters the resulting columns.
df = pd.read_csv('mvr_synthetic_data.csv')

In [3]:
# Preview the first rows to check which columns and values were loaded.
df.head()

Unnamed: 0,Name,CarMake,CarModel,Year,NumberPlate,Gender,Age
0,John Smith,Toyota,Corolla,2017,ABC-1234,Male,34
1,Jane Doe,Honda,Civic,2019,XYZ-5678,Female,28
2,Michael Johnson,Ford,Focus,2018,JKL-4321,Male,45
3,Emily Brown,Nissan,Altima,2016,MNO-9876,Female,31
4,Daniel Davis,Chevrolet,Impala,2020,PQR-6543,Male,52


In [4]:
def anonymise_car_number_plates(text):
    """Mask every car number plate found in ``text``.

    A plate is three ASCII letters, a hyphen and four digits (for example
    ``ABC-1234``) delimited by word boundaries. Inside each matched plate
    every letter is replaced by ``X`` and every digit by ``0``; the hyphen
    and all text outside the matches are left untouched. Because only the
    format survives, every matched plate ends up as ``XXX-0000``.

    Parameters
    ----------
    text : str
        Text that may contain number plates. Non-string values (for example
        ``None``, or the NaN that pandas uses for an empty CSV cell) are
        returned unchanged instead of raising ``TypeError``.

    Returns
    -------
    str
        ``text`` with every matched plate masked, or the input itself when
        it is not a string.
    """
    # Missing values reach this function when it is applied to a DataFrame
    # column; re.sub would raise TypeError on them, so pass them through.
    if not isinstance(text, str):
        return text

    def replace_alphabets_numbers(match):
        # Mask letters first, then digits; the hyphen is matched by neither.
        plate = match.group()
        anonymized_plate = re.sub(r'[A-Za-z]', 'X', plate)
        return re.sub(r'\d', '0', anonymized_plate)

    # Three letters, a hyphen, four digits; \b keeps longer tokens such as
    # 'ABCD-1234' or 'ABC-12345' from being partially matched.
    number_plate_pattern = r'\b[A-Za-z]{3}-\d{4}\b'

    # Replace all occurrences of number plates with their masked form
    return re.sub(number_plate_pattern, replace_alphabets_numbers, text)

# Example usage: a single plate in the expected format is masked character by
# character (letters become 'X', digits become '0')
text = "ABC-1234"
anonymized_text = anonymise_car_number_plates(text)
print(anonymized_text)

XXX-0000


In [5]:
# Mask every plate in the column and store the result in a new column.
# The function is passed to .apply directly; wrapping it in a lambda adds
# nothing.
# NOTE: the original 'NumberPlate' column stays in the frame next to the
# masked one -- drop it before sharing the data.
df['anonymised_NumberPlate'] = df['NumberPlate'].apply(anonymise_car_number_plates)

In [6]:
# Preview the frame again to inspect the new 'anonymised_NumberPlate' column
# alongside the original 'NumberPlate' values.
df.head()

Unnamed: 0,Name,CarMake,CarModel,Year,NumberPlate,Gender,Age,anonymised_NumberPlate
0,John Smith,Toyota,Corolla,2017,ABC-1234,Male,34,XXX-0000
1,Jane Doe,Honda,Civic,2019,XYZ-5678,Female,28,XXX-0000
2,Michael Johnson,Ford,Focus,2018,JKL-4321,Male,45,XXX-0000
3,Emily Brown,Nissan,Altima,2016,MNO-9876,Female,31,XXX-0000
4,Daniel Davis,Chevrolet,Impala,2020,PQR-6543,Male,52,XXX-0000


In [7]:
def compute_equivalence_classes(df, quasi_identifiers):
    """Count the rows in each equivalence class of ``df``.

    Two rows belong to the same equivalence class when they hold equal
    values in every quasi-identifier column.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset to partition.
    quasi_identifiers : iterable of column labels
        Columns whose combined values identify a class. The order given here
        is the order of the values inside each key.

    Returns
    -------
    collections.defaultdict
        Maps each key -- a tuple with one value per quasi-identifier -- to
        the number of rows carrying it. Looking up an absent key yields 0.

    Raises
    ------
    KeyError
        If a quasi-identifier is not a column of ``df``.
    """
    columns = list(quasi_identifiers)
    eq_classes = defaultdict(int)

    if not columns:
        # With no quasi-identifiers every row has the same empty key, so all
        # rows fall into one class (and an empty frame has no class at all).
        if len(df):
            eq_classes[()] = len(df)
        return eq_classes

    # itertuples(name=None) yields one plain tuple per row over just the
    # selected columns, avoiding the per-row Series that iterrows builds.
    for key in df[columns].itertuples(index=False, name=None):
        eq_classes[key] += 1

    return eq_classes

def is_k_anonymous(dataset, quasi_identifiers, k):
    """Report whether ``dataset`` is k-anonymous for ``quasi_identifiers``.

    The dataset qualifies when no equivalence class returned by
    ``compute_equivalence_classes`` holds fewer than ``k`` rows. When there
    are no classes at all the result is ``True``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The dataset to check.
    quasi_identifiers : iterable of column labels
        Columns that define the equivalence classes.
    k : int
        Minimum number of rows each class must contain.

    Returns
    -------
    bool
        ``True`` if every class has at least ``k`` rows, otherwise ``False``.
    """
    class_sizes = compute_equivalence_classes(dataset, quasi_identifiers).values()
    # A single class smaller than k is enough to break k-anonymity.
    return not any(size < k for size in class_sizes)

In [8]:
# Quasi-identifiers: the columns whose combined values define an equivalence
# class. Alternative sets to try (swap one in for the active assignment below):
# quasi_identifiers = ['CarMake', 'CarModel', 'Year']

# quasi_identifiers = ['anonymised_NumberPlate']

# quasi_identifiers = ['Age', 'Gender']

# Active choice: only 'Gender' is checked, so the result below says nothing
# about the remaining columns.
quasi_identifiers = ['Gender']


# Minimum number of rows every equivalence class must contain
k = 5

# Check if the dataset is k-anonymous with respect to the chosen quasi-identifiers
k_anonymous = is_k_anonymous(df, quasi_identifiers, k)
print(f"The dataset is {k}-anonymous: {k_anonymous}")

The dataset is 5-anonymous: True
