<a href="https://colab.research.google.com/github/MwangiMuriuki2003/MURIUKI/blob/main/Data_Anonymization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Install required libraries
!pip install pandas numpy faker openpyxl

# Import libraries
import pandas as pd
import numpy as np
from faker import Faker
import hashlib
import re
from datetime import datetime
import random

# Set random seed for reproducibility
np.random.seed(42)
fake = Faker()
Faker.seed(42)

# Upload file (use Colab's file upload)
from google.colab import files
uploaded = files.upload()

Collecting faker
  Downloading faker-37.5.3-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.5.3-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.5.3


Saving mobile_customers.xlsx to mobile_customers (1).xlsx


In [6]:
# Load the Excel file
df = pd.read_excel('mobile_customers.xlsx')

# Display basic info
print("Dataset shape:", df.shape)
print("\nColumn names:")
print(df.columns.tolist())
print("\nFirst few rows:")
print(df.head())
print("\nData types:")
print(df.dtypes)

Dataset shape: (10000, 19)

Column names:
['Unnamed: 0', 'customer_id', 'date_registered', 'username', 'name', 'gender', 'address', 'email', 'birthdate', 'current_location', 'residence', 'employer', 'job', 'age', 'salary', 'credit_card_provider', 'credit_card_number', 'credit_card_security_code', 'credit_card_expire']

First few rows:
   Unnamed: 0                           customer_id date_registered  \
0           0  24c9d2d0-d0d3-4a90-9a3a-e00e4aac99bd      2021-09-29   
1           1  7b2bc220-0296-4914-ba46-d6cc6a55a62a      2019-08-17   
2           2  06febdf9-07fb-4a1b-87d7-a5f97d9a5faf      2019-11-01   
3           3  23df88e5-5dd3-46af-ac0d-0c6bd92e4b96      2021-12-31   
4           4  6069c2d7-7905-4993-a155-64f6aba143b1      2020-08-09   

          username             name gender  \
0     robertsbryan  Jonathan Snyder      M   
1          egarcia  Susan Dominguez      F   
2      turnermegan     Corey Hebert      M   
3  richardcampbell  Latasha Griffin      F   
4   ti

In [8]:
# Function to create consistent anonymous IDs
def create_anonymous_id(original_id, prefix="CUST"):
    """Create consistent anonymous ID using hash"""
    hash_obj = hashlib.md5(str(original_id).encode())
    return f"{prefix}_{hash_obj.hexdigest()[:8]}"

# Function to anonymize names based on gender
def anonymize_name(gender, index):
    """Generate anonymous name based on gender"""
    if pd.isna(gender):
        return f"Anonymous Person {index}"
    elif gender.upper() == 'M':
        return f"John Doe {index}"
    else:
        return f"Jane Doe {index}"

# Function to create salary ranges
def salary_to_range(salary):
    """Convert exact salary to range"""
    if pd.isna(salary):
        return "Unknown"
    elif salary < 50000:
        return "<50K"
    elif salary < 75000:
        return "50K-75K"
    elif salary < 100000:
        return "75K-100K"
    elif salary < 150000:
        return "100K-150K"
    else:
        return "150K+"

# Function to mask credit card numbers
def mask_credit_card(card_number):
    """Mask credit card number showing only last 4 digits"""
    if pd.isna(card_number):
        return "****-****-****-****"
    card_str = str(card_number)
    if len(card_str) >= 4:
        return f"****-****-****-{card_str[-4:]}"
    else:
        return "****-****-****-****"

# Function to anonymize addresses
def create_anonymous_address(index):
    """Create anonymous address"""
    street_num = random.randint(1, 9999)
    city_num = (index % 50) + 1
    state_code = chr(65 + (index % 26)) + chr(65 + ((index + 1) % 26))
    return f"{street_num} Anonymous Street, City {city_num}, State {state_code}"

In [9]:
# Create a copy of the dataframe
df_anon = df.copy()

# Anonymize direct identifiers
df_anon['customer_id'] = df_anon['customer_id'].apply(lambda x: create_anonymous_id(x, "CUST"))
df_anon['username'] = df_anon.index.map(lambda x: f"user{x+1}")
df_anon['name'] = df_anon.apply(lambda row: anonymize_name(row['gender'], row.name+1), axis=1)
df_anon['email'] = df_anon.index.map(lambda x: f"user{x+1}@example.com")

# Anonymize location data
df_anon['address'] = df_anon.index.map(create_anonymous_address)
df_anon['residence'] = df_anon.index.map(lambda x: f"{x+1000} Privacy Lane, Anonymous City, State XX")
df_anon['current_location'] = "[LOCATION_REMOVED]"

# Anonymize financial data
df_anon['salary'] = df_anon['salary'].apply(salary_to_range)
df_anon['credit_card_number'] = df_anon['credit_card_number'].apply(mask_credit_card)
df_anon['credit_card_security_code'] = "***"

# Anonymize birthdate (keep only year)
df_anon['birthdate'] = pd.to_datetime(df_anon['birthdate']).dt.year

# Rename columns to reflect anonymization
column_mapping = {
    'birthdate': 'birth_year',
    'salary': 'salary_range',
    'credit_card_number': 'masked_credit_card',
    'credit_card_security_code': 'masked_security_code'
}
df_anon = df_anon.rename(columns=column_mapping)

print("Anonymization completed!")
print(f"Original dataset: {df.shape}")
print(f"Anonymized dataset: {df_anon.shape}")

Anonymization completed!
Original dataset: (10000, 19)
Anonymized dataset: (10000, 19)


In [10]:
# Check for any remaining sensitive data
print("=== VALIDATION REPORT ===")
print("\nChecking for potential sensitive data:")

# Check if any original names remain
original_names = set(df['name'].dropna())
anonymized_names = set(df_anon['name'].dropna())
remaining_names = original_names.intersection(anonymized_names)
print(f"Original names remaining: {len(remaining_names)}")

# Check email patterns
original_emails = df['email'].dropna().tolist()[:5]
anonymized_emails = df_anon['email'].dropna().tolist()[:5]
print(f"\nOriginal emails (sample): {original_emails}")
print(f"Anonymized emails (sample): {anonymized_emails}")

# Display sample of anonymized data
print("\n=== SAMPLE ANONYMIZED DATA ===")
print(df_anon[['customer_id', 'username', 'name', 'email', 'birth_year', 'salary_range', 'masked_credit_card']].head())

=== VALIDATION REPORT ===

Checking for potential sensitive data:
Original names remaining: 0

Original emails (sample): ['marcus58@hotmail.com', 'alexanderkathy@hotmail.com', 'vwood@gmail.com', 'kathleen36@gmail.com', 'johnbest@hotmail.com']
Anonymized emails (sample): ['user1@example.com', 'user2@example.com', 'user3@example.com', 'user4@example.com', 'user5@example.com']

=== SAMPLE ANONYMIZED DATA ===
     customer_id username        name              email  birth_year  \
0  CUST_0a28351d    user1  John Doe 1  user1@example.com        1978   
1  CUST_577bb034    user2  Jane Doe 2  user2@example.com        1970   
2  CUST_a36177f3    user3  John Doe 3  user3@example.com        2009   
3  CUST_7a1e09cd    user4  Jane Doe 4  user4@example.com        1992   
4  CUST_97f9f0df    user5  Jane Doe 5  user5@example.com        1989   

  salary_range   masked_credit_card  
0      50K-75K  ****-****-****-9846  
1     75K-100K  ****-****-****-5979  
2        150K+  ****-****-****-2247  
3    1

In [11]:
# Save to new Excel file
df_anon.to_excel('mobile_customers_anonymized.xlsx', index=False)

# Download the file
files.download('mobile_customers_anonymized.xlsx')

print("Anonymized file saved and ready for download!")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Anonymized file saved and ready for download!


In [12]:
# Create anonymization summary
anonymization_report = {
    'Original Records': len(df),
    'Anonymized Records': len(df_anon),
    'Original Columns': len(df.columns),
    'Anonymized Columns': len(df_anon.columns),
    'Fields Anonymized': [
        'customer_id', 'username', 'name', 'email',
        'address', 'residence', 'current_location',
        'birthdate -> birth_year', 'salary -> salary_range',
        'credit_card_number -> masked', 'security_code -> masked'
    ]
}

print("=== ANONYMIZATION SUMMARY ===")
for key, value in anonymization_report.items():
    print(f"{key}: {value}")

=== ANONYMIZATION SUMMARY ===
Original Records: 10000
Anonymized Records: 10000
Original Columns: 19
Anonymized Columns: 19
Fields Anonymized: ['customer_id', 'username', 'name', 'email', 'address', 'residence', 'current_location', 'birthdate -> birth_year', 'salary -> salary_range', 'credit_card_number -> masked', 'security_code -> masked']
