In [None]:
!pip install pandas
!pip install tabulate
import pandas as pd

from tabulate import tabulate



In [None]:
!pip install Faker

import pandas as pd
import random
from faker import Faker

# Initialize Faker
fake = Faker()

# Seed both random sources so that re-running the notebook from a fresh kernel
# regenerates exactly the same synthetic dataset.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Number of records to generate (30 people)
num_samples = 30

# Predefined options for certain columns
store_categories = ["Electronics", "Clothing", "Groceries", "Home Decor", "Books"]
transaction_types = ["cash", "upi", "netbanking"]
email_domains = ["gmail.com", "yahoo.com", "hotmail.com"]

# Location where the generated workbook is written.
output_path = '/content/Retail_Stop_Dataset_Final.xlsx'

# Addresses are generated up front so that the "Zip Code" column can be taken
# from the address itself instead of being drawn independently (which produced
# rows whose zip code contradicted their address).
# NOTE(review): this relies on the address text ending with the postcode, as in
# Faker's default (en_US) address formats -- confirm if the locale changes.
addresses = [fake.address().replace("\n", ", ") for _ in range(num_samples)]
zip_codes = [address.rsplit(" ", 1)[-1] for address in addresses]

# Generate synthetic data for each column (key order defines column order)
data = {
    "Name": [fake.name() for _ in range(num_samples)],
    "Date": [fake.date_between(start_date="-30d", end_date="today") for _ in range(num_samples)],
    "Time": [fake.time() for _ in range(num_samples)],
    "Store Category": [random.choice(store_categories) for _ in range(num_samples)],
    "Email": [f"{fake.user_name()}@{random.choice(email_domains)}" for _ in range(num_samples)],
    "Phone Number": [f"+91{random.randint(6000000000, 9999999999)}" for _ in range(num_samples)],
    "Address": addresses,
    "Transaction Type": [random.choice(transaction_types) for _ in range(num_samples)],
    "Amount": [round(random.uniform(10.0, 500.0), 2) for _ in range(num_samples)],
    "Zip Code": zip_codes
}

# Create the DataFrame
df_retail = pd.DataFrame(data)

# Display the first few rows of the dataset
print("===== Synthetic Retail Dataset (30 Records) =====")
print(df_retail.head())

# Save the DataFrame to an Excel file
df_retail.to_excel(output_path, index=False)
print(f"\nDataset saved to '{output_path}'")


===== Synthetic Retail Dataset (30 Records) =====
                Name        Date      Time Store Category  \
0      Bridget Jones  2025-02-12  11:10:46          Books   
1      Rebecca Chang  2025-01-25  09:25:33          Books   
2       Laura Howell  2025-02-01  08:17:10          Books   
3  Alexander Mcbride  2025-02-19  04:00:36          Books   
4       Sandra Hines  2025-01-27  00:08:46     Home Decor   

                       Email   Phone Number  \
0     kristibrooks@yahoo.com  +916885985401   
1         iosborne@gmail.com  +918661580241   
2  kimberlyjones@hotmail.com  +919770391418   
3  mcdonaldpaula@hotmail.com  +919692841050   
4  matthewthompson@yahoo.com  +919921746539   

                                             Address Transaction Type  Amount  \
0  29018 Ortiz Circle Suite 116, Lake Victoria, N...             cash  354.74   
1         928 Anderson Village, Clarkmouth, NC 22259             cash  293.38   
2  295 Amanda Squares Apt. 938, East Rebecca, MA ...     

In [None]:
# Re-load the dataset to ensure consistency
df_gas = pd.read_excel('/content/Retail_Stop_Dataset_Final.xlsx')

# Step 1: Display Original Dataset
print("\n===================== ORIGINAL DATASET =====================\n")
print(tabulate(df_gas.head(), headers='keys', tablefmt='grid'))
print("\n============================================================\n")

# Step 2: Apply K-Anonymity
# NOTE(review): the steps below mask/generalize identifying columns; no group
# size k is computed or enforced anywhere in this cell.


def _mask_email(value):
    """Keep at most the first three characters of the local part and the domain.

    Missing values are returned unchanged. Only the local part is sliced, so a
    local part shorter than three characters can no longer pull the '@' into
    the visible prefix.
    """
    if pd.isna(value):
        return value
    local_part, separator, domain = str(value).partition('@')
    if not separator:
        # No '@' present: there is no domain to keep, mask the remainder.
        return local_part[:3] + "***"
    return local_part[:3] + "***@" + domain


def _mask_phone(value):
    """Hide everything except the last three characters of a phone number.

    Missing values are returned unchanged instead of becoming '***-***-nan'.
    """
    if pd.isna(value):
        return value
    return "***-***-" + str(value)[-3:]


def _mask_zip(value):
    """Keep the first three characters of a zip code and star out the rest.

    Missing values are returned unchanged; the check runs before the value is
    converted to text so it can actually detect them.
    """
    if pd.isna(value):
        return value
    return str(value)[:3] + "**"


# Hidden (Email): Mask Email
df_gas['Email'] = df_gas['Email'].apply(_mask_email)

# Sensitive (Phone Number): Partially mask Phone Number
df_gas['Phone Number'] = df_gas['Phone Number'].apply(_mask_phone)

# Data Masking (Address): Generalize to Zip Code level
# The zip code is masked first so that Address receives the generalized value;
# copying it before masking would leave the full zip code in the output.
df_gas['Zip Code'] = df_gas['Zip Code'].apply(_mask_zip)
df_gas['Address'] = df_gas['Zip Code']

print("\n===================== AFTER APPLYING K-ANONYMITY =====================\n")
print(tabulate(df_gas[['Name','Date','Time','Store Category','Email', 'Phone Number', 'Address', 'Transaction Type', 'Amount','Zip Code']].head(), headers='keys', tablefmt='grid'))
print("\n======================================================================\n")

# Step 3: Apply L-Diversity
# Ensure diversity in Transaction Type within groups
def check_l_diversity(group, column, l):
    """Report whether `column` of `group` holds at least `l` distinct values.

    Distinct values are counted with `nunique()`, which leaves missing values
    out of the count by default.
    """
    distinct_count = group[column].nunique()
    return l <= distinct_count

# Drop every Store Category group whose rows do not contain at least two
# distinct Transaction Type values; rows of the remaining groups are kept as-is.
df_gas_ldiversity = df_gas.groupby(['Store Category']).filter(
    lambda category_rows: check_l_diversity(category_rows, 'Transaction Type', 2)
)

print("\n===================== AFTER APPLYING L-DIVERSITY =====================\n")
# Preview the first rows of the filtered frame with the columns in a fixed order.
ldiversity_columns = [
    'Name', 'Date', 'Time', 'Store Category', 'Email',
    'Phone Number', 'Address', 'Transaction Type', 'Amount', 'Zip Code',
]
ldiversity_preview = df_gas_ldiversity[ldiversity_columns].head()
print(tabulate(ldiversity_preview, headers='keys', tablefmt='grid'))
print("\n======================================================================\n")

# Step 4: Apply T-Closeness
# Relative frequency of each distinct Amount among the rows that passed the
# L-diversity filter (the fractions sum to 1). The per-group sampling below
# looks up its fraction in this Series by Amount value.
amount_column = df_gas_ldiversity['Amount']
amount_distribution = amount_column.value_counts(normalize=True)

def t_closeness_sample(group, dist):
    """Draw a fixed-seed random sample of rows from one group.

    `group.name` is used as the lookup key into `dist`; the value found there
    is the fraction of the group's rows to keep. The row count is truncated to
    an integer and raised to 1 when it would otherwise be 0, so every group
    contributes at least one row. Sampling uses `random_state=42`.
    """
    share = dist[group.name]
    rows_to_keep = int(len(group) * share)
    if rows_to_keep < 1:
        # Never drop a group entirely.
        rows_to_keep = 1
    return group.sample(n=rows_to_keep, random_state=42)

# Sample rows within each distinct Amount value using t_closeness_sample.
# The columns are selected explicitly after groupby so that the grouping column
# ('Amount') is still handed to the applied function and kept in the result;
# relying on the implicit inclusion of grouping columns is deprecated in pandas
# and emits a warning.
df_gas_tcloseness = (
    df_gas_ldiversity
    .groupby('Amount', group_keys=False)[list(df_gas_ldiversity.columns)]
    .apply(lambda x: t_closeness_sample(x, amount_distribution))
)

print("\n===================== AFTER APPLYING T-CLOSENESS =====================\n")
print(tabulate(df_gas_tcloseness[['Name','Date','Time','Store Category','Email', 'Phone Number', 'Address', 'Transaction Type', 'Amount','Zip Code']].head(), headers='keys', tablefmt='grid'))
print("\n======================================================================\n")



+----+-------------------+---------------------+----------+------------------+---------------------------+----------------+-------------------------------------------------------+--------------------+----------+------------+
|    | Name              | Date                | Time     | Store Category   | Email                     |   Phone Number | Address                                               | Transaction Type   |   Amount |   Zip Code |
|  0 | Bridget Jones     | 2025-02-12 00:00:00 | 11:10:46 | Books            | kristibrooks@yahoo.com    |   916885985401 | 29018 Ortiz Circle Suite 116, Lake Victoria, NY 68841 | cash               |   354.74 |      12504 |
+----+-------------------+---------------------+----------+------------------+---------------------------+----------------+-------------------------------------------------------+--------------------+----------+------------+
|  1 | Rebecca Chang     | 2025-01-25 00:00:00 | 09:25:33 | Books            | iosborne@gmail.com 

  df_gas_tcloseness = df_gas_ldiversity.groupby('Amount', group_keys=False).apply(lambda x: t_closeness_sample(x, amount_distribution))
