# MENA Safe Data Generation Notebook

This Jupyter Notebook generates synthetic data for the **MENA Safe** crime and incident reporting platform.
It creates datasets for **Users, Alerts, and Interactions**, ensuring realistic distributions and relationships.

**Key Features:**
- Generates **1500 Users**, **600 Alerts (Cairo & Abu Dhabi)**, and **5000 Interactions**.
- Ensures **location clustering** for DBSCAN detection.
- Maintains **valid foreign key relationships** between tables.
- Provides **crime-related** titles and descriptions for alerts.



In [24]:
import pandas as pd
import random
import uuid
from faker import Faker

# Initialize Faker
fake = Faker()


In [25]:
# Generate Users Data
num_users = 800
users_data = []

for _ in range(num_users):
    user_id = str(uuid.uuid4())  # Unique user ID
    email = fake.email()
    users_data.append([user_id, email])

# Create Users DataFrame
users_df = pd.DataFrame(users_data, columns=["User ID", "Email"])

# Display sample
users_df.to_csv('Data/users.csv', index=False)
users_df.head()


Unnamed: 0,User ID,Email
0,40be3738-31b3-422d-ba52-11dddf4e764f,zjones@example.com
1,862d0bf0-d432-4245-b90d-b63154d638c7,taylortina@example.org
2,6cfce00f-6058-49de-a3f3-cb92198c0af1,andersonstephanie@example.com
3,32553597-e1a8-41b9-a7e6-baf3c33c6420,cartertina@example.org
4,3b8c7006-4934-4464-8bfd-1bc94b637a49,josephmontes@example.net


In [26]:
# Define city distributions
cities = {"Cairo": 200, "Abu Dhabi": 50}

# Define city locations (latitude and longitude ranges)
city_locations = {
    "Cairo": [(30.037, 30.065), (31.200, 31.270)],
    "Abu Dhabi": [(24.425, 24.500), (54.350, 54.400)]
}

# Crime-related titles and descriptions
crime_titles = [
    "Robbery at local store", "Suspicious activity near park", "Car theft reported",
    "Vandalism on public property", "Home burglary attempt", "Fraudulent transaction alert",
    "Missing person reported", "Assault in busy market", "Illegal drug activity",
    "Pickpocket incident in subway", "Suspicious vehicle loitering", "Firearm discharge reported",
    "Violent altercation in public", "Scam phone calls targeting elderly", "ATM skimming device found"
]

crime_descriptions = [
    "Eyewitness reports seeing a person breaking into a store and taking cash.",
    "A suspicious individual was seen hanging around the children's playground for an extended period.",
    "A vehicle was stolen from the parking lot in broad daylight.",
    "Several walls and benches were vandalized with graffiti overnight.",
    "A resident reported an attempted break-in through the back door.",
    "A fraudulent transaction was detected at a major bank branch.",
    "Family members report a missing individual last seen near downtown.",
    "A violent altercation broke out at a market, causing injuries to multiple people.",
    "Authorities suspect illegal drug trade in an abandoned building.",
    "A pickpocket was caught on CCTV stealing wallets from passengers in the subway.",
    "A suspicious vehicle has been seen parked in the same location for hours.",
    "Gunshots were reported in the area; authorities are investigating.",
    "A brawl involving multiple individuals occurred in a crowded public space.",
    "Scammers have been targeting elderly residents with fake phone calls demanding money.",
    "An ATM skimming device was found attached to a machine in a busy financial district."
]

# Generate Alerts Data
alerts_data = []
for city, num_alerts in cities.items():
    lat_range, lon_range = city_locations[city]
    
    for _ in range(num_alerts):
        alert_id = str(uuid.uuid4())
        user_id = random.choice(users_df["User ID"].tolist())
        title = random.choice(crime_titles)
        description = random.choice(crime_descriptions)
        entry_datetime = fake.date_time_this_year()
        event_datetime = fake.date_time_between(start_date="-1y", end_date=entry_datetime)
        latitude = round(random.uniform(*lat_range), 6)
        longitude = round(random.uniform(*lon_range), 6)
        num_affirmatives = random.randint(0, 50)
        num_responds = num_affirmatives + random.randint(0, 20)
        
        alerts_data.append([
            alert_id, user_id, title, entry_datetime, event_datetime,
            description, latitude, longitude, city, num_affirmatives, num_responds
        ])

# Create Alerts DataFrame
alerts_df = pd.DataFrame(alerts_data, columns=[
    "Alert ID", "User ID", "Title", "Entry Date Time", "Event Date Time",
    "Description", "Latitude", "Longitude", "City", "Num Affirmatives", "Num Responds"
])



# Display sample
alerts_df.to_csv('Data/alerts.csv', index=False)
alerts_df.head()


Unnamed: 0,Alert ID,User ID,Title,Entry Date Time,Event Date Time,Description,Latitude,Longitude,City,Num Affirmatives,Num Responds
0,d6114146-192e-410a-9513-2c9c71736832,8cf9ef37-5c0f-4d8b-aa89-44bbe26e5ac0,Assault in busy market,2025-01-27 17:14:45.468211,2025-01-05 12:53:04.960805,A resident reported an attempted break-in thro...,30.037186,31.221344,Cairo,3,19
1,da87c183-aef2-4ed2-8623-056b34d071fe,a8e3aa27-1829-4aeb-aef7-b0b283a97f75,Illegal drug activity,2025-01-25 13:22:07.854068,2024-09-30 21:22:38.399233,A resident reported an attempted break-in thro...,30.062123,31.257263,Cairo,31,35
2,e364c45e-08a5-4d91-b8d3-1e0fc8f93696,f5698696-3c64-4b3a-8eec-b528b293f9b8,Home burglary attempt,2025-01-29 15:39:06.166261,2025-01-18 00:28:29.560473,Eyewitness reports seeing a person breaking in...,30.059548,31.212546,Cairo,50,50
3,cf55cf93-68f8-4eee-a4fb-fd6deacabd16,e0ace32f-0368-44c2-8998-132dcd84cfce,Home burglary attempt,2025-01-22 17:03:13.394264,2024-09-24 01:26:08.955150,A fraudulent transaction was detected at a maj...,30.058783,31.217377,Cairo,39,44
4,5152ca9b-d2fb-4b8e-ad38-bc24105e9258,32e364c5-b4cd-4ebb-8d8b-38646ce65812,ATM skimming device found,2025-01-24 18:48:03.441404,2025-01-01 00:43:05.145341,Family members report a missing individual las...,30.054465,31.251026,Cairo,6,11


In [27]:
# Regenerate Interactions Data with Consistency Check
num_interactions = 3000

# Initialize a dictionary to track interaction counts per alert
alert_interactions = {alert_id: {"num_responds": 0, "num_affirmatives": 0} for alert_id in alerts_df["Alert ID"].tolist()}

# Generate Interactions Data
interactions_data = []

for _ in range(num_interactions):
    alert_id = random.choice(alerts_df["Alert ID"].tolist())  # Ensure valid alert
    user_id = random.choice(users_df["User ID"].tolist())  # Ensure valid user
    time_of_reaction = fake.date_time_between(start_date="-1y", end_date="now")
    affirmative = random.choice([True, False])

    # Update interaction counts per alert
    alert_interactions[alert_id]["num_responds"] += 1
    if affirmative:
        alert_interactions[alert_id]["num_affirmatives"] += 1

    interactions_data.append([str(uuid.uuid4()), user_id, alert_id, time_of_reaction, affirmative])

# Create Interactions DataFrame
interactions_df = pd.DataFrame(interactions_data, columns=[
    "Interaction ID", "User ID", "Alert ID", "Time of Reaction", "Affirmative"
])

# Update Alerts Table with correct interaction counts
alerts_df["Num Affirmatives"] = alerts_df["Alert ID"].map(lambda x: alert_interactions[x]["num_affirmatives"])
alerts_df["Num Responds"] = alerts_df["Alert ID"].map(lambda x: alert_interactions[x]["num_responds"])


# Display sample
interactions_df.to_csv('Data/interactions.csv', index=False)
interactions_df.head()

Unnamed: 0,Interaction ID,User ID,Alert ID,Time of Reaction,Affirmative
0,1c64cd72-78e6-4ab6-80c7-0967507bc4c4,64b39bbc-e5f5-4684-ad7b-06256a5fdf97,c766f95f-bd15-4715-8d7b-f32dd95b5e7b,2024-12-30 19:07:38.140771,True
1,045284be-cd7b-47a6-954b-22fc14081b43,82a4e373-2af1-4f84-8d01-d4eabb8d699a,860cff2b-8c9e-4f6b-8d55-3cbd264b8582,2024-05-12 22:20:55.947089,False
2,881b7408-4144-4b70-8293-ec5c59af137d,2097efde-8ce9-4f7e-8291-ebd48ec64ce1,dca6d39f-6bf5-4938-8736-091bd0a0b74a,2024-03-02 21:27:38.450894,False
3,a90e6f1b-768c-4387-b470-0254f1362828,c4914a71-f01e-4241-8018-4d967e5bac13,47bb54a7-8012-4bea-91ae-b561b4e398ac,2024-09-15 14:41:24.303363,True
4,1b0e15da-59e9-4be0-ad60-ce446c6dec6c,2800c8f6-578c-4691-8279-fb338da752c0,ab85999c-156a-48e2-940e-7e2d2365732d,2024-09-30 23:37:35.402692,True


In [28]:
# Validate that alert counts match interaction counts

# Count the number of interactions and affirmatives per alert in the Interactions table
interaction_counts = interactions_df.groupby("Alert ID").agg(
    Num_Responds=("Interaction ID", "count"),
    Num_Affirmatives=("Affirmative", "sum")
).reset_index()

# Merge with Alerts Table
merged_alerts_df = alerts_df.merge(interaction_counts, on="Alert ID", how="left")

# Check for inconsistencies
inconsistent_alerts = merged_alerts_df[
    (merged_alerts_df["Num Responds"] != merged_alerts_df["Num_Responds"]) |
    (merged_alerts_df["Num Affirmatives"] != merged_alerts_df["Num_Affirmatives"])
]

if not inconsistent_alerts.empty:
    print("⚠️ Inconsistencies found in alert interaction counts!")
    display(inconsistent_alerts.shape, alerts_df.shape)
else:
    print("✅ All alert counts are consistent with interactions.")


✅ All alert counts are consistent with interactions.
