## Importing all the relevant libraries
Imports all necessary libraries: pandas and numpy for data, random for randomness, Faker for fake data (like cities), and datetime for timestamp generation.

In [1]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta

## Initialize Faker and parameters
Faker is the tool of choice for the simulation of the dataset.

100 users have been used in the simulation, over a 30 days period of activity, with each user randomly logging in between 2 and 6 times daily. 

Resources have been added to add an extra layer to the anomaly detection.

In [14]:
# Initialize Faker with UK locale
fake = Faker('en_GB')

# Configuration
# Simulate 100 unique users
num_users = 100
# Start simulation from June 1st
start_date = datetime(2025, 6, 1)
# Simulate over 30 days
days = 30
# Minimum logins per user per day
min_logins_per_day = 2
# Minimum logins per user per day
max_logins_per_day = 6

# List of simulated business resources
resources = [
    'HR Portal', 'Finance System', 'Email', 'File Server', 
    'CRM', 'DevOps Dashboard'
]

## Location Configuration
It is assumed that the users are based in the UK and so locations outside of the UK may be considered anomalous. This adds an extra layer to the anomaly detection.

In [22]:
# UK city list (for normal behavior)
uk_locations = [
    'London', 'Birmingham', 'Manchester', 'Leeds', 'Glasgow', 'Liverpool',
    'Sheffield', 'Bristol', 'Cardiff', 'Edinburgh', 'Nottingham', 'Leicester',
    'Southampton', 'Newcastle', 'Coventry', 'Reading', 'Dundee', 'Aberdeen',
    'Brighton', 'Swansea'
]

# Non-UK city list (for anomaly injection)
non_uk_locations = [
    'New York', 'Berlin', 'Paris', 'Tokyo', 'Dubai', 'Lagos',
    'Mumbai', 'Moscow', 'Toronto', 'San Francisco'
]

## Generate Users and Devices

In [24]:
# Generate users and devices
# user_001 to user_100
users = [f'user_{i:03d}' for i in range(1, num_users + 1)]  
# 149 unique devices
devices = [f'DEV-{i:03d}' for i in range(1, 150)]                

## Create the logs

In [25]:
# Prepare to collect records
records = []

# Simulate login activity for each user
for user in users:
    base_location = random.choice(uk_locations)     # Normal login location
    base_device = random.choice(devices)            # Normal login device
    
    for day in range(days):
        current_date = start_date + timedelta(days=day)
        logins_today = random.randint(min_logins_per_day, max_logins_per_day)
        
        for _ in range(logins_today):
            # Generate a standard login time (between 6 AM and 10 PM)
            login_time = current_date + timedelta(
                hours=random.randint(6, 22),
                minutes=random.randint(0, 59)
            )

            # Default values (non-anomalous)
            is_anomaly = 0
            location = base_location
            device = base_device
            accessed_resource = random.choice(resources)

            # Introduce anomalies (~5% chance)
            if random.random() < 0.05:
                is_anomaly = 1
                anomaly_type = random.choice([
                    'odd_hour', 'new_location', 'new_device', 'unusual_resource'
                ])

                # Apply anomaly conditions
                if anomaly_type == 'odd_hour':
                    login_time = current_date + timedelta(hours=random.randint(0, 4))

                elif anomaly_type == 'new_location':
                    location = random.choice(non_uk_locations)

                elif anomaly_type == 'new_device':
                    device = random.choice([dev for dev in devices if dev != base_device])

                elif anomaly_type == 'unusual_resource':
                    accessed_resource = 'Confidential Archives'

            # Append this login event
            records.append({
                'user_id': user,
                'timestamp': login_time.strftime('%Y-%m-%d %H:%M:%S'),
                'location': location,
                'device_id': device,
                'resource_accessed': accessed_resource,
                'login_result': 'success',
                'session_duration_min': random.randint(5, 120),
                'is_anomaly': is_anomaly
            })


## Create DataFrame

In [26]:
df = pd.DataFrame(records)

## Sort by time

In [27]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values(by='timestamp').reset_index(drop=True)

## View the simulated dataset

In [28]:
df.head()

Unnamed: 0,user_id,timestamp,location,device_id,resource_accessed,login_result,session_duration_min,is_anomaly
0,user_056,2025-06-01 00:00:00,Nottingham,DEV-149,Finance System,success,9,1
1,user_077,2025-06-01 03:00:00,Glasgow,DEV-026,HR Portal,success,39,1
2,user_001,2025-06-01 04:00:00,Newcastle,DEV-023,Finance System,success,20,1
3,user_088,2025-06-01 06:01:00,Aberdeen,DEV-073,Email,success,56,0
4,user_023,2025-06-01 06:02:00,Aberdeen,DEV-080,CRM,success,114,0


In [29]:
df

Unnamed: 0,user_id,timestamp,location,device_id,resource_accessed,login_result,session_duration_min,is_anomaly
0,user_056,2025-06-01 00:00:00,Nottingham,DEV-149,Finance System,success,9,1
1,user_077,2025-06-01 03:00:00,Glasgow,DEV-026,HR Portal,success,39,1
2,user_001,2025-06-01 04:00:00,Newcastle,DEV-023,Finance System,success,20,1
3,user_088,2025-06-01 06:01:00,Aberdeen,DEV-073,Email,success,56,0
4,user_023,2025-06-01 06:02:00,Aberdeen,DEV-080,CRM,success,114,0
...,...,...,...,...,...,...,...,...
11980,user_053,2025-06-30 22:52:00,Birmingham,DEV-146,Finance System,success,39,0
11981,user_042,2025-06-30 22:53:00,Birmingham,DEV-126,Email,success,84,0
11982,user_034,2025-06-30 22:53:00,Newcastle,DEV-050,File Server,success,81,0
11983,user_091,2025-06-30 22:54:00,Swansea,DEV-126,File Server,success,27,0


In [30]:
df['is_anomaly'].value_counts()

is_anomaly
0    11421
1      564
Name: count, dtype: int64

In [32]:
for col in df.columns:
    print(f"{col}: {df[col].nunique()} unique values")

user_id: 100 unique values
timestamp: 9902 unique values
location: 30 unique values
device_id: 110 unique values
resource_accessed: 7 unique values
login_result: 1 unique values
session_duration_min: 116 unique values
is_anomaly: 2 unique values


## Convert to csv file

In [34]:
# Save the DataFrame to a CSV file
df.to_csv('simulated_iam_data.csv', index=False)
