In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import faker

# SETUP

### Faker generates realistic names, IPs, locations, etc.

In [2]:
fake = faker.Faker()

### Simulation parameters

In [3]:
# Number of distinct users
NUM_USERS = 100
# Number of days of logs
DAYS = 30         
MIN_LOGINS_PER_DAY = 2
MAX_LOGINS_PER_DAY = 6
# Simulation start date
START_DATE = datetime(2025, 8, 1)


### Location specification

In [4]:
UK_CITIES = [
    ("London", "GB"), ("Manchester", "GB"), ("Birmingham", "GB"),
    ("Leeds", "GB"), ("Glasgow", "GB"), ("Liverpool", "GB"),
    ("Edinburgh", "GB"), ("Bristol", "GB"), ("Cardiff", "GB")
]

# Non-UK cities for anomalies
NON_UK_CITIES = [
    ("New York", "US"), ("Sydney", "AU"), ("Tokyo", "JP"),
    ("Berlin", "DE"), ("Dubai", "AE"), ("Toronto", "CA"),
    ("Paris", "FR"), ("Singapore", "SG"), ("Johannesburg", "ZA")
]

# AWS regions for realism
AWS_REGIONS = ["eu-west-1", "eu-west-2", "us-east-1", "us-west-2"]

### Other Specifications

In [6]:
# Device types
DEVICE_TYPES = ["Desktop", "Mobile", "Tablet"]

# IAM user types
USER_TYPES = ["IAMUser", "Root", "AssumedRole"]

# Common AWS IAM-related events
EVENT_NAMES = ["ConsoleLogin", "GetObject", "PutObject", "ListBuckets",
               "StartInstances", "StopInstances", "DescribeInstances"]

# AWS resources accessed
RESOURCES = ["S3", "EC2", "DynamoDB", "CloudWatch", "RDS", "IAM"]

# CREATE USERS & DEVICES

In [7]:
users = []
for i in range(NUM_USERS):
    user_id = f"user_{i+1}"
    user_name = fake.user_name()
    user_type = random.choice(USER_TYPES)
    primary_device = random.choice(DEVICE_TYPES)
    primary_location = random.choice(UK_CITIES)
    users.append({
        "user_id": user_id,
        "user_name": user_name,
        "user_type": user_type,
        "primary_device": primary_device,
        "primary_location": primary_location
    })

# GENERATE LOGS

In [8]:
records = []

for user in users:
    for day in range(DAYS):
        num_logins = random.randint(MIN_LOGINS_PER_DAY, MAX_LOGINS_PER_DAY)
        day_date = START_DATE + timedelta(days=day)

        for _ in range(num_logins):
            # Base login time
            login_time = day_date + timedelta(
                hours=random.randint(6, 20),  # normal hours
                minutes=random.randint(0, 59),
                seconds=random.randint(0, 59)
            )

            # Default = normal login
            location = user["primary_location"]
            device = user["primary_device"]
            mfa_used = random.choice(["Yes", "No"])
            event_name = random.choice(EVENT_NAMES)
            aws_region = random.choice(AWS_REGIONS)
            source_ip = fake.ipv4_public()
            resource_accessed = random.choice(RESOURCES)
            login_result = "Success"
            session_duration = random.randint(5, 120)

            # Anomaly defaults
            is_anomaly = 0
            anomaly_reasons = []

            # ANOMALY INJECTION RULES

            # Rule 1: Unusual location (non-UK)
            if random.random() < 0.03:  # 3% chance
                location = random.choice(NON_UK_CITIES)
                is_anomaly = 1
                anomaly_reasons.append("Unusual location")

            # Rule 2: Odd hours login
            if random.random() < 0.02:
                login_time = day_date + timedelta(
                    hours=random.randint(0, 4),
                    minutes=random.randint(0, 59),
                    seconds=random.randint(0, 59)
                )
                is_anomaly = 1
                anomaly_reasons.append("Odd hours login")

            # Rule 3: First-time device use
            if random.random() < 0.02:
                device = random.choice([d for d in DEVICE_TYPES if d != user["primary_device"]])
                is_anomaly = 1
                anomaly_reasons.append("First-time device")

            # Rule 4: Excessive resource access
            if random.random() < 0.02:
                session_duration = random.randint(180, 300)  # very long session
                is_anomaly = 1
                anomaly_reasons.append("Excessive resource access")

            # Add record
            records.append({
                "eventTime": login_time.strftime("%Y-%m-%dT%H:%M:%SZ"),
                "userName": user["user_name"],
                "userType": user["user_type"],
                "eventName": event_name,
                "awsRegion": aws_region,
                "sourceIPAddress": source_ip,
                "location": f"{location[0]}, {location[1]}",
                "deviceType": device,
                "mfaUsed": mfa_used,
                "loginResult": login_result,
                "sessionDuration": session_duration,
                "resourceAccessed": resource_accessed,
                "is_anomaly": is_anomaly,
                "anomaly_reason": ", ".join(anomaly_reasons) if anomaly_reasons else ""
            })

# SAVE TO CSV

In [9]:
df = pd.DataFrame(records)
df.to_csv("iam_cloudtrail_simulation.csv", index=False)

In [10]:
print(f"Dataset generated: {df.shape[0]} rows, saved to iam_cloudtrail_simulation.csv")
print(df.head())

Dataset generated: 11993 rows, saved to iam_cloudtrail_simulation.csv
              eventTime      userName     userType          eventName  \
0  2025-08-01T17:08:51Z  jenniferboyd  AssumedRole          GetObject   
1  2025-08-01T14:35:06Z  jenniferboyd  AssumedRole          PutObject   
2  2025-08-01T13:24:13Z  jenniferboyd  AssumedRole          PutObject   
3  2025-08-01T11:39:51Z  jenniferboyd  AssumedRole  DescribeInstances   
4  2025-08-02T07:35:45Z  jenniferboyd  AssumedRole  DescribeInstances   

   awsRegion  sourceIPAddress        location deviceType mfaUsed loginResult  \
0  eu-west-2   159.79.175.236  Birmingham, GB     Tablet     Yes     Success   
1  us-west-2   59.152.112.241  Birmingham, GB     Tablet      No     Success   
2  eu-west-2    209.4.199.234  Birmingham, GB     Tablet     Yes     Success   
3  eu-west-1    164.70.117.35  Birmingham, GB     Tablet      No     Success   
4  us-west-2  188.139.101.234  Birmingham, GB     Tablet      No     Success   

   session

In [11]:
# Number of anomalies in the dataset
df['is_anomaly'].value_counts()

is_anomaly
0    10942
1     1051
Name: count, dtype: int64

In [12]:
# Percentage of anomalies in the dataset
df['is_anomaly'].value_counts(normalize=True) * 100

is_anomaly
0    91.236555
1     8.763445
Name: proportion, dtype: float64

# IAM
Detection Pipeline

### Import Machine learning libraries

In [13]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [16]:
# 1. Load Dataset
df = pd.read_csv("iam_cloudtrail_simulation.csv")

In [14]:
# Inspect first rows
print("Dataset sample:\n", df.head())

Dataset sample:
               eventTime      userName     userType          eventName  \
0  2025-08-01T17:08:51Z  jenniferboyd  AssumedRole          GetObject   
1  2025-08-01T14:35:06Z  jenniferboyd  AssumedRole          PutObject   
2  2025-08-01T13:24:13Z  jenniferboyd  AssumedRole          PutObject   
3  2025-08-01T11:39:51Z  jenniferboyd  AssumedRole  DescribeInstances   
4  2025-08-02T07:35:45Z  jenniferboyd  AssumedRole  DescribeInstances   

   awsRegion  sourceIPAddress        location deviceType mfaUsed loginResult  \
0  eu-west-2   159.79.175.236  Birmingham, GB     Tablet     Yes     Success   
1  us-west-2   59.152.112.241  Birmingham, GB     Tablet      No     Success   
2  eu-west-2    209.4.199.234  Birmingham, GB     Tablet     Yes     Success   
3  eu-west-1    164.70.117.35  Birmingham, GB     Tablet      No     Success   
4  us-west-2  188.139.101.234  Birmingham, GB     Tablet      No     Success   

   sessionDuration resourceAccessed  is_anomaly anomaly_reason 

### Feature Engineering

In [21]:
# Time features
df['eventTime'] = pd.to_datetime(df['eventTime'], errors='coerce')
df['hour'] = df['eventTime'].dt.hour.fillna(-1)
df['day_of_week'] = df['eventTime'].dt.dayofweek.fillna(-1)

# Binary flags
df['mfaUsed_bin'] = df['mfaUsed'].map({'Yes':1, 'No':0}).fillna(0)
df['loginSuccess'] = df['loginResult'].map({'Success':1, 'Failure':0}).fillna(1)

# Country flag
df['country'] = df['location'].str.split(',').str[-1].str.strip()
df['is_uk'] = df['country'].str.contains("GB|UK|United Kingdom", case=False, na=False).astype(int)

# Encode categorical fields
le_user = LabelEncoder()
df['userName_enc'] = le_user.fit_transform(df['userName'].astype(str))

# Select features 
features = [
    'userName_enc','userType','eventName','awsRegion','deviceType',
    'resourceAccessed','sessionDuration','hour','day_of_week',
    'is_uk','mfaUsed_bin','loginSuccess'
]
X = pd.get_dummies(df[features], drop_first=True)   # one-hot encode categoricals
y = df["is_anomaly"].astype(int)                    # labels (0 = normal, 1 = anomaly)
# Scale numeric features
scaler = StandardScaler()
num_cols = ['sessionDuration','hour','day_of_week']
X[num_cols] = scaler.fit_transform(X[num_cols])

## Model Training

In [22]:
# --- Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# --- Train Isolation Forest ---
iso = IsolationForest(
    n_estimators=200,
    contamination=max(0.001, y_train.mean()),  # estimate anomaly proportion
    random_state=42,
    n_jobs=-1
)
iso.fit(X_train[y_train==0])   # fit only on normal data

# --- Predict ---
y_pred_if = iso.predict(X_test)   # -1 = anomaly, 1 = normal
y_pred = np.where(y_pred_if == -1, 1, 0)

### Model Evaluation

In [23]:
# --- Evaluate ---
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
roc_auc = roc_auc_score(y_test, -iso.decision_function(X_test))

In [24]:
print("\n=== Evaluation ===")
print("Precision: {:.4f}".format(precision))
print("Recall:    {:.4f}".format(recall))
print("F1-score:  {:.4f}".format(f1))
print("ROC-AUC:   {:.4f}".format(roc_auc))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


=== Evaluation ===
Precision: 0.1554
Recall:    0.1746
F1-score:  0.1644
ROC-AUC:   0.5766

Confusion Matrix:
 [[2984  299]
 [ 260   55]]
