# IAM Data Exploration

This notebook explores the synthetic IAM access logs generated for AI Access Sentinel.

## Objectives
1. Load and examine the IAM dataset
2. Perform exploratory data analysis (EDA)
3. Visualize access patterns
4. Identify anomalies in the data
5. Understand feature distributions

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add parent directory to path
sys.path.insert(0, os.path.abspath('..'))

from src.data.generators import IAMDataGenerator
from src.data.preprocessors import IAMDataPreprocessor
from src.utils.visualization import *

# Notebook-wide display defaults: never truncate DataFrame columns, use
# seaborn's white-grid theme, and default every figure to 12x6 inches
# (individual cells may still pass their own figsize).
pd.options.display.max_columns = None
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 6))

print("Libraries imported successfully!")

## 1. Generate Synthetic Data

First, let's generate the synthetic IAM access logs, or load them from disk if they have already been generated.

In [None]:
# Load the IAM event log, generating it first if the CSV is not on disk yet.
data_path = '../data/sample_iam_logs.csv'

if not os.path.exists(data_path):
    print("Generating synthetic IAM data...")
    # Make sure the output directory exists before generating: on a fresh
    # checkout `../data` may be missing. Presumably the generator writes the
    # CSV to `output_path` itself -- verify against IAMDataGenerator.
    output_dir = os.path.dirname(data_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    generator = IAMDataGenerator()
    df = generator.generate_complete_dataset(
        num_users=200,
        normal_events_per_user=50,
        anomaly_ratio=0.05,
        output_path=data_path
    )
else:
    print("Loading existing data...")
    df = pd.read_csv(data_path)

print(f"\nDataset shape: {df.shape}")
print(f"Total events: {len(df):,}")
# NOTE: `timestamp` is only parsed to datetime later in the notebook; on the
# CSV branch the min/max below therefore compare the raw timestamp strings.
print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")

## 2. Data Overview

In [None]:
# Preview the first 10 rows to check column names and value formats
df.head(10)

In [None]:
# Print the column list with non-null counts, dtypes and memory usage
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max). With default
# arguments pandas restricts this to numeric columns when any are present.
df.describe()

In [None]:
# Count nulls per column once, then report them both as absolute numbers
# and as a percentage of all rows.
null_counts = df.isnull().sum()

print("Missing values per column:")
print(null_counts)

null_pct = (null_counts / len(df) * 100).round(2)
print("\nMissing value percentage:")
print(null_pct)

## 3. Preprocess Data

In [None]:
# Run the project's training preprocessor over the raw event log.
# NOTE(review): whether `preprocess_for_training` copies `df` or modifies it
# in place is not visible from this notebook -- confirm before relying on
# `df` staying untouched.
preprocessor = IAMDataPreprocessor()
df_processed = preprocessor.preprocess_for_training(df)

# Report the added columns as a sorted list: a raw set prints in an order
# that can change between kernel sessions, which makes the saved output
# differ on every re-run.
new_columns = sorted(set(df_processed.columns) - set(df.columns), key=str)

print(f"Processed dataset shape: {df_processed.shape}")
print(f"\nNew columns added: {new_columns}")

## 4. Exploratory Data Analysis

### 4.1 User Analysis

In [None]:
# How many distinct users are there, and how are events spread across them?
user_event_counts = df.groupby('user_id').size()

print(f"Total unique users: {df['user_id'].nunique()}")
print("\nEvents per user:")
print(user_event_counts.describe())

In [None]:
# Number of events per department, most active first
dept_counts = df['department'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
dept_counts.plot(kind='bar', ax=ax, color='steelblue')
ax.set_title('Events by Department', fontsize=14)
ax.set_xlabel('Department')
ax.set_ylabel('Event Count')
# Tilt the category labels so longer department names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

### 4.2 Resource Analysis

In [None]:
# The 15 resources with the highest number of access events
resource_counts = df['resource'].value_counts().head(15)
bar_positions = range(len(resource_counts))

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, resource_counts.values, color='coral')
ax.set_yticks(bar_positions)
ax.set_yticklabels(resource_counts.index)
ax.set_xlabel('Access Count')
ax.set_title('Top 15 Most Accessed Resources', fontsize=14)
# value_counts() is sorted descending; flip the axis so the top resource
# is drawn at the top of the chart instead of the bottom
ax.invert_yaxis()
fig.tight_layout()
plt.show()

print(f"\nTotal unique resources: {df['resource'].nunique()}")

### 4.3 Action Analysis

In [None]:
# Frequency of each action type, shown side by side as counts and as shares
action_counts = df['action'].value_counts()

fig, (bar_ax, pie_ax) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: absolute counts
action_counts.plot(kind='bar', ax=bar_ax, color='teal')
bar_ax.set(
    title='Action Distribution (Bar Chart)',
    xlabel='Action',
    ylabel='Count',
)
bar_ax.tick_params(axis='x', rotation=45)

# Right panel: share of all events
pie_ax.pie(
    action_counts.values,
    labels=action_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
pie_ax.set_title('Action Distribution (Pie Chart)')

fig.tight_layout()
plt.show()

### 4.4 Temporal Analysis

In [None]:
# Parse the timestamp column in place; later cells read the derived
# `hour` column, so these assignments intentionally modify `df`.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Derive calendar fields from the parsed timestamps
ts = df['timestamp'].dt
df['date'] = ts.date
df['hour'] = ts.hour
df['day_of_week'] = ts.dayofweek  # pandas convention: Monday=0 ... Sunday=6

# Number of events recorded on each calendar day
daily_counts = df.groupby('date').size()

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(daily_counts.index, daily_counts.values, marker='o', linewidth=2)
ax.set_title('Daily Access Events Over Time', fontsize=14)
ax.set_xlabel('Date')
ax.set_ylabel('Event Count')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Events per hour of day (uses the `hour` column derived from `timestamp`
# in the temporal-analysis cell above).
hourly_counts = df['hour'].value_counts().sort_index()

plt.figure(figsize=(12, 6))
# Draw each bar over the full interval [h, h+1) so the x-axis reads as clock
# time. With the default centred bars, the shaded band below started and
# ended in the middle of the 9 and 17 bars.
plt.bar(
    hourly_counts.index,
    hourly_counts.values,
    width=1.0,
    align='edge',
    color='skyblue',
    edgecolor='black',
)
# Shade 09:00-17:00, i.e. exactly the bars for hours 9 through 16.
# NOTE(review): if the hour-17 bucket should count as business hours too,
# extend the band to 18 -- confirm against the data generator's definition.
plt.axvspan(9, 17, alpha=0.2, color='green', label='Business Hours')
plt.title('Access Events by Hour of Day', fontsize=14)
plt.xlabel('Hour')
plt.ylabel('Event Count')
plt.xticks(range(24))
# Always show the whole day, even when some hours have no events
plt.xlim(0, 24)
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

### 4.5 Anomaly Analysis

In [None]:
# Build an explicit boolean mask from the label column. Indexing `df` with
# the raw column only filters rows when its dtype is bool; a 0/1 integer
# column would be interpreted as column labels instead.
anomaly_mask = df['is_anomaly'].astype(bool)

# `anomaly_ratio` is a percentage and is reused by the summary cell at the
# end of the notebook.
anomaly_count = anomaly_mask.sum()
anomaly_ratio = anomaly_count / len(df) * 100

print(f"Total anomalies: {anomaly_count:,}")
print(f"Anomaly ratio: {anomaly_ratio:.2f}%")
print("\nAnomaly types:")
print(df.loc[anomaly_mask, 'anomaly_type'].value_counts())

In [None]:
# Count events per anomaly type, restricted to rows flagged as anomalous.
# The flag is coerced to bool so the row filter also works when the column
# holds 0/1 integers rather than True/False.
anomaly_types = df.loc[df['is_anomaly'].astype(bool), 'anomaly_type'].value_counts()

plt.figure(figsize=(10, 6))
plt.barh(range(len(anomaly_types)), anomaly_types.values, color='crimson')
plt.yticks(range(len(anomaly_types)), anomaly_types.index)
plt.xlabel('Count')
plt.title('Distribution of Anomaly Types', fontsize=14)
# value_counts() is sorted descending; flip the axis so the most frequent
# type is drawn at the top
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

### 4.6 Success Rate Analysis

In [None]:
# Share of events that succeeded, as a percentage. `success_rate` is reused
# by the summary cell at the end of the notebook.
success_rate = df['success'].mean() * 100
print(f"Overall success rate: {success_rate:.2f}%")

# Same metric, split by whether the event is flagged as an anomaly
print("\nSuccess rate by anomaly status:")
success_by_anomaly = df.groupby('is_anomaly')['success'].mean() * 100
print(success_by_anomaly)

### 4.7 Location Analysis

In [None]:
# The 15 locations with the highest number of access events
location_counts = df['location'].value_counts().head(15)

fig, ax = plt.subplots(figsize=(12, 6))
location_counts.plot(kind='bar', ax=ax, color='orchid')
ax.set_title('Top 15 Access Locations', fontsize=14)
ax.set_xlabel('Location')
ax.set_ylabel('Event Count')
# Rotate and right-align labels so each one ends under its own bar
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

## 5. Key Insights

Summarize key findings from the exploration:

In [None]:
def _most_frequent(column):
    """Return the most frequent value of df[column] (first entry of value_counts())."""
    return df[column].value_counts().index[0]


# Headline numbers. `anomaly_ratio`, `success_rate` and the `hour` column
# come from earlier cells, so this cell must run after them.
insights = {
    'Total Events': len(df),
    'Total Users': df['user_id'].nunique(),
    'Total Resources': df['resource'].nunique(),
    'Anomaly Rate': f"{anomaly_ratio:.2f}%",
    'Success Rate': f"{success_rate:.2f}%",
    'Most Active Department': _most_frequent('department'),
    'Most Accessed Resource': _most_frequent('resource'),
    'Most Common Action': _most_frequent('action'),
    'Peak Hour': _most_frequent('hour')
}

banner = "=" * 50
print(banner)
print("KEY INSIGHTS")
print(banner)
# Pad each label with dots to 30 characters so the values line up
for label, shown in insights.items():
    print(f"{label:.<30} {shown}")
print(banner)

## Next Steps

1. **Feature Engineering**: Create additional features for ML models
2. **Model Training**: Train anomaly detection models
3. **Evaluation**: Assess model performance
4. **Deployment**: Integrate into production pipeline

See the following notebooks:
- `02_feature_engineering.ipynb`
- `03_model_training.ipynb`
- `04_anomaly_detection.ipynb`
- `05_role_mining.ipynb`