In [2]:
import pandas as pd

# Source event logs. Every file must contain a "user" column, because
# read_csv below is asked for exactly that column.
csv_files = ["processed_logon.csv", "processed_device.csv", "processed_http.csv", "final_email.csv", "processed_file.csv"]

# Collect the distinct user IDs seen across all source files.
user_ids = set()
for csv_name in csv_files:
    # Only the "user" column is needed here, so skip loading the rest.
    user_column = pd.read_csv(csv_name, usecols=["user"])["user"]
    # Drop missing IDs first: a NaN in the set would make sorted() below
    # raise TypeError when it is compared against non-numeric IDs.
    user_ids.update(user_column.dropna().unique())

# Sorted list, so later steps see the users in a deterministic order.
all_users = sorted(user_ids)

print(f"✅ Found {len(all_users)} unique users in the dataset.")


✅ Found 1000 unique users in the dataset.


In [3]:
# User IDs treated as anomalous by the cells below.
# The original note marks this as an example list: replace it with the real
# anomalous users for your dataset. Order is preserved by str.split().
anomalous_users = """
    AAM0658 AJR0932 BDV0168 BIH0745 BLS0678 BTL0226 CAH0936
    DCH0843 EHB0824 EHD0584 FMG0527 FTM0406 GHL0460 HJB0742
    JMB0308 JRG0207 KLH0596 KPC0073 LJR0523 LQC0479 MAR0955
    MAS0025 MCF0600 MYD0978 PPF0435 RAB0589 RGG0064 RKD0604
    TAP0551 WDD0366 AAF0535 ABC0174 AKR0057 CCL0068 CEJ0109
    CQW0652 DIB0285 DRR0162 EDB0714 EGD0132 FSC0601 HBO0413
    HXL0968 IJM0776 IKR0401 IUB0565 JJM0203 KRL0501 LCC0819
    MDH0580 MOS0047 NWT0098 PNL0301 PSF0133 RAR0725 RHL0992
    RMW0542 TNM0961 VSS0154 XHW0498 BBS0039 BSS0369 CCA0046
    CSC0217 GTD0219 JGT0221 JLM0364 JTM0223 MPM0220 MSO0222
""".split()




# Sampling parameters, named so they can be tuned in one place.
BENIGN_SAMPLE_SIZE = 350
SAMPLE_SEED = 42

# Benign users are all known users that are not in the anomalous list.
# A set makes each membership test O(1) instead of a scan of the list.
anomalous_lookup = set(anomalous_users)
benign_users = [user for user in all_users if user not in anomalous_lookup]

# Downsample the benign users with a fixed seed so the draw is repeatable.
# Cap the request at the population size: Series.sample raises ValueError
# when n exceeds the number of rows and replace is False (the default).
n_benign = min(BENIGN_SAMPLE_SIZE, len(benign_users))
benign_downsampled = pd.Series(benign_users).sample(n=n_benign, random_state=SAMPLE_SEED).tolist()

print(f"✅ Keeping {len(anomalous_users)} anomalous users and {len(benign_downsampled)} benign users")


✅ Keeping 70 anomalous users and 350 benign users


In [4]:
# Final cohort: every anomalous user plus the sampled benign users.
selected_users = set(anomalous_users) | set(benign_downsampled)

# Write a filtered copy of each source CSV that keeps only cohort rows.
for csv_name in csv_files:
    print(f"🔹 Processing {csv_name}...")

    # Load the full file, then keep the rows whose user is in the cohort.
    events = pd.read_csv(csv_name)
    in_cohort = events["user"].isin(selected_users)

    # The output sits next to the input, with a "downsampled_" prefix.
    out_name = f"downsampled_{csv_name}"
    events.loc[in_cohort].to_csv(out_name, index=False)

    print(f"✅ Saved downsampled version: {out_name}")

print("🎉 All CSVs downsampled successfully!")


🔹 Processing processed_logon.csv...
✅ Saved downsampled version: downsampled_processed_logon.csv
🔹 Processing processed_device.csv...
✅ Saved downsampled version: downsampled_processed_device.csv
🔹 Processing processed_http.csv...
✅ Saved downsampled version: downsampled_processed_http.csv
🔹 Processing final_email.csv...
✅ Saved downsampled version: downsampled_final_email.csv
🔹 Processing processed_file.csv...
✅ Saved downsampled version: downsampled_processed_file.csv
🎉 All CSVs downsampled successfully!


In [5]:
# Persist the selected user IDs for reference.
# Sort them first: iterating a set of strings has no guaranteed order, so
# list(selected_users) could write the rows in a different order on each run.
downsampled_users_df = pd.DataFrame({"user": sorted(selected_users)})
downsampled_users_df.to_csv("downsampled_users.csv", index=False)

print("✅ Saved list of downsampled users: downsampled_users.csv")


✅ Saved list of downsampled users: downsampled_users.csv
